# Comments:
All code that is commented out, is for the purpose of not running the web scrape again on the reddit API

Also csv save points are commented out as well to prevent overwriting save points

In [1]:
# Imports
import pandas as pd
import random
import numpy as np
import requests
import pickle
import xgboost as xgb
import time
import datetime as dt
import regex as re
from bs4 import BeautifulSoup
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn import feature_selection
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
import os
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import warnings
warnings.filterwarnings('ignore')

# Webscraping

In [None]:
# function for scraping web (Provided by Josh Robin)
# This is a better web scrape function allows to skip = days, and times = amount to run

# this function was slightly modified from Brian Collins' lecture
def query_pushshift(subreddit, kind='submission', skip=30, times=30, 
                    subfield = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self'],
                    comfields = ['body', 'score', 'created_utc']):

    stem = "https://api.pushshift.io/reddit/search/{}/?subreddit={}&size=500".format(kind, subreddit)
    mylist = []
    
    for x in range(1, times):
        
        URL = "{}&after={}d".format(stem, skip * x)
        print(URL)
        response = requests.get(URL)
        assert response.status_code == 200
        mine = response.json()['data']
        df = pd.DataFrame.from_dict(mine)
        mylist.append(df)
        time.sleep(2)
        
    full = pd.concat(mylist, sort=False)
    
    if kind == "submission":
        
        full = full[subfield]
        
        full = full.drop_duplicates()
        
        full = full.loc[full['is_self'] == True]
        
    def get_date(created):
        return dt.date.fromtimestamp(created)
    
    _timestamp = full["created_utc"].apply(get_date)
    
    full['timestamp'] = _timestamp

    print(full.shape)
    
    return full

In [None]:
# Scraping Reddit website subreddit Fantasy Hockey

# df_fan = query_pushshift('fantasyhockey', times = 40)

In [None]:
# Chcecking unique posts within self text (post body)
len(df_fan['selftext'].unique())

In [None]:
# Scraping Reddit website subreddit Official NHL

# df_nhl = query_pushshift('nhl', times = 80)

In [None]:
# Checking unique values within NHL subreddit
len(df_nhl['selftext'].unique())

In [None]:
# Saving raw scrap csv data
# df_fan.to_csv('./fan_raw.csv')
# df_nhl.to_csv('./nhl_raw.csv')

# Data Cleaning

In [None]:
# Clearing all columns except 'selftext', 'title', 'subreddit'
cols = ['selftext', 'title', 'subreddit']
df_fan = df_fan[cols]
df_nhl = df_nhl[cols]

In [None]:
# Checking dataframe changes for Fantasy Hockey
df_fan.head()

In [None]:
# Checking dataframe changes for Official NHL
df_nhl.head()

In [None]:
# # Using RegEx ro remove misc characters on Fantasy selftext
df_fan['selftext'] = df_fan['selftext'].str.replace('\s[\/]?r\/[^s]+', ' ')
df_fan['selftext'] = df_fan['selftext'].str.replace('http[s]?:\/\/[^\s]*', ' ')
df_fan['selftext'] = df_fan['selftext'].str.replace("[^a-zA-Z]", " ")

In [None]:
# # Using RegEx ro remove misc characters on Official NHL selftext
df_nhl['selftext'] = df_nhl['selftext'].str.replace('\s[\/]?r\/[^s]+', ' ')
df_nhl['selftext'] = df_nhl['selftext'].str.replace('http[s]?:\/\/[^\s]*', ' ')
df_nhl['selftext'] = df_nhl['selftext'].str.replace("[^a-zA-Z]", " ")

In [None]:
# # Using RegEx ro remove misc characters on Fantasy title
df_fan['title'] = df_fan['title'].str.replace('\s[\/]?r\/[^s]+', ' ')
df_fan['title'] = df_fan['title'].str.replace('http[s]?:\/\/[^\s]*', ' ')
df_fan['title'] = df_fan['title'].str.replace("[^a-zA-Z]", " ")

In [None]:
# Checking changes in Fantasy dataframe
df_fan.head()

In [None]:
# # Using RegEx ro remove misc characters on Official NHL title
df_nhl['title'] = df_nhl['title'].str.replace('\s[\/]?r\/[^s]+', ' ')
df_nhl['title'] = df_nhl['title'].str.replace('http[s]?:\/\/[^\s]*', ' ')
df_nhl['title'] = df_nhl['title'].str.replace("[^a-zA-Z]", " ")

In [None]:
# Checking changes in Official NHL dataframe
df_nhl.head()

In [None]:
# checking null values in Fantasy dataframe
df_fan.isnull().sum()

In [None]:
# checking dataframe shape before dropping nulls in Fantasy hockey dataframe
df_fan.shape

In [None]:
# Dropping null values in Fantasy dataframe
df_fan.dropna(inplace = True)
# Checking shape after drop
df_fan.shape

In [None]:
# checking null values in NHL dataframe
df_nhl.isnull().sum()

In [None]:
# Checking shape before dropping nulls in NHL dataframe
df_nhl.shape

In [None]:
# Dropping null values in NHL dataframe
df_nhl.dropna(inplace = True)

In [None]:
# Checking shape for NHL after drop
df_nhl.shape

In [None]:
# Saving cleaned versions to csv

# df_fan.to_csv('./fan_clean.csv')
# df_nhl.to_csv('./nhl_clean')

In [None]:
# Concatenating both Dataframes together
df = pd.concat([df_fan, df_nhl])
df.head()

In [None]:
# Checking shape after concat
df.shape

In [None]:
# Checking empty values and changing them to null values in dataframe
df.replace('', np.nan, inplace = True)
df.isnull().sum()

In [None]:
# dropping null values in dataframe
df.dropna(inplace = True)
# checking shape after drop
df.shape

In [None]:
# Creating dummy columns for subreddit
df['subreddit'] = pd.get_dummies(df['subreddit'], drop_first=True)
df.head()

In [None]:
# Adding selftext and title together in df
df["text"] = df["selftext"].map(str) + df["title"]
df.head()

In [None]:
# Saving data as csv
# df.to_csv('./reddit_complete.csv')

In [3]:
# Importing above csv since restarting notebook
# df = pd.read_csv('./reddit_complete.csv')
# checking df import
df.head()

Unnamed: 0.1,Unnamed: 0,selftext,title,subreddit,text
0,0,Please be nice to each other Upvote useful co...,Daily Anything Goes June,0,Please be nice to each other Upvote useful co...
1,1,Give and receive fantasy team advice in this t...,Roster Management June,0,Give and receive fantasy team advice in this t...
2,2,Please be nice to each other Upvote useful co...,Nightly Anything Goes June,0,Please be nice to each other Upvote useful co...
3,3,Please be nice to each other Upvote useful co...,Daily Anything Goes June,0,Please be nice to each other Upvote useful co...
4,4,Give and receive fantasy team advice in this t...,Roster Management June,0,Give and receive fantasy team advice in this t...


In [4]:
# Dropping unnamed column in df
df.drop('Unnamed: 0', axis = 1, inplace = True)
# Checking df changes
df.head()

Unnamed: 0,selftext,title,subreddit,text
0,Please be nice to each other Upvote useful co...,Daily Anything Goes June,0,Please be nice to each other Upvote useful co...
1,Give and receive fantasy team advice in this t...,Roster Management June,0,Give and receive fantasy team advice in this t...
2,Please be nice to each other Upvote useful co...,Nightly Anything Goes June,0,Please be nice to each other Upvote useful co...
3,Please be nice to each other Upvote useful co...,Daily Anything Goes June,0,Please be nice to each other Upvote useful co...
4,Give and receive fantasy team advice in this t...,Roster Management June,0,Give and receive fantasy team advice in this t...


# Modeling Posts

In [5]:
# Function for wordcloud found 

# https://github.com/amueller/word_cloud/issues/52
def grey_color_func(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    return "hsl(9, 100%%, %d%%)" % random.randint(40, 100)

# found on https://stackoverflow.com/questions/16645799/how-to-create-a-word-cloud-from-a-corpus-in-python
def create_wordcloud(data, title = None):
    wordcloud = WordCloud(color_func= grey_color_func,
        background_color='white', # Changes background color
        stopwords=stopwords, # Adds stop words
        max_words=200, # Max words
        max_font_size=40, # font size
        scale=3 
    ).generate(str(data))

    fig = plt.figure(1, figsize=(15, 15))
    plt.axis('off')
    if title: 
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

In [6]:
# Creating X, y Variables
X, y = df['selftext'], df['subreddit']

# Setting up train test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [7]:
# Pipeline & Gridsearch setup
# TFIDF pipeline setup
tvc_pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('mb', MultinomialNB())
])

# CountVectorizer pipeline setup
cv_pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('mb', MultinomialNB())
])

# Randomforest pipeline setup
rf_pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])


# Fit
cv_pipe.fit(X_train, y_train)
tvc_pipe.fit(X_train, y_train)
rf_pipe.fit(X_train, y_train)

# Setting params for CountVectorizer gridsearch
cvec_params = {
    'cvec__max_features': [100, 2000],
    'cvec__ngram_range': [(1, 1),(1, 2), (2, 2)],
    'cvec__stop_words': [None, 'english']
}

# Setting params for TFIDF Vectorizer gridsearch
tf_params = {
    'tvec__max_features':[100, 2000],
    'tvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tvec__stop_words': [None, 'english'],
   
}

# Setting up randomforest params
rf_params = {
    'tvec__max_features':[2000],
    'tvec__ngram_range': [(1, 2)],
    'tvec__stop_words': ['english'],
    'rf__max_depth': [1000],
    'rf__min_samples_split': [100],
    'rf__max_leaf_nodes': [None]
}

In [None]:
# After running many features, these were the best parameters

# {'rf__max_depth': 1000,
#  'rf__max_leaf_nodes': None,
#  'rf__min_samples_split': 100,
#  'tvec__max_features': 2000,
#  'tvec__ngram_range': (1, 2),
#  'tvec__stop_words': 'english'}

In [8]:
# Setting up GridSearch for Randomforest
rf_gs = GridSearchCV(rf_pipe, param_grid=rf_params, cv = 5, verbose = 1, n_jobs = -1)

# Setting up GridSearch for CountVectorizer
cv_gs = GridSearchCV(cv_pipe, param_grid=cvec_params, cv = 5, verbose = 1, n_jobs = -1)

# Fitting CountVectorizer GS
cv_gs.fit(X_train, y_train)

# Setting up GridSearch for TFIDFVectorizer
tvc_gs = GridSearchCV(tvc_pipe, param_grid=tf_params, cv = 5, verbose =1, n_jobs = -1)

# Fitting CV GS
tvc_gs.fit(X_train, y_train)

# Fitting Randomforest CV GS
rf_gs.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   16.8s finished


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   21.5s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.9s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [None]:
# Saving model file in Pickle (rf_gs)
# pkl_file_1 = "rf_selftext.pkl"  
# with open(pkl_file_1, 'wb') as file:  
#     pickle.dump(rf_gs, file)

In [None]:
# Saving model file in Pickle (cv_gs)
# pkl_file_2 = "cv_selftext.pkl"  
# with open(pkl_file_2, 'wb') as file:  
#     pickle.dump(cv_gs, file)

In [None]:
# Saving model file in Pickle (tvc_gs)
# pkl_file_3 = "tvc_selftext.pkl"  
# with open(pkl_file_3, 'wb') as file:  
#     pickle.dump(tvc_gs, file)

In [None]:
# load in model rf_gs
with open(pkl_file_1, 'rb') as file:  
    pickle_model = pickle.load(file)

In [None]:
# Load in model cv_gs
with open(pkl_file_2, 'rb') as file:  
    pickle_model = pickle.load(file)

In [None]:
# Load in model tvc_gs
with open(pkl_file_3, 'rb') as file:  
    pickle_model = pickle.load(file)

In [9]:
# Scoring Training data on CountVectorizer
cv_gs.score(X_train, y_train)

0.8442436264785835

In [10]:
# Scoring Test data on CountVectorizer
cv_gs.score(X_test, y_test)

0.8336271485235787

In [11]:
# Scoring Training data on TFIDFVectorizer
tvc_gs.score(X_train, y_train)

0.8742193813827052

In [12]:
# Scoring Test data on TFIDFVectorizer
tvc_gs.score(X_test, y_test)

0.8627148523578669

In [13]:
# Scoring Training data on RandomForest
rf_gs.score(X_train, y_train)

0.9380648005289839

In [14]:
# Checking Test score on RandomForest
rf_gs.score(X_test, y_test)

0.881004847950639

In [None]:
# Checking best parameters
pickle_model.best_params_

In [None]:
# Creating a new df for  feature importance Random Forest
# Code from Stack Overflow
rf_df = pd.DataFrame(rf_pipe.steps[1][1].feature_importances_, rf_pipe.steps[0][1].get_feature_names(), columns=['importance'])
rf_df.sort_values('importance', ascending = False).head(20)

## Ploting Random Forest Results

In [None]:
# Plotting Top 20 Words in Random Forest
plt.figure(figsize=(20,10))
temp = rf_df.sort_values('importance', ascending = False).head(20)
plt.barh(temp.index, temp['importance'])
plt.title('Top 20 Words', fontsize=40)
plt.xticks(fontsize = 20)
plt.yticks(fontsize = 20)
plt.xlabel('Amount of Information Gained', fontsize=30)
plt.ylabel('Word', fontsize=30);

In [None]:
create_wordcloud(rf_df.sort_values('importance', ascending = False))

## Modeling on 'title'

In [None]:
# Creating X, y Variables
X, y = df['title'], df['subreddit']

# Setting up train test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
# Pipeline & Gridsearch setup
# TFIDF pipeline setup
tvc_pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('mb', MultinomialNB())
])

# CountVectorizer pipeline setup
cv_pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('mb', MultinomialNB())
])

# Randomforest pipeline setup
rf_pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])


# Fit
cv_pipe.fit(X_train, y_train)
tvc_pipe.fit(X_train, y_train)
rf_pipe.fit(X_train, y_train)

# Setting params for CountVectorizer gridsearch
cvec_params = {
    'cvec__max_features': [100, 2000],
    'cvec__ngram_range': [(1, 1),(1, 2), (2, 2)],
    'cvec__stop_words': [None, 'english']
}

# Setting params for TFIDF Vectorizer gridsearch
tf_params = {
    'tvec__max_features':[100, 2000],
    'tvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tvec__stop_words': [None, 'english'],
   
}

# Setting up randomforest params
rf_params = {
    'tvec__max_features':[2000],
    'tvec__ngram_range': [(1, 2)],
    'tvec__stop_words': ['english'],
    'rf__max_depth': [1000],
    'rf__min_samples_split': [100],
    'rf__max_leaf_nodes': [None]
}

In [None]:
# Setting up GridSearch for Randomforest
rf_gs = GridSearchCV(rf_pipe, param_grid=rf_params, cv = 5, verbose = 1, n_jobs = -1)

# Setting up GridSearch for CountVectorizer
cv_gs = GridSearchCV(cv_pipe, param_grid=cvec_params, cv = 5, verbose = 1, n_jobs = -1)

# Fitting CountVectorizer GS
cv_gs.fit(X_train, y_train)

# Setting up GridSearch for TFIDFVectorizer
tvc_gs = GridSearchCV(tvc_pipe, param_grid=tf_params, cv = 5, verbose =1, n_jobs = -1)

# Fitting CV GS
tvc_gs.fit(X_train, y_train)

# Fitting Randomforest CV GS
rf_gs.fit(X_train, y_train)

In [None]:
# scoring Random Forest train
rf_gs.score(X_train, y_train)

In [None]:
# Scoring Random Forest test
rf_gs.score(X_test, y_test)

In [None]:
rf_title = pd.DataFrame(rf_pipe.steps[1][1].feature_importances_, rf_pipe.steps[0][1].get_feature_names(), columns=['importance'])
rf_title.sort_values('importance', ascending = False).head(20)

## Modeling on 'text'

In [15]:
# Creating X, y Variables
X, y = df['text'], df['subreddit']

# Setting up train test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [16]:
# Pipeline & Gridsearch setup
# TFIDF pipeline setup
tvc_pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('mb', MultinomialNB())
])

# CountVectorizer pipeline setup
cv_pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('mb', MultinomialNB())
])

# Randomforest pipeline setup
rf_pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])


# Fit
cv_pipe.fit(X_train, y_train)
tvc_pipe.fit(X_train, y_train)
rf_pipe.fit(X_train, y_train)

# Setting params for CountVectorizer gridsearch
cvec_params = {
    'cvec__max_features': [100, 2000],
    'cvec__ngram_range': [(1, 1),(1, 2), (2, 2)],
    'cvec__stop_words': [None, 'english']
}

# Setting params for TFIDF Vectorizer gridsearch
tf_params = {
    'tvec__max_features':[100, 2000],
    'tvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tvec__stop_words': [None, 'english'],
   
}

# Setting up randomforest params
rf_params = {
    'tvec__max_features':[2000],
    'tvec__ngram_range': [(1, 2)],
    'tvec__stop_words': ['english'],
    'rf__max_depth': [1000],
    'rf__min_samples_split': [100],
    'rf__max_leaf_nodes': [None]
}

In [17]:
# Setting up GridSearch for Randomforest
rf_gs = GridSearchCV(rf_pipe, param_grid=rf_params, cv = 5, verbose = 1, n_jobs = -1)

# Setting up GridSearch for CountVectorizer
cv_gs = GridSearchCV(cv_pipe, param_grid=cvec_params, cv = 5, verbose = 1, n_jobs = -1)

# Fitting CountVectorizer GS
cv_gs.fit(X_train, y_train)

# Setting up GridSearch for TFIDFVectorizer
tvc_gs = GridSearchCV(tvc_pipe, param_grid=tf_params, cv = 5, verbose =1, n_jobs = -1)

# Fitting CV GS
tvc_gs.fit(X_train, y_train)

# Fitting Randomforest CV GS
rf_gs.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   18.4s finished


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   19.1s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.5s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [18]:
cv_gs.score(X_train, y_train)

0.8991991771361398

In [19]:
cv_gs.score(X_test, y_test)

0.9017188188629353

In [20]:
tvc_gs.score(X_train, y_train)

0.9255014326647565

In [21]:
tvc_gs.score(X_test, y_test)

0.9206698986337594

In [22]:
# Scoring Random Forest train
rf_gs.score(X_train, y_train)

0.9747263242965248

In [23]:
# Scoring Random Forest test
rf_gs.score(X_test, y_test)

0.9237549581313353

In [None]:
# Creating a new df for  feature importance Random Forest
rf_feat = pd.DataFrame(rf_pipe.steps[1][1].feature_importances_, rf_pipe.steps[0][1].get_feature_names(), columns=['importance'])
rf_feat.sort_values('importance', ascending = False).head(20)

In [None]:
# Plotting Top 20 Words in Random Forest
plt.figure(figsize=(20,10))
temp = rf_feat.sort_values('importance', ascending = False).head(20)
plt.barh(temp.index, temp['importance'])
plt.title('Top 20 Words', fontsize=40)
plt.xticks(fontsize = 20)
plt.yticks(fontsize = 20)
plt.xlabel('Amount of Information Gained', fontsize=30)
plt.ylabel('Word', fontsize=30);

In [None]:
create_wordcloud(rf_feat.sort_values('importance', ascending = False))

In [None]:
# Saving Random Forest Feat Importance to csv
# rf_feat.to_csv('./rf_feat_imp.csv')

# Extra Modeling

### Modeling Corpus on Random Forest selftext

In [None]:
# Creating a transformed TFIDFVectorizer with best params
tfid = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 1))

# Creating a RandomForest with best params
rf_p = RandomForestClassifier(n_estimators=50)

# Creating corpus and vectorizing training
train_tfid_tf = tfid.fit_transform(X_train)

# Creating corpus and vectorizing testing
test_tfid_tf = tfid.fit_transform(X_test)

In [None]:
# Creating new params and pipeline for RandomForest

rf_p_params = {
    'max_depth':[None],
    'max_leaf_nodes': [100],
    'min_samples_split': [100]
}

# Creating a new pipeline
gs_rf = GridSearchCV(rf_p, param_grid=rf_p_params, cv = 5, n_jobs = -1)

# Fitting model
gs_rf.fit(train_tfid_df, y_train)

# Scoring Training data
gs_rf.score(train_tfid_df, y_train)

In [None]:
# Scoring Testing data
gs_rf.score(test_tfid_df, y_test)

In [None]:
feat_importance = pd.DataFrame(gs_rf.best_estimator_.feature_importances_, train_tfid_df.columns, columns=[ 'importance'])

In [None]:
feat_importance.head()

### Modeling Corpus RandomForest on Title

In [None]:
# Creating X, y Variables
X, y = df['title'], df['subreddit']

# Setting up train test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
# Creating a TFIDFVectorizer with best params
tfid = TfidfVectorizer(stop_words='english', max_features=2000, ngram_range=(1, 1))

# Creating a RandomForest with best params
rf_p = RandomForestClassifier(n_estimators=100)

# Creating corpus and vectorizing training
train_tfid_tf = tfid.fit_transform(X_train)

# Creating corpus and vectorizing testing
test_tfid_tf = tfid.fit_transform(X_test)

In [None]:
# Creating new params and pipeline for RandomForest

rf_p_params = {
    'max_depth':[None],
    'max_leaf_nodes': [50, 100, 500],
    'min_samples_split': [20, 50, 100]
}

# Creating a new pipeline
gs_rf = GridSearchCV(rf_p, param_grid=rf_p_params, cv = 5, n_jobs = -1)

# Fitting model
gs_rf.fit(train_tfid_df, y_train)

# Scoring Training data
gs_rf.score(train_tfid_df, y_train)

In [None]:
# Saving model file in Pickle (gs_rf)
pkl_vect_rft = "vect_rft.pkl"  
with open(pkl_vect_rft, 'wb') as file:  
    pickle.dump(gs_rf, file)

In [None]:
# Testing model
gs_rf.score(test_tfid_tf, y_test)

### Modeling Corpus on Text

In [None]:
# Creating X, y Variables
X, y = df['text'], df['subreddit']

# Setting up train test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
# Creating a TFIDFVectorizer with best params
tfid = TfidfVectorizer(stop_words=None, max_features=2000, ngram_range=(1, 1))

# Creating a RandomForest with best params
rf_p = RandomForestClassifier(n_estimators=10)

# Creating corpus and vectorizing training
train_tfid_tf = tfid.fit_transform(X_train)

# Creating corpus and vectorizing testing
test_tfid_tf = tfid.fit_transform(X_test)

In [None]:
# Creating new params and pipeline for RandomForest

rf_p_params = {
    'max_depth':[None],
    'max_leaf_nodes': [50, 100, 500],
    'min_samples_split': [20, 50, 100]
}

# Creating a new pipeline
gs_rf = GridSearchCV(rf_p, param_grid=rf_p_params, cv = 5, n_jobs = -1)

# Fitting model
gs_rf.fit(train_tfid_df, y_train)

# Scoring Training data
gs_rf.score(train_tfid_df, y_train)

In [None]:
gs_rf.score(test_tfid_tf, y_test)

In [None]:
# Creating X, y Variables
X, y = df['text'], df['subreddit']

# Setting up train test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
# Creating a TFIDFVectorizer with best params
tfid = TfidfVectorizer(stop_words='english', max_features=2000, ngram_range=(1, 1))

# Creating corpus and vectorizing training
train_tfid_tf = tfid.fit_transform(X_train)

# Creating corpus and vectorizing testing
test_tfid_tf = tfid.fit_transform(X_test)

# Creating a RandomForest with best params
rf_p = RandomForestClassifier(n_estimators=20)

rf_p.fit(train_tfid_df, y_train)

In [None]:
# making a class for predictions random forest
y_pred = rf_p.predict(test_tfid_df)

In [None]:
# Checking accuracy score of model
metrics.accuracy_score(y_test, y_pred)

In [None]:
# Creating a feature importance for RandomForest
feat_importance = pd.DataFrame(rf_p.feature_importances_, train_tfid_df.columns, columns=['importance'])

In [None]:
# Feature importance df sorted by importance
feat_importance.sort_values('importance', ascending=False)

In [None]:
X, y = df['text'], df['subreddit']

# Setting up train test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
# Creating a TFIDFVectorizer with best params
tfid = TfidfVectorizer(stop_words='english', max_features=2000, ngram_range=(1, 1))

# Creating corpus and vectorizing training
train_tfid_tf = tfid.fit_transform(X_train)

# Creating corpus and vectorizing testing
test_tfid_tf = tfid.fit_transform(X_test)

# Creating a RandomForest with best params
mb = MultinomialNB()

mb.fit(train_tfid_df, y_train)

In [None]:
# making a class for predictions Multinomial NB
y_pred = mb.predict(test_tfid_df)

In [None]:
metrics.accuracy_score(y_test, y_pred)

In [None]:
train_word = tfid.get_feature_names()

In [None]:
# number of times each token appears across all HAM messages
word_count = mb.feature_count_[0, :]
word_count

In [None]:
# number of times each token appears across all SPAM messages
spam_count = mb.feature_count_[1, :]
spam_count

In [None]:
# Creating a new df for word count in Multinomial Naives Bayes
mb_df = pd.DataFrame({'word':train_word, 'count':word_count, 'spam':spam_count}).set_index('word')
mb_df.sort_values('count', ascending = False)

# Got help with this from RichieNG website

# Plotting

## Plotting for Multinomial Naives Bayes

In [None]:
# Plotting a hbar graph of Top 20 Words in MB
plt.figure(figsize=(20,10))
temp = mb_df.sort_values('count', ascending=False).head(20)
plt.barh(temp.index, temp['count'])
plt.title('Top 20 Words', fontsize=40)
plt.xlabel('Count', fontsize=30)
plt.yticks(fontsize = 20)
plt.xticks(fontsize = 20)
plt.ylabel('Words', fontsize=30);

In [None]:
# Creating a word cloud of the Top 20 words in MB
create_wordcloud(mb_df.sort_values('count', ascending = False))

## Random Forest Plots

In [None]:
# Plotting Top 20 Words in Random Forest
plt.figure(figsize=(20,10))
temp = feat_importance.sort_values('importance', ascending=False).head(20)
plt.barh(temp.index, temp['importance'])
plt.title('Top 20 Words', fontsize=40)
plt.xticks(fontsize = 20)
plt.yticks(fontsize = 20)
plt.xlabel('Frequency', fontsize=30)
plt.ylabel('Word', fontsize=30);

In [None]:
# Word cloud of Top 20 words in RandomForest
create_wordcloud(feat_importance.sort_values('importance', ascending=False).head(20))