In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

In [2]:
with open('./pickles/custom_words.pkl', 'rb') as x:
    custom_words = pickle.load(x)

In [3]:
df= pd.read_csv('./data/cleaned_posts.csv')

In [4]:
X=df['selftext']
y=df['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=20, stratify=y)

#### Logistic Regression

In [5]:
pipe_lr= make_pipeline(
CountVectorizer(stop_words=custom_words),
    LogisticRegression(max_iter=1000, random_state=20)
)

In [6]:
pipe_lr.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(stop_words=['running', 'run', 'marathon',
                                             'race', 'runs', 'ran', 'shoes',
                                             'runner', 'runners', 'pain',
                                             'miles', 'mile', 'track', 'post',
                                             'treadmill', 'app', 'title',
                                             'wondering', 'foot', 'pace',
                                             'jogging', 'shoe', 'https',
                                             'thread', 'curious', 'week', '5k',
                                             'haven', 'body', 'atl', ...])),
                ('logisticregression',
                 LogisticRegression(max_iter=1000, random_state=20))])

In [7]:
pipe_lr.score(X_train, y_train), pipe_lr.score(X_test, y_test)

(0.972704140643072, 0.9403192227619709)

#### Random Forests

In [29]:
pipe_rfc = make_pipeline(
    CountVectorizer(stop_words=custom_words),
    RandomForestClassifier(random_state=20)
)

In [30]:
pipe_rfc.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(stop_words=['running', 'run', 'marathon',
                                             'race', 'runs', 'ran', 'shoes',
                                             'runner', 'runners', 'pain',
                                             'miles', 'mile', 'track', 'post',
                                             'treadmill', 'app', 'title',
                                             'wondering', 'foot', 'pace',
                                             'jogging', 'shoe', 'https',
                                             'thread', 'curious', 'week', '5k',
                                             'haven', 'body', 'atl', ...])),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=20))])

In [31]:
pipe_rfc.score(X_train, y_train), pipe_rfc.score(X_test, y_test)

(0.9752486699051585, 0.9306037473976405)

#### Decision Trees

In [11]:
pipe_dtc = make_pipeline(
    CountVectorizer(stop_words=custom_words),
    DecisionTreeClassifier()
)

In [12]:
pipe_dtc.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(stop_words=['running', 'run', 'marathon',
                                             'race', 'runs', 'ran', 'shoes',
                                             'runner', 'runners', 'pain',
                                             'miles', 'mile', 'track', 'post',
                                             'treadmill', 'app', 'title',
                                             'wondering', 'foot', 'pace',
                                             'jogging', 'shoe', 'https',
                                             'thread', 'curious', 'week', '5k',
                                             'haven', 'body', 'atl', ...])),
                ('decisiontreeclassifier', DecisionTreeClassifier())])

In [13]:
pipe_dtc.score(X_train, y_train), pipe_dtc.score(X_test, y_test)

(0.9752486699051585, 0.9139486467730743)

#### Extra Trees

In [32]:
pipe_etc = make_pipeline(
    CountVectorizer(stop_words=custom_words),
    ExtraTreesClassifier(random_state=20)
)

In [33]:
pipe_etc.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(stop_words=['running', 'run', 'marathon',
                                             'race', 'runs', 'ran', 'shoes',
                                             'runner', 'runners', 'pain',
                                             'miles', 'mile', 'track', 'post',
                                             'treadmill', 'app', 'title',
                                             'wondering', 'foot', 'pace',
                                             'jogging', 'shoe', 'https',
                                             'thread', 'curious', 'week', '5k',
                                             'haven', 'body', 'atl', ...])),
                ('extratreesclassifier',
                 ExtraTreesClassifier(random_state=20))])

In [16]:
pipe_etc.score(X_train, y_train), pipe_etc.score(X_test, y_test)

(0.9752486699051585, 0.9354614850798056)

#### K Nearest Neighbors

In [17]:
pipe_knn = make_pipeline(
    CountVectorizer(stop_words=custom_words),
    KNeighborsClassifier()
)

In [18]:
pipe_knn.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(stop_words=['running', 'run', 'marathon',
                                             'race', 'runs', 'ran', 'shoes',
                                             'runner', 'runners', 'pain',
                                             'miles', 'mile', 'track', 'post',
                                             'treadmill', 'app', 'title',
                                             'wondering', 'foot', 'pace',
                                             'jogging', 'shoe', 'https',
                                             'thread', 'curious', 'week', '5k',
                                             'haven', 'body', 'atl', ...])),
                ('kneighborsclassifier', KNeighborsClassifier())])

In [19]:
pipe_knn.score(X_train,y_train), pipe_knn.score(X_test,y_test)

(0.9044644922507518, 0.8660652324774463)

#### Naive Bayes



In [22]:
nb = GaussianNB()
nb.fit(X_train_cv.A, y_train)

GaussianNB()

In [23]:
nb.score(X_train_cv.A, y_train), nb.score(X_test_cv.A, y_test)

(0.885264862364099, 0.8147120055517002)

#### Voting Classifier

In [21]:
cv= CountVectorizer(stop_words=custom_words)
cv.fit(X_train)
X_train_cv= cv.transform(X_train)
X_test_cv= cv.transform(X_test)

In [34]:
vc1 = VotingClassifier(
    [
        ('lr',LogisticRegression(random_state=20)),
        ('etc',ExtraTreesClassifier(random_state=20)),
        ('rfc', RandomForestClassifier(random_state=20))
    ],
    n_jobs=-1
    
)

In [35]:
vc1.fit(X_train_cv,y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=20)),
                             ('etc', ExtraTreesClassifier(random_state=20)),
                             ('rfc', RandomForestClassifier(random_state=20))],
                 n_jobs=-1)

In [36]:
vc1.score(X_train_cv, y_train), vc1.score(X_test_cv, y_test)

(0.9752486699051585, 0.9424011103400416)

#### Boosting

#### Ada Boost

In [52]:
from sklearn.ensemble import AdaBoostClassifier

pipe_abc = make_pipeline(
    CountVectorizer(stop_words=custom_words),
    AdaBoostClassifier()
)

pipe_abc.fit(X_train,y_train)

pipe_abc.score(X_train,y_train), pipe_abc.score(X_test,y_test)

(0.9403192227619709, 0.9312977099236641)

#### Gradient Boost

In [49]:
pipe_gbc = make_pipeline(
    CountVectorizer(stop_words=custom_words),
    GradientBoostingClassifier()
)

pipe_gbc.fit(X_train,y_train)
pipe_gbc.score(X_train,y_train), pipe_gbc.score(X_test,y_test)

(0.936618089289845, 0.9278278972935462)

#### Pickling & Exporting for later use

In [37]:
with open('./pickles/pipe_lr.pkl', 'wb') as pickle_out:
    pickle.dump(pipe_lr, pickle_out)

In [38]:
with open('./pickles/pipe_rfc.pkl', 'wb') as pickle_out:
    pickle.dump(pipe_rfc, pickle_out)

In [39]:
with open('./pickles/pipe_dtc.pkl', 'wb') as pickle_out:
    pickle.dump(pipe_dtc, pickle_out)

In [40]:
with open('./pickles/pipe_etc.pkl', 'wb') as pickle_out:
    pickle.dump(pipe_etc, pickle_out)

In [41]:
with open('./pickles/pipe_knn.pkl', 'wb') as pickle_out:
    pickle.dump(pipe_knn, pickle_out)

In [42]:
with open('./pickles/vc1.pkl', 'wb') as pickle_out:
    pickle.dump(vc1, pickle_out)

In [50]:
with open('./pickles/pipe_abc.pkl', 'wb') as pickle_out:
    pickle.dump(pipe_abc, pickle_out)

In [51]:
with open('./pickles/pipe_gbc.pkl', 'wb') as pickle_out:
    pickle.dump(pipe_gbc, pickle_out)