# Using Reddit's API for Predicting Comments - Part 2
# Classification

## Import data saved in csv files

In [1]:
import pandas as pd
import numpy as np
list_of_posts_board = pd.DataFrame.from_csv("./list_of_posts_boardgame.csv")

list_of_posts_truered = pd.DataFrame.from_csv("./list_of_posts_true_reddit.csv")




In [2]:
len(list_of_posts_board), len(list_of_posts_truered)

(940, 963)

In [3]:
import ast

In [4]:
list_of_posts_board_extract = []
for post in list_of_posts_board['data'][1:]:
    post = ast.literal_eval(post)
    if post['selftext'] == '':
        continue
    else:
        list_of_posts_board_extract.append(post['selftext'])

In [5]:
len(list_of_posts_board_extract)

812

#### Class 0 for "True Reddits" and 1 for "Board Games"

In [6]:
pd_board = pd.DataFrame(list_of_posts_board_extract, columns=['post'])
pd_board['class'] = 1
pd_board.head()

Unnamed: 0,post,class
0,**Welcome to /r/boardgames Daily Discussion an...,1
1,I love board games and have a small collection...,1
2,"I read the rules, I watch a game play video bu...",1
3,I love the game and am finally looking at gett...,1
4,I've been playing these two games a lot lately...,1


In [7]:
list_of_posts_truered_extract = []
for post in list_of_posts_truered['data'][1:]:
    post = ast.literal_eval(post)
#    post_ext = {}
    if post['title'] == '':
        continue
    else:
#        post_ext['selftext'] = post['selftext']
        
        list_of_posts_truered_extract.append(post['title'])

In [8]:
len(list_of_posts_truered_extract)

962

In [9]:
pd_truered = pd.DataFrame(list_of_posts_truered_extract, columns=['post'])
pd_truered['class'] = 0
pd_truered.head()

Unnamed: 0,post,class
0,Billionaires v teachers: the Koch brothers' pl...,0
1,Jose Manuel Martinez — a doting grandfather kn...,0
2,How Electric Scooters Are Reshaping Cities,0
3,Taylor Swift and the Cult of Early Success,0
4,Richard Sackler became a multi-billionaire aft...,0


In [10]:
# Combine two lists together
list_concat = pd.concat([pd_board, pd_truered])

In [11]:
list_concat.drop('class', axis = 1).head()

Unnamed: 0,post
0,**Welcome to /r/boardgames Daily Discussion an...
1,I love board games and have a small collection...
2,"I read the rules, I watch a game play video bu..."
3,I love the game and am finally looking at gett...
4,I've been playing these two games a lot lately...


In [138]:
list_concat.shape

(1774, 2)

In [139]:
list_concat.head(10)

Unnamed: 0,post,class
0,**Welcome to /r/boardgames Daily Discussion an...,1
1,I love board games and have a small collection...,1
2,"I read the rules, I watch a game play video bu...",1
3,I love the game and am finally looking at gett...,1
4,I've been playing these two games a lot lately...,1
5,Hey guys. Looking to get into either of these ...,1
6,[Here is the mission card](https://ksr-ugc.img...,1
7,https://i.imgur.com/ac4WAMK.jpg\n(Sorry for po...,1
8,"One of the minor secrets, the MultiPass, reads...",1
9,I'm not in a habit of sleeving my cards so for...,1


## NLP

#### Use CountVectorizer and TfidfVectorizer from scikit-learn to create features from the thread contents

In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
cvec = CountVectorizer()
tfidf = TfidfVectorizer()



In [14]:
rfc = RandomForestClassifier()

In [15]:
cvec_trans = cvec.fit_transform(list_concat['post'])
tfidf_trans = tfidf.fit_transform(list_concat['post'])

In [16]:
X_cvec = pd.DataFrame(cvec_trans.todense(), columns = cvec.get_feature_names())
X_tfidf = pd.DataFrame(tfidf_trans.todense(), columns = tfidf.get_feature_names())

In [149]:
X_cvec.iloc[:,2000:4000].head()

Unnamed: 0,atlantis,atlas,atleast,atmosphere,atmospheric,atomwaffen,atrocities,attached,attachment,attack,...,delve,delvers,delving,demand,demands,dementia,demigods,demo,democracy,democratic
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [162]:
X_tfidf.iloc[:,2000:4000].head(5)

Unnamed: 0,atlantis,atlas,atleast,atmosphere,atmospheric,atomwaffen,atrocities,attached,attachment,attack,...,delve,delvers,delving,demand,demands,dementia,demigods,demo,democracy,democratic
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Predicting subreddit using Random Forest Classification

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [18]:
X_cvec.shape, X_tfidf.shape, list_concat.shape

((1774, 13170), (1774, 13170), (1774, 2))

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_cvec,list_concat['class'], test_size = 0.25, random_state = 42)

In [20]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X_tfidf,list_concat['class'], test_size = 0.25, random_state = 42)

#### Baseline accuracy for this model

In [21]:
list_concat['class'].value_counts()

0    962
1    812
Name: class, dtype: int64

* For this particular dataset, our baseline accuracy should be proportional to our data blance: 969/(969+811) = 54.4%

####  RandomForestClassifier model

In [156]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_train_predict = rf.predict(X_train)
print("train accuracy score: ", rf.score(X_train, y_train).round(4))
print("test accuracy score: ", rf.score(X_test, y_test).round(4))

train accuracy score:  0.997
test accuracy score:  0.9392


In [157]:
rf1 = RandomForestClassifier(random_state=42)
rf1.fit(X1_train, y1_train)
y1_train_predict = rf1.predict(X1_train)
print("train accuracy score: ", rf1.score(X1_train, y1_train).round(4))
print("test accuracy score: ", rf1.score(X1_test, y1_test).round(4))

train accuracy score:  0.997
test accuracy score:  0.9234


#### Cross-validate the model

In [24]:
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import classification_report

In [25]:
cross_scores_rf = cross_val_score(RandomForestClassifier(random_state=42), X_train, y_train)
print(cross_scores_rf.mean().round(4))

y_test_predict = rf.predict(X_test)
report_test = classification_report(y_test, y_test_predict)
print(report_test)

0.9376
             precision    recall  f1-score   support

          0       0.90      1.00      0.95       237
          1       0.99      0.87      0.93       207

avg / total       0.94      0.94      0.94       444



In [26]:
cross_scores_rf1 = cross_val_score(RandomForestClassifier(random_state=42), X1_train, y1_train)
print(cross_scores_rf1.mean().round(4))

y1_test_predict = rf1.predict(X1_test)
report_test1 = classification_report(y1_test, y1_test_predict)
print(report_test1)

0.9316
             precision    recall  f1-score   support

          0       0.88      1.00      0.93       237
          1       0.99      0.84      0.91       207

avg / total       0.93      0.92      0.92       444



* CountVectorizer and Tfidf create similar results and Tfidf vectorizer is sightly better in recall.

#### Use GridSearchCV with Pipeline to optimize results

In [129]:
from sklearn.metrics import make_scorer, accuracy_score

score_function = make_scorer(accuracy_score)
params = {
    "cvec__stop_words"  : [None, 'english'],
    "cvec__ngram_range" : [(1,1), (1,2)],
    "rf__n_estimators"  : [40, 60, 80],
    "rf__criterion"     : ["gini", "entropy"],
    "rf__max_depth"     : [30, 50, 70],
    "rf__random_state"  : [42]
             }
steps = [('cvec', CountVectorizer()), ('rf', RandomForestClassifier())]
pipe = Pipeline(steps = steps)
gs = GridSearchCV(pipe, param_grid = params , scoring = score_function, verbose=1)

In [130]:
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(list_concat['post'], list_concat['class'], test_size = 0.3, random_state = 42)

In [131]:
gs.fit(X_train_rf, y_train_rf)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=1)]: Done 216 out of 216 | elapsed:  2.3min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'cvec__stop_words': [None, 'english'], 'cvec__ngram_range': [(1, 1), (1, 2)], 'rf__n_estimators': [40, 60, 80], 'rf__criterion': ['gini', 'entropy'], 'rf__max_depth': [30, 50, 70], 'rf__random_state': [42]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(accuracy_score), verbose=1)

In [132]:
gs.best_params_

{'cvec__ngram_range': (1, 1),
 'cvec__stop_words': None,
 'rf__criterion': 'gini',
 'rf__max_depth': 50,
 'rf__n_estimators': 60,
 'rf__random_state': 42}

In [158]:
gs.best_score_.round(4)

0.9678

In [159]:
gs.score(X_test_rf, y_test_rf).round(4)

0.9512

In [154]:
y_test_predict = gs.predict(X_test_rf)
report_test = classification_report(y_test_rf, y_test_predict)
print(report_test)

             precision    recall  f1-score   support

          0       0.93      0.99      0.96       294
          1       0.98      0.91      0.94       239

avg / total       0.95      0.95      0.95       533



## Logistic Regression Model with Hyperparmeter tuning

In [63]:
from sklearn.linear_model import LogisticRegression

In [122]:
score_function = make_scorer(accuracy_score)
params_lr = {
    "cvec__stop_words"  : [None, 'english'],
    "cvec__max_df"      : [1.0, 3.0],
    "cvec__ngram_range" : [(1,1), (1,2)],
    "cvec__lowercase"   : ['True', 'False'],
    "logic__C"  : [0.1 ,1 , 5],
    "logic__max_iter"     : [5, 10, 20],
    "logic__random_state"  : [42]
             }
steps_lr = [('cvec', CountVectorizer()), ('logic', LogisticRegression())]
pipe_lr = Pipeline(steps = steps_lr)
gs_lr = GridSearchCV(pipe_lr, param_grid = params_lr , scoring = score_function, verbose=1)

In [123]:
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(list_concat['post'], list_concat['class'], test_size = 0.3, random_state = 42)

In [124]:
gs_lr.fit(X_train_lr, y_train_lr)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  2.2min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'cvec__stop_words': [None, 'english'], 'cvec__max_df': [1.0, 3.0], 'cvec__ngram_range': [(1, 1), (1, 2)], 'cvec__lowercase': ['True', 'False'], 'logic__C': [0.1, 1, 5], 'logic__max_iter': [5, 10, 20], 'logic__random_state': [42]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(accuracy_score), verbose=1)

In [1]:
gs_lr.best_params_

NameError: name 'gs_lr' is not defined

In [160]:
gs_lr.best_score_.round(4)

0.9686

In [161]:
gs_lr.score(X_test_lr, y_test_lr).round(4)

0.9606

In [155]:
y_test_predict = gs.predict(X_test_lr)
report_test = classification_report(y_test_lr, y_test_predict)
print(report_test)

             precision    recall  f1-score   support

          0       0.93      0.99      0.96       294
          1       0.98      0.91      0.94       239

avg / total       0.95      0.95      0.95       533



# Executive Summary


Classify your Subreddit Post by Machine Learning!

There are hundreds and thousands of posts per day on reddit.com and we definitely want computers to be able to “recognize” each post by its subreddit category for multiple usages!
But how?

Machine learning algorithms can easily help! In this project, we showcase how this is done by machine learning. Two popular kinds of subreddit posts are scrapped from reddit.com: Boardgames and True reddits. NLP is used next to analyze the word frequency for each post and a model is built on top of that to predict which category the post belongs to. The parameters are fine tuned later to optimize results. 96 of 100 posts can be categorized correctly.

I believe there are more we can work to expand the results to a more diverse and in depth application. 
