# Using Reddit's API for Predicting Comments - Part 2
# Classification

## Import data saved in csv files

In [None]:
import pandas as pd
import numpy as np
# Load the saved posts for each subreddit.
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv with index_col=0 and parse_dates=True reproduces its defaults.
list_of_posts_board = pd.read_csv(
    "./list_of_posts_boardgame.csv", index_col=0, parse_dates=True
)

list_of_posts_truered = pd.read_csv(
    "./list_of_posts_true_reddit.csv", index_col=0, parse_dates=True
)

In [None]:
# Number of rows loaded for each subreddit (Board Games, True Reddit).
len(list_of_posts_board), len(list_of_posts_truered)

In [None]:
import ast

In [None]:
# Collect the 'selftext' of every Board Games post, leaving out posts whose
# selftext is empty.  Each entry of the 'data' column is stored as the text of
# a Python literal, so it is parsed with ast.literal_eval before the lookup.
# The first row of the frame is skipped.
list_of_posts_board_extract = [
    body
    for body in (
        ast.literal_eval(raw)['selftext']
        for raw in list_of_posts_board['data'][1:]
    )
    if body != ''
]

In [None]:
# How many Board Games posts remain after dropping empty self-texts.
len(list_of_posts_board_extract)

#### Class 0 for "True Reddits" and 1 for "Board Games"

In [None]:
# One row per Board Games post text, labelled with class 1.
pd_board = pd.DataFrame({'post': list_of_posts_board_extract}).assign(**{'class': 1})
pd_board.head()

In [None]:
# Collect the 'title' of every True Reddit post, leaving out posts whose title
# is empty.  Each entry of the 'data' column is parsed from its Python-literal
# text with ast.literal_eval first.  The first row of the frame is skipped.
list_of_posts_truered_extract = [
    title
    for title in (
        ast.literal_eval(raw)['title']
        for raw in list_of_posts_truered['data'][1:]
    )
    if title != ''
]

In [None]:
# How many True Reddit posts remain after dropping empty titles.
len(list_of_posts_truered_extract)

In [None]:
# One row per True Reddit post title, labelled with class 0.
pd_truered = pd.DataFrame({'post': list_of_posts_truered_extract}).assign(**{'class': 0})
pd_truered.head()

In [None]:
# Stack the Board Games rows on top of the True Reddit rows.
# ignore_index=True gives the result a fresh 0..n-1 index.  Without it both
# halves keep their own 0-based labels, so index labels repeat and any later
# label-based lookup or alignment on list_concat becomes ambiguous.
list_concat = pd.concat([pd_board, pd_truered], ignore_index=True)

In [None]:
# Preview the combined frame without the label column; the result is not
# assigned, so list_concat itself keeps its 'class' column.
list_concat.drop('class', axis = 1).head()

In [None]:
# (rows, columns) of the combined frame.
list_concat.shape

In [None]:
# First ten rows of the combined frame.
list_concat.head(10)

## NLP
#### Use CountVectorizer and TfidfVectorizer from scikit-learn to create features from the thread contents

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20.
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Vectorizers with default settings, used below to turn post text into
# word-count and TF-IDF feature matrices.
cvec = CountVectorizer()
tfidf = TfidfVectorizer()

  from numpy.core.umath_tests import inner1d


In [None]:
# Random forest with default parameters.
# NOTE(review): `rfc` is not referenced by any cell visible in this export;
# the models fitted below are created separately as `rf` / `rf1` — confirm
# whether this instance is still needed.
rfc = RandomForestClassifier()

In [None]:
# Fit each vectorizer on the full 'post' column and keep the resulting
# feature matrices.
_corpus = list_concat['post']
cvec_trans = cvec.fit_transform(_corpus)
tfidf_trans = tfidf.fit_transform(_corpus)

In [None]:
def _feature_names(vectorizer):
    """Return the vocabulary terms of a fitted vectorizer, in column order.

    scikit-learn 1.0 introduced ``get_feature_names_out`` and 1.2 removed
    ``get_feature_names``.  The new method is preferred and the old one is
    used as a fallback, so this cell runs on either side of that change.
    """
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return getter()


# Expand the vectorizer outputs into dense DataFrames with one column per
# term.  toarray() yields a plain ndarray, whereas todense() returns the
# legacy np.matrix type; the values are the same.
X_cvec = pd.DataFrame(cvec_trans.toarray(), columns=_feature_names(cvec))
X_tfidf = pd.DataFrame(tfidf_trans.toarray(), columns=_feature_names(tfidf))

In [None]:
# Peek at the count features: first rows, column positions 2000-3999.
X_cvec.iloc[:,2000:4000].head()

In [None]:
# Same slice of the TF-IDF features: first five rows, column positions 2000-3999.
X_tfidf.iloc[:,2000:4000].head(5)

## Predicting subreddit using Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [None]:
# Shapes of both feature frames and of the combined post/label frame.
X_cvec.shape, X_tfidf.shape, list_concat.shape

In [None]:
# Hold out 25% of the count-vectorized rows for testing (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_cvec,
    list_concat['class'],
    test_size=0.25,
    random_state=42,
)

In [None]:
# Hold out 25% of the TF-IDF rows for testing (same seed as the count split).
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X_tfidf,
    list_concat['class'],
    test_size=0.25,
    random_state=42,
)

#### Baseline accuracy for this model

In [None]:
# Number of posts in each class.
list_concat['class'].value_counts()

* For this particular dataset, our baseline accuracy should equal the majority-class share of the data: 969/(969+811) = 54.4%

####  RandomForestClassifier model

In [None]:
# Random forest on the count features; fit() returns the estimator itself,
# so construction and fitting are chained.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_train_predict = rf.predict(X_train)

# Accuracy on the training rows and on the held-out rows.
print(
    "train accuracy score: ",
    rf.score(X_train, y_train).round(4),
)
print(
    "test accuracy score: ",
    rf.score(X_test, y_test).round(4),
)

In [None]:
# Random forest on the TF-IDF features; fit() returns the estimator itself,
# so construction and fitting are chained.
rf1 = RandomForestClassifier(random_state=42).fit(X1_train, y1_train)
y1_train_predict = rf1.predict(X1_train)

# Accuracy on the training rows and on the held-out rows.
print(
    "train accuracy score: ",
    rf1.score(X1_train, y1_train).round(4),
)
print(
    "test accuracy score: ",
    rf1.score(X1_test, y1_test).round(4),
)

In [None]:
#### Cross-validate the model

In [None]:
# cross_val_score lives in sklearn.model_selection; the old
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20.
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

In [None]:
# Cross-validate a fresh random forest on the count-feature training rows.
cross_scores_rf = cross_val_score(
    RandomForestClassifier(random_state=42),
    X_train,
    y_train,
)

# Per-class precision/recall/F1 of the already-fitted `rf` on the test rows.
y_test_predict = rf.predict(X_test)
report_test = classification_report(y_test, y_test_predict)

# Mean cross-validation score first, then the test report.
print(cross_scores_rf.mean().round(4))
print(report_test)

In [None]:
# Cross-validate a fresh random forest on the TF-IDF training rows.
cross_scores_rf1 = cross_val_score(
    RandomForestClassifier(random_state=42),
    X1_train,
    y1_train,
)

# Per-class precision/recall/F1 of the already-fitted `rf1` on the test rows.
y1_test_predict = rf1.predict(X1_test)
report_test1 = classification_report(y1_test, y1_test_predict)

# Mean cross-validation score first, then the test report.
print(cross_scores_rf1.mean().round(4))
print(report_test1)

In [None]:
* CountVectorizer and TfidfVectorizer give similar results; the TF-IDF features are slightly better in recall.

#### Use GridSearchCV with Pipeline to optimize results

In [None]:
# Split the raw post text (not the vectorized features) 70/30 with a fixed seed.
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(
    list_concat['post'],
    list_concat['class'],
    test_size=0.3,
    random_state=42,
)

In [None]:
# NOTE(review): `gs` is not defined in any cell visible in this export —
# presumably a GridSearchCV over a vectorizer + random-forest Pipeline built
# in a cell that is missing here; confirm before running.
gs.fit(X_train_rf, y_train_rf)

In [None]:
# Presumably the best parameter combination found by the search (assumes
# `gs` is a fitted GridSearchCV — it is not defined in the visible cells).
gs.best_params_

In [None]:
# Presumably the best cross-validated score of the search, rounded to four
# decimals (assumes `gs` is a fitted GridSearchCV — confirm).
gs.best_score_.round(4)

In [None]:
# Score of `gs` on the held-out 30% split, rounded to four decimals.
gs.score(X_test_rf, y_test_rf).round(4)

In [None]:
# Classification report for `gs` on the held-out split.
# Note: this rebinds y_test_predict and report_test from the earlier
# random-forest cell.
y_test_predict = gs.predict(X_test_rf)
report_test = classification_report(y_test_rf, y_test_predict)
print(report_test)

## Logistic Regression Model with Hyperparameter tuning

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Imported here so the cell is self-contained: no cell visible in this
# notebook export imports these two names.
from sklearn.metrics import accuracy_score, make_scorer

# Plain accuracy is the metric the grid search selects on.
score_function = make_scorer(accuracy_score)

# Hyper-parameter grid; keys use the Pipeline "<step>__<parameter>" form.
params_lr = {
    "cvec__stop_words": [None, 'english'],
    # A float max_df is a proportion of documents and must lie in [0.0, 1.0].
    # The former extra value 3.0 is rejected by scikit-learn versions that
    # validate parameters, and where it was accepted it filtered nothing,
    # exactly like 1.0, so it only doubled the search cost.
    "cvec__max_df": [1.0],
    "cvec__ngram_range": [(1, 1), (1, 2)],
    # Real booleans: the strings 'True' and 'False' are both truthy, so the
    # previous grid never actually switched lower-casing off.
    "cvec__lowercase": [True, False],
    "logic__C": [0.1, 1, 5],
    "logic__max_iter": [5, 10, 20],
    "logic__random_state": [42],
}

# Count-vectorize the text, then classify with logistic regression.
steps_lr = [('cvec', CountVectorizer()), ('logic', LogisticRegression())]
pipe_lr = Pipeline(steps=steps_lr)
gs_lr = GridSearchCV(pipe_lr, param_grid=params_lr, scoring=score_function, verbose=1)

In [None]:
# Split the raw post text 70/30 with a fixed seed for the logistic-regression search.
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(
    list_concat['post'],
    list_concat['class'],
    test_size=0.3,
    random_state=42,
)

In [None]:
# Run the logistic-regression grid search on the training text.
gs_lr.fit(X_train_lr, y_train_lr)

In [None]:
# Parameter combination selected by the logistic-regression grid search.
gs_lr.best_params_

In [None]:
# Best cross-validated score of the search, rounded to four decimals.
gs_lr.best_score_.round(4)

In [None]:
# Score of the selected pipeline on the held-out 30% split, rounded to four decimals.
gs_lr.score(X_test_lr, y_test_lr).round(4)

In [None]:
# Classification report for the logistic-regression search on its held-out
# split.  The predictions must come from gs_lr: this cell previously called
# gs.predict, so the report described the random-forest search rather than
# the logistic-regression model this section evaluates.
y_test_predict = gs_lr.predict(X_test_lr)
report_test = classification_report(y_test_lr, y_test_predict)
print(report_test)

# Executive Summary

In [None]:
Classify your Subreddit Post by Machine Learning!

There are hundreds of thousands of posts per day on reddit.com, and we want computers to be able to "recognize" the subreddit category of each post, which has many uses!
But how?

Machine learning algorithms can easily help! In this project, we showcase how this is done with machine learning. Posts from two popular subreddits are scraped from reddit.com: Board Games and True Reddit. NLP is then used to analyze the word frequencies of each post, and a model is built on top of that to predict which category a post belongs to. The parameters are later fine-tuned to optimize the results. About 96 out of 100 posts are categorized correctly.

This project used logistic regression and a random forest classifier as classification models, and both gave very high accuracy scores of 95% ~ 96%. Logistic regression is slightly better, but the difference is so minor that the two are essentially equivalent in terms of accuracy. Optimization using GridSearchCV is very useful but computationally expensive, so only a minimal optimization was carried out; good agreement between train and test scores was achieved. The precision, recall and f1-score are listed as a reference, but accuracy is our target metric because we do not need to favor either false negatives or false positives in this particular case.

I believe there is more we can do to extend these results to more diverse and in-depth applications.
