# Using Reddit's API for Predicting Subreddit Posts

### Scraping Thread Info from Reddit.com

In [1]:
### importing requests, json packages for web scraping
import requests
import json
import pandas as pd

In [2]:
### subreddits of interest
# Both listing endpoints share one URL shape; only the subreddit name differs
REDDIT_LISTING_URL = "http://www.reddit.com/r/{}.json"
URL1 = REDDIT_LISTING_URL.format("fakehistoryporn")
URL2 = REDDIT_LISTING_URL.format("NatureIsFuckingLit")

In [3]:
### Checking HTTP status code, if good to go - 200 means good
# A timeout is passed explicitly: without one, requests waits indefinitely
# if the server accepts the connection but never answers.
res1 = requests.get(URL1, headers={'User-agent': 'Abhi Bot 0.1'}, timeout=30)
res2 = requests.get(URL2, headers={'User-agent': 'Abhi Bot 0.2'}, timeout=30)
# Printing a Response object shows its status code, e.g. <Response [200]>
print(res1)
print(res2)

<Response [200]>
<Response [200]>


In [4]:
# ### Initializing the unstructured data
# data1 = res1.json()
# data2 = res2.json()

In [5]:
# ### Exploring the unstructured data
# print(sorted(data1.keys()))
# print(sorted(data2.keys()))
# print(sorted(data1['data'].keys()))
# print(sorted(data2['data'].keys()))
# print(sorted(data1['data']['children'][1]['data'].keys()))

### Subreddit1 - fakehistoryporn

In [6]:
# ### Scrapping->Pandas->CSV for fakehistoryporn
# import time
# url = "http://www.reddit.com/r/fakehistoryporn.json"
# all_posts =[]
# for _ in range(30): 
#     # construct a list of 1000
    
#     # Get the posts by hitting the url, put it in json and store it
#     res1 = requests.get(url, headers={'User-agent': 'boo007'})
#     data1 = res1.json()
    
#     # save only the posts out of the json into the list_of_posts, then
#     # add all the posts to the all_posts list
#     list_of_posts = data1['data']['children']
#     data1_posts = [post['data'] for post in list_of_posts]
#     all_posts.extend(data1_posts)
   
#     # reassign the after to the current 'after', and then update the url to hit
#     after = data1['data']['after']
#     url =  'http://www.reddit.com/r/fakehistoryporn.json?after=' + after
    
#     # go to sleep for 3 seconds so you do not overwhelm reddit and get kicked out
#     print('The current after: ', after)
#     time.sleep(3)

# # now put the list of post dicts, where each dict is a row,
# # straight into a dataframe
# df1 = pd.DataFrame(all_posts, columns = ['author' ,'domain' ,'num_comments' ,'score' ,'selftext' ,'subreddit' ,'title' ,'ups'])
# df1.head()

In [7]:
# df1.shape

In [8]:
# ### Saving our results as a csv
# pd.DataFrame(all_posts).to_csv('./test.csv', index = False)

### Subreddit2 - NatureIsFuckingLit

In [9]:
# ### Scrapping->Pandas->CSV for NatureIsFuckingLit
# import time
# url = "http://www.reddit.com/r/NatureIsFuckingLit.json"
# all_posts2 =[]
# for _ in range(30): 
#     # construct a list of 1000
    
#     # Get the posts by hitting the url, put it in json and store it
#     res2 = requests.get(url, headers={'User-agent': 'boo007'})
#     data2 = res2.json()
    
#     # save only the posts out of the json into the list_of_posts, then
#     # add all the posts to the all_posts list
#     list_of_posts = data2['data']['children']
#     data2_posts = [post['data'] for post in list_of_posts]
#     all_posts2.extend(data2_posts)
   
#     # reassign the after to the current 'after', and then update the url to hit
#     after = data2['data']['after']
#     url =  'http://www.reddit.com/r/NatureIsFuckingLit.json?after=' + after
    
#     # go to sleep for 3 seconds so you do not overwhelm reddit and get kicked out
#     print('The current after: ', after)
#     time.sleep(3)

# # now put the list of post dicts, where each dict is a row,
# # straight into a dataframe
# df2 = pd.DataFrame(all_posts2,columns = ['author' ,'domain' ,'num_comments' ,'score' ,'selftext' ,'subreddit' ,'title' ,'ups'])
# df2.head()

In [10]:
# ### Saving our results as a csv
# pd.DataFrame(all_posts2).to_csv('./test2.csv', index = False)

In [11]:
# df2.shape

## Importing Data and Data Analysis

In [12]:
import pandas as pd
# Load the posts saved by the (now commented-out) scraping cells above
FAKEHISTORY_CSV = './test.csv'   # fakehistoryporn
NATURE_CSV = './test2.csv'       # natureisfuckinlit
df1 = pd.read_csv(FAKEHISTORY_CSV)
df2 = pd.read_csv(NATURE_CSV)

In [13]:
# Stack the rows of the two subreddits into one frame.
# ignore_index=True gives the combined frame unique row labels (otherwise each
# half keeps its own 0..n-1 labels and label-based lookups become ambiguous).
# sort=False silences the pandas FutureWarning about sorting the
# non-concatenation axis; column order is fixed explicitly on the next line.
df = pd.concat([df1, df2], axis=0, ignore_index=True, sort=False)
# Keep only the columns used for modelling, in a fixed order
df = df.reindex(columns=['domain', 'selftext', 'subreddit', 'title'])
df.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,domain,selftext,subreddit,title
0,discord.gg,,fakehistoryporn,Fake History Porn now has an Discord server!
1,self.fakehistoryporn,This one was a really difficult one to judge b...,fakehistoryporn,Lets get this over with; competition results
2,i.redd.it,,fakehistoryporn,Japan joins the axis (1940 colorised)
3,i.redd.it,,fakehistoryporn,Nazi Luftwaffe pilot marks his kills from the ...
4,i.redd.it,,fakehistoryporn,Kurt Cobain’s famous MTV Unplugged session (1993)


In [14]:
df.shape

(1503, 4)

In [15]:
df.isnull().sum()[df.isnull().sum() > 0]

selftext    1499
dtype: int64

In [16]:
df['selftext'] = df['selftext'].fillna('')

#### We want to predict a binary variable - class `0` for 'natureisfuckinlit' and `1` for 'fakehistoryporn'

In [17]:
# Encode the target: 1 for fakehistoryporn, 0 for everything else
is_fake_history = df['subreddit'] == 'fakehistoryporn'
df['subreddit'] = is_fake_history.astype('int64')

#### Setting our feature- X and target- y

In [18]:
from sklearn.model_selection import train_test_split
# Join title and body with a space so the last word of the title and the first
# word of the selftext are not fused into a single bogus token.
X = df['title'] + ' ' + df['selftext']
# Binary target produced by the encoding cell above
y = df['subreddit']

#### Value Counts for Balanced Classes

In [19]:
# Class balance check; uses the Series method (the top-level pd.value_counts is
# deprecated) to match the y_test.value_counts() call used later on
y.value_counts()

1    752
0    751
Name: subreddit, dtype: int64

#### Train-Test Split the Data

In [20]:
# Shuffled, stratified 85/15 split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
    shuffle=True,
    stratify=y,
)

## NLP and Classification Modelling for Subreddits

We use the CountVectorizer and the TfidfVectorizer. The CountVectorizer simply counts word frequencies. The TfidfVectorizer also counts word (term) frequencies, but weights them relative to how frequently each term appears across the documents.

#### Importing CountVectorizer and transforming our feature- train/test

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
cvec = CountVectorizer(stop_words="english")
# Learn the vocabulary from the training split only
cvec.fit(X_train)
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides, and look
# the names up once instead of once per use.
_get_feature_names = getattr(cvec, 'get_feature_names_out', None) or cvec.get_feature_names
feature_names = list(_get_feature_names())
print(len(feature_names))
# toarray() yields a plain ndarray (todense() returns the discouraged np.matrix)
X_train_cv = pd.DataFrame(cvec.transform(X_train).toarray(), columns=feature_names)
X_test_cv = pd.DataFrame(cvec.transform(X_test).toarray(), columns=feature_names)

4241


In [22]:
# Vocabulary terms, used below as column labels for the coefficient table.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back to the old name on older versions.
columns = list((getattr(cvec, 'get_feature_names_out', None) or cvec.get_feature_names)())

In [23]:
X_train_cv.shape

(1277, 4241)

In [24]:
X_test_cv.shape

(226, 4241)

In [25]:
## Importing more Packages
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#### Logistic regression on Dataset

In [26]:
# Fit a default logistic regression on the count-vectorized training features
lr = LogisticRegression().fit(X_train_cv, y_train)
y_pred = lr.predict(X_test_cv)
# Both lines report test accuracy: once from the predictions, once via the estimator
test_accuracy = accuracy_score(y_test, y_pred)
print('accuracy score', test_accuracy)
print('accuracy score', lr.score(X_test_cv, y_test))

accuracy score 0.9380530973451328
accuracy score 0.9380530973451328


#### Analyzing Keywords via Beta Coefficients

In [27]:
# One row of fitted coefficients, one column per vocabulary term
lr_coef = pd.DataFrame(data=lr.coef_, columns=columns)
lr_coef

Unnamed: 0,000,01,03,08,09,10,100,100m,1011,1034108552758087682,...,zhangjiajie,zombie,zoo,zoom,zoomed,zoooooom,zootopia,çağırankaya,émile,état
0,0.061594,0.015318,0.001977,-0.04791,0.001977,0.101055,-0.068614,0.132381,0.240063,-0.038274,...,-0.027935,0.052937,-0.235828,-0.180699,-0.048343,-0.075797,0.200985,-0.107912,0.235324,0.329777


In [28]:
# Transpose so each term is a row, then rank terms from largest to smallest coefficient
df_coef = lr_coef.transpose().sort_values(by=0, ascending=False)
df_coef

Unnamed: 0,0
2018,3.354420
colorized,2.999919
circa,2.806465
2017,1.962373
1944,1.620474
2016,1.485514
ad,1.484365
hitler,1.342002
bc,1.328647
colourised,1.323269


#### Confusion Matrix for above

In [29]:
confusion_matrix(y_test, y_pred)

array([[113,   0],
       [ 14,  99]], dtype=int64)

In [30]:
# Label the confusion matrix: rows are the true classes, columns the predicted ones
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm,
    index=['actual nature', 'actual history'],
    columns=['predicted nature', 'predicted history'],
)
cm_df

Unnamed: 0,predicted nature,predicted history
actual nature,113,0
actual history,14,99


#### Logistic regression with CountVectorizer is very accurate at predicting nature (0 misclassified), but it misclassifies 14 history posts; time to look at other models

In [31]:
y_test.value_counts()

1    113
0    113
Name: subreddit, dtype: int64

#### Logistic Regression with TFIDF - in a pipeline

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
# TF-IDF features feeding a default logistic regression, fitted on the raw text
tfidf_step = TfidfVectorizer(stop_words='english')
model = make_pipeline(tfidf_step, LogisticRegression())
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('accuracy score', accuracy_score(y_test, y_pred))

accuracy score 0.9601769911504425


#### Confusion matrix for TFIDF with Logistic Regression

In [33]:
confusion_matrix(y_test, y_pred)

array([[110,   3],
       [  6, 107]], dtype=int64)

In [34]:
# Label the confusion matrix: rows are the true classes, columns the predicted ones
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm,
    index=['actual nature', 'actual history'],
    columns=['predicted nature', 'predicted history'],
)
cm_df

Unnamed: 0,predicted nature,predicted history
actual nature,110,3
actual history,6,107


#### The accuracy has greatly improved for predicting history

#### Logistic Regression with CountVectorizer - Same as above but in a pipeline

In [35]:
# Same CountVectorizer + logistic regression model as before, wrapped in a pipeline
count_step = CountVectorizer(stop_words='english')
model = make_pipeline(count_step, LogisticRegression())
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('accuracy score', accuracy_score(y_test, y_pred))

accuracy score 0.9380530973451328


In [36]:
confusion_matrix(y_test, y_pred)

array([[113,   0],
       [ 14,  99]], dtype=int64)

In [37]:
# Label the confusion matrix: rows are the true classes, columns the predicted ones
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm,
    index=['actual nature', 'actual history'],
    columns=['predicted nature', 'predicted history'],
)
cm_df

Unnamed: 0,predicted nature,predicted history
actual nature,113,0
actual history,14,99


#### RandomForest with CountVectorizer - Gridsearch Params + Pipeline

In [38]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# CountVectorizer + random forest pipeline; forest size and seed are fixed here
rf_model = make_pipeline(
    CountVectorizer(stop_words='english'),
    RandomForestClassifier(n_estimators=7, random_state=42),
)
# The grid is empty, so GridSearchCV only cross-validates the pipeline as configured
#params={'n_estimators' : [5, 7, 10]}
params = {}
gs = GridSearchCV(rf_model, param_grid=params)
gs.fit(X_train, y_train)
y_pred = gs.predict(X_test)
#print('best params', gs.best_params_)
print('accuracy score', accuracy_score(y_test, y_pred))
print('best cv score', gs.best_score_)
print('test score', gs.score(X_test, y_test))

accuracy score 0.9026548672566371
best cv score 0.8566953797963978
test score 0.9026548672566371


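#### Our optimum number of estimators for Random Forest is 7 (the `n_estimators` grid is commented out above, so this run only cross-validates that single setting)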

In [39]:
confusion_matrix(y_test, y_pred)

array([[112,   1],
       [ 21,  92]], dtype=int64)

In [40]:
# Label the confusion matrix: rows are the true classes, columns the predicted ones
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm,
    index=['actual nature', 'actual history'],
    columns=['predicted nature', 'predicted history'],
)
cm_df

Unnamed: 0,predicted nature,predicted history
actual nature,112,1
actual history,21,92


#### RandomForest with TFIDFVectorizer - Gridsearch Params + Pipeline

In [41]:
# TF-IDF + random forest pipeline with the same forest settings as the count-based one
rf_model_2 = make_pipeline(
    TfidfVectorizer(stop_words='english'),
    RandomForestClassifier(n_estimators=7, random_state=42),
)
#params={'n_estimators' : [5, 7, 10]}
# `params` is the grid defined in the CountVectorizer random-forest cell
gs1 = GridSearchCV(rf_model_2, param_grid=params)
gs1.fit(X_train, y_train)
y_pred = gs1.predict(X_test)
print('accuracy score', accuracy_score(y_test, y_pred))
print('best cv score', gs1.best_score_)
print('test score', gs1.score(X_test, y_test))

accuracy score 0.9070796460176991
best cv score 0.8519968676585747
test score 0.9070796460176991


In [42]:
confusion_matrix(y_test, y_pred)

array([[113,   0],
       [ 21,  92]], dtype=int64)

In [43]:
# Label the confusion matrix: rows are the true classes, columns the predicted ones
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm,
    index=['actual nature', 'actual history'],
    columns=['predicted nature', 'predicted history'],
)
cm_df

Unnamed: 0,predicted nature,predicted history
actual nature,113,0
actual history,21,92


#### Our TFIDF Random Forest makes a slightly better prediction than Count Vectorizer Random Forest with 0 misclassifications of nature v/s 1

### SVM Models

In [44]:
from sklearn import svm

#### SVM with CountVectorizer

A standard SVM seeks a margin that separates the positive and negative examples of the two classes. `C` is the parameter of the soft-margin cost function, which controls the influence of each individual support vector; tuning it involves trading error penalty for stability. Gamma is inversely related to the variance of the RBF (Gaussian) kernel: a small gamma means a Gaussian with a large variance.
Our best parameters were C = 10.0 and gamma = 0.1 for the SVM with CountVectorizer, and C = 1.0 and gamma = 1 for the SVM with TFIDF (the values used in the cells below).

In [45]:
# RBF-kernel SVM on the count-vectorized features
svc = svm.SVC(kernel='rbf', C=10.0, gamma=0.1)
svc.fit(X_train_cv, y_train)

SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [46]:
y_pred = svc.predict(X_test_cv)

In [47]:
accuracy_score(y_test, y_pred)

0.911504424778761

In [48]:
confusion_matrix(y_test, y_pred)

array([[113,   0],
       [ 20,  93]], dtype=int64)

In [49]:
# Label the confusion matrix: rows are the true classes, columns the predicted ones
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm,
    index=['actual nature', 'actual history'],
    columns=['predicted nature', 'predicted history'],
)
cm_df

Unnamed: 0,predicted nature,predicted history
actual nature,113,0
actual history,20,93


#### SVM with TFIDF

In [50]:
# TF-IDF features feeding an RBF-kernel SVM, fitted on the raw text
tfidf_svc = svm.SVC(kernel='rbf', C=1, gamma=1)
svm_model = make_pipeline(TfidfVectorizer(stop_words='english'), tfidf_svc)
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

In [51]:
accuracy_score(y_test, y_pred)

0.9646017699115044

In [52]:
confusion_matrix(y_test, y_pred)

array([[111,   2],
       [  6, 107]], dtype=int64)

In [53]:
# Label the confusion matrix: rows are the true classes, columns the predicted ones
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm,
    index=['actual nature', 'actual history'],
    columns=['predicted nature', 'predicted history'],
)
cm_df

Unnamed: 0,predicted nature,predicted history
actual nature,111,2
actual history,6,107


### The SVM and Logistic Regression with TFIDF perform the best among all our models, with the highest accuracy

# Executive Summary


In this project, I analyzed and evaluated Reddit's API for predicting a particular subreddit based on the text titles/text posts in a given thread.
The problem is a binary classification of two subreddits - 1.fakehistoryporn 2.NatureIsFuckingLit
To solve the problem, my workflow included three parts:

Webscraping: Use the requests and json packages to scrape the above 2 subreddits.
NLP and Modelling: Build classification models that make it possible to interpret the importance of text (key word) features (e.g. KNN can’t be used) and choose the one with the highest test accuracy score.
Inferences: Analyze the resulting features (coefficients / key word importance) and find the most important key words that can be used to correctly classify a subreddit post and successfully predict a future subreddit post.

Here are some findings I got from the classification modeling and the natural language processing:

Comparing the TFIDF Vectorizer and the Count Vectorizer, the TFIDF Vectorizer performs better on the feature dataset, which comprises the title and selftext. Count Vectorizer simply counts the number of times a token shows up in the document and uses this value as its weight. TFIDF Vectorizer ensures that the weight assigned to each token depends not only on its frequency in a document but also on how recurrent that term is in the entire corpus. This is key in down-weighting function words like pronouns, which are very common on an opinionated (comment-based) website like Reddit and should not carry a high weight.
HashingVectorizer and CountVectorizer return identical results. The difference between HV and CV is that HV does not store the names of features (and thus is more memory efficient), whereas CV keeps the names of features. Hence, HashingVectorizer was not used.

Key fakehistoryporn tokens include:
colorized, circa, 1944, ad, hitler, bc, 1943, american, battle, german, nazi, 1939, 1968, soldier, roman, president, army
Key NatureIsFuckingLit tokens include:
shark, park, kingfisher, iceland, lightning, coast, crab, turtle, wasp, fish, owl, caterpillar, dragonfly, octopus, bird,
rainbow, nature, sunset, baby, spider, tree, beautiful

The above makes perfect sense for classifying our data accurately.

Three classification models were used to evaluate the corpus: Logistic Regression, Random Forest, and SVM.

With the Logistic approach we get coefficients that estimate how our independent variables (text features) affect our dependent variable (correct subreddit). It is therefore a very good model to use, because the dataset is not very big and the model is not overwhelmed by many features.
The Random Forest model, a bagging method involving an ensemble of trees, is also used for this problem. It performs with very reasonable accuracy by estimating an accurate model by shuffling through the text features.
Using SVM, every text in our dataset is represented as a vector with thousands of dimensions, each one representing the frequency of one of the words of the text. It works very well as our dataset is not very robust. The high-dimensional feature space means it is a great model for avoiding overfitting.

The Logistic and SVM with TFIDF Vectorizer are our best performers with the highest accuracy of post and subreddit classification - nearly 96% accuracy.

Further explorations should incorporate using:
A boosting algorithm for classification like Gradient or Ada Boost. 
Using n-grams to find a phrase instead of a word token for more accurate classification.
Collecting more data to grow the corpus, and re-evaluating the performance of the models, especially the ensembles of trees (bagging and boosting).

--------------------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------------------

We know that a Multinomial Naive Bayes model works very well with text data; this modelling was performed to test that hypothesis.

### Multinomial NB 

#### CV with Naive Bayes

In [54]:
from sklearn.naive_bayes import MultinomialNB

In [55]:
# Multinomial Naive Bayes with default settings on the count-vectorized features
nb = MultinomialNB()
nb.fit(X=X_train_cv, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [56]:
y_pred = nb.predict(X_test_cv)

In [57]:
accuracy_score(y_test, y_pred)

0.9513274336283186

In [58]:
confusion_matrix(y_test, y_pred)

array([[105,   8],
       [  3, 110]], dtype=int64)

In [59]:
# Label the confusion matrix: rows are the true classes, columns the predicted ones
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm,
    index=['actual nature', 'actual history'],
    columns=['predicted nature', 'predicted history'],
)
cm_df

Unnamed: 0,predicted nature,predicted history
actual nature,105,8
actual history,3,110


#### NB with TFIDF

In [60]:
# Use a fresh MultinomialNB in the pipeline. Passing the already-fitted `nb`
# would refit that same object on TF-IDF features when the pipeline is fitted,
# silently overwriting the CountVectorizer model trained earlier.
nb_tfidf = make_pipeline(TfidfVectorizer(stop_words='english'),
                         MultinomialNB())
nb_tfidf.fit(X_train, y_train)
y_pred = nb_tfidf.predict(X_test)

In [61]:
accuracy_score(y_test, y_pred)

0.9513274336283186

In [62]:
confusion_matrix(y_test, y_pred)

array([[105,   8],
       [  3, 110]], dtype=int64)

In [63]:
# Label the confusion matrix: rows are the true classes, columns the predicted ones
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm,
    index=['actual nature', 'actual history'],
    columns=['predicted nature', 'predicted history'],
)
cm_df

Unnamed: 0,predicted nature,predicted history
actual nature,105,8
actual history,3,110


The results show that NB model is very accurate in optimizing between the false positives and false negatives for our dataset