# Reddit Flair Detection

**In this project, I have used the Python Reddit API wrapper (PRAW) to scrape data from the Reddit India subreddit. This is a summary of my work to pre-process, train and classify the data obtained for each post:**
1. **Pre-processing for labels and date-time variables**
2. **Natural Language Processing for text data**
3. **Test-train split of data to check various classifiers**
4. **Comparing various classifiers and their parameters to get the best accuracy**

In [None]:
# import sys
# sys.executable
! pip install praw
! pip install plotly
! pip install --upgrade sklearn
# !{sys.executable} -m pip install praw
# ! conda install -n curr_env scipy

In [1]:
#Importing the necessary Python libraries 
import praw
import pandas as pd
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objs as po
import numpy as np
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import datetime

### Creating a Reddit instance and a dictionary to store data

In [2]:
 reddit=praw.Reddit(client_id="xaMlfF_8S0xOlw",client_secret="Hk1K5nCd5v7Hvg9TRkMZo-CELYE",user_agent="R_WebScraping")

# Dictionary to store post data 
reddict = {'title': [], 'selftext': [], 'time' : [], 'upvote_ratio':[],'flair':[]}
posts=reddit.subreddit('india').hot(limit=5000)

### Obtaining the neccessary information for each post

* **selftext : The body text of the post**
* **title : The title of the post**
* **id : The id assigned to each post**
* **num_comments : The number of comments for the post**
* **score : The number of upvotes for the post**
* **upvote_ratio : The percentage of upvotes against all votes for the post**
* **author : The author of the post**
* **comments : The comment text for each post**
* **time : The UTC time of creation of the post**
* **flair : The Flair assigned to the post - This is our target variable**

In [3]:
def get_time(submission):
    time=submission.created
    return time

for post in posts:
    reddict['selftext'].append(post.selftext)
    reddict['title'].append(post.title)
#     reddict['id'].append(post.id)
#     reddict['num_comments'].append(post.num_comments)
#     reddict['score'].append(post.score)
#     reddict['ups'].append(post.ups)
#     reddict['downs'].append(post.downs)
    reddict['upvote_ratio'].append(post.upvote_ratio)
#     reddict['author'].append(post.author)
#     reddict['comments'].append(post.comments)
    reddict['time'].append(get_time(post))
    reddict['flair'].append(post.link_flair_text)

### Saving the data obtained in a CSV file

In [4]:
reddit_df = pd.DataFrame(reddict)

csv = f'indiaReddit.csv'
with open(csv,'w',newline='',encoding='utf-8') as f_out:
    reddit_df.to_csv(csv, index = False, header=True)
 

In [5]:
# from praw.models import MoreComments

# def comments(text):
#     comments_list = []
#     for top_level_comment in text:
#         if isinstance(top_level_comment, MoreComments):
#             continue
#         comments_list.append(top_level_comment.body)
#     return comments_list
    
# reddit_df["comments_body"] = reddit_df["comments"].apply(lambda x: comments(x))
# print(reddit_df["comments_body"][2])

### Some exploratory data analysis

* **A distribution of the unique flairs obtained from the data**

In [6]:
# reddit_df = pd.read_csv("indiaReddit.csv")  

#Basic data informaton
print(reddit_df.describe)

unique, counts = np.unique(list(reddit_df['flair']), return_counts=True)
x=unique
y=counts
plt.bar(x,y)
plt.show()


<bound method NDFrame.describe of                                                  title  \
0    Coronavirus (COVID-19) Megathread - News and U...   
1    Announcing r/IndiaMeme, our own sub for memes ...   
2                         1937 advertisement for India   
3    Indore: Porsche Driver Sit-Ups As Punishment F...   
4    Covid-19 cess, 40% tax for rich - IRS officers...   
5    India's National Board for Wildlife Is a Big T...   
6    Arnab Goswami tells SC all parties in Palghar ...   
7    India can’t conquer COVID-19 without aggressiv...   
8    Jharkhand Police swiftly act to book communal ...   
9                                        PM Cares Fund   
10   My friend went to a barber 10 days back, barbe...   
11   Reliance hired lobbyist Brian Ballard, a close...   
12   How the Indian Railways is reinventing itself ...   
13   How come shops are allowed to sell non-essenti...   
14        Experimenting things. Rainy day at my place.   
15   24 in Vijayawada contract virus a

TypeError: '<' not supported between instances of 'str' and 'NoneType'

### Formatting the DateTime object to get the day of week, month, year of the post

In [7]:
# reddit_df['dateTime'] = pd.to_datetime(reddit_df['time'], 
#  format = '%Y-%m-%dT%H:%M:%SZ', 
#  errors = 'coerce')

def get_time(submission):
    time=submission.created
    return datetime.datetime.fromtimestamp(time).strftime("%H")
reddit_df['month'] = reddit_df['time'].apply(lambda x: datetime.datetime.fromtimestamp(x).strftime("%m"))
reddit_df['month'] = reddit_df['month'].astype(int)
reddit_df['hour'] = reddit_df['time'].apply(lambda x: datetime.datetime.fromtimestamp(x).strftime("%H"))
reddit_df['hour'] = reddit_df['hour'].astype(int)
reddit_df['year'] = reddit_df['time'].apply(lambda x: datetime.datetime.fromtimestamp(x).strftime("%Y"))
reddit_df['year'] = reddit_df['year'].astype(int)

### Using Natural Language Processing for the text data after combining selftext and title

In [8]:
import string
string.punctuation

string.punctuation = string.punctuation+''
def noPunc(text):
    textNoPunc = "".join([char for char in text if char not in string.punctuation])
    return textNoPunc

reddit_df["title_selftext"] = reddit_df["title"].astype(str) + reddit_df["selftext"].astype(str)
reddit_df["title_selftext_noPunc"] = reddit_df["title_selftext"].apply(lambda x: noPunc(x))


#Tokenize using regex
import re

reddit_df["title_selftext_tokenized2"] = reddit_df["title_selftext_noPunc"].apply(lambda x: re.split('\W+', x.lower()))

#Remove stopwords
import nltk
nltk.download('stopwords')

def noStopWords(text):
    textNoStops = [word.lower() for word in text if word not in nltk.corpus.stopwords.words('english')]
    return textNoStops

reddit_df["title_selftext_noStops"] = reddit_df["title_selftext_tokenized2"].apply(lambda x : noStopWords(x))

#Lemmatizer instead of stemmer
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer
wn = WordNetLemmatizer()

def lemmatizer(text):
    lemmatized = [wn.lemmatize(word) for word in text]
    return lemmatized

reddit_df["title_selftext_lemmatized"] = reddit_df["title_selftext_noStops"].apply(lambda x: lemmatizer(x))


oneList = []
corpusList = []
def corpusCreate(text):
    oneList.append(text)
    for word in text:
        corpusList.append(word)
       
def createMappingMatrix(corpus, list_check, dictCorpus, lenCorpus):
    count = 0
    sampleData = [0]*lenCorpus
    for word in list_check:
        if word in corpus:
            sampleData[dictCorpus[word]] = 1
        return sampleData    


from collections import Counter
reddit_df["title_selftext_lemmatized"].apply(lambda x: corpusCreate(x))
countOfWords = dict(Counter(corpusList))
countOfWords = {v: k for k, v in countOfWords.items()}
revc = list(sorted(countOfWords.keys(), reverse = True))
corpus_sorted = [countOfWords[i] for i in revc if i>5]
dict_corpus_sorted = {word:i for i,word in enumerate(corpus_sorted)}
# print(dict_corpus_sorted["video"])
print(corpus_sorted)

modelData = []
for i in range(len(reddit_df)):
    modelData.append(createMappingMatrix(corpus_sorted, reddit_df.loc[i,"title_selftext_lemmatized"] , dict_corpus_sorted, len(corpus_sorted)))
print(len(corpus_sorted[0]))

modeldf = pd.DataFrame(modelData, columns = [term for term in corpus_sorted])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aditikulkarni/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aditikulkarni/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
['india', 'covid19', 'lockdown', 'time', 'indian', 'people', 'like', 'would', 'government', 'get', 'news', 'one', 'also', 'day', 'help', 'go', 'know', 'need', 'dont', 'think', 'say', 'country', 'month', 'im', 'since', 'good', 'back', 'year', 'delhi', 'could', 'still', 'health', 'guy', 'police', 'problem', 'service', 'much', 'part', 'caste', 'pandemic', 'end', 'last', 'school', 'anyone', 'better', 'open', 'he', 'girl', 'business', 'high', 'ncrb', 'sure', 'china', 'trai', 'neutrality', 'motivational', 'card', 'bangladesh', 'rfp', 'liberation', 'strong']
5


### Removing NaN and null values from data

In [19]:
# reddit_df['ups'] = reddit_df['ups'].replace(np.nan, 0, regex=True)
# modeldf['ups'] = reddit_df['ups']
# reddit_df['downs'] = reddit_df['downs'].replace(np.nan, 0, regex=True)
# modeldf['downs'] = reddit_df['downs']
# reddit_df['num_comments'] = reddit_df['num_comments'].replace(np.nan, 0, regex=True)
# modeldf['num_comments'] = reddit_df['num_comments']
# reddit_df['score'] = reddit_df['score'].replace(np.nan, 0, regex=True)
# modeldf['score'] = reddit_df['score']
# reddit_df['upvote_ratio'] = reddit_df['upvote_ratio'].replace(np.nan, 0, regex=True)
# modeldf['upvote_ratio'] = reddit_df['upvote_ratio']
modeldf['month'] = reddit_df['month'] 

modeldf['hour'] = reddit_df['hour']
modeldf['year'] = reddit_df['year']

### LabelEncoder for the target variable

In [20]:

encoder = LabelEncoder()
reddit_df['flair'] = reddit_df['flair'].replace(np.nan, 'NaN', regex=True)
reddit_df['flair_vec'] = encoder.fit_transform(reddit_df['flair'])

### Test-Train Splt

In [23]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(modeldf,reddit_df['flair_vec'],test_size=0.2)

### RandomForestClassifier

In [None]:
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

rf = RandomForestClassifier()
param = {'n_estimators' : [10, 150, 300,700, 1000, 2000], 'max_depth' : [30, 60, 90, None], 
         'min_samples_split' : [2, 5, 10], 'min_samples_leaf' : [1, 2, 4], 'bootstrap' : [True, False]}
model_rf = GridSearchCV(rf, param, n_jobs = -1)

# model_rf.fit(train_x,train_y)

scores = cross_val_score(model_rf, train_x, train_y, cv=5)
print(scores)
print(scores.mean(), scores.std() * 2)
# predict the target on the train dataset
predict_train_rf = model_rf.predict(train_x)

# Accuray Score on train dataset
accuracy_train_rf = accuracy_score(train_y,predict_train_rf)
print('accuracy_score on train dataset RandomForestClassifier: ', accuracy_train_rf)

# predict the target on the test dataset
predict_test_rf = model_rf.predict(test_x)

# Accuray Score on test dataset
accuracy_test_rf = accuracy_score(test_y,predict_test_rf)
print('accuracy_score on test dataset RandomForestClassifier: ', accuracy_test_rf)


The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=5.



### Gaussian Naive Bayes Classifier

In [18]:
# from sklearn.naive_bayes import MultinomialNB

# model_mnb = MultinomialNB()
# predict_train = model_mnb.fit(train_x, train_y)

# # predict the target on the train dataset
# predict_train = model_mnb.predict(train_x)

# # Accuray Score on train dataset
# accuracy_train = accuracy_score(train_y,predict_train)
# print('accuracy_score on train dataset KNNeighbors: ', accuracy_train)

# # predict the target on the test dataset
# predict_test = model_mnb.predict(test_x)

# # Accuray Score on test dataset
# accuracy_test = accuracy_score(test_y,predict_test)
# print('accuracy_score on test dataset KNNeighbors: ', accuracy_test)

accuracy_score on train dataset KNNeighbors:  0.3812636165577342
accuracy_score on test dataset KNNeighbors:  0.3322475570032573
