### Problem Statement

As a data scientist working on the next presidential campaign I need to get an understanding of what the voting base cares whether they are Republican or Democrat so that we can build a successful and flexible campaign. In this exercise I will collect user posts from Democrat and Republican subreddits and utilize NLP to identify what topics are top of mind and how each voter base is feeling about these topics utlizing Sentiment Analysis.

In [3]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import plot_confusion_matrix, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier

In [1]:
%store -r demdata
%store -r repdata

In [4]:
data= pd.concat([repdata, demdata])

In [5]:
#Encode the target variable as numerical. 1 for Republican 0 for Dem
data['subreddit']= data['subreddit'].map({'Republican': 1, 'democrats':0})

In [6]:
data['title'] = data['title'].str.lower()

In [7]:
#Define X and y
X= data['title']
y= data['subreddit']

#train test split
X_train, X_test, y_train, y_test= train_test_split(X, y, random_state= 42, stratify= y)

In [8]:
#Check distribution of target variable on train and test sets
print(f'Train:\n',y_train.value_counts(normalize=True))
print(f'Test:\n', y_test.value_counts(normalize=True))

Train:
 1    0.500233
0    0.499767
Name: subreddit, dtype: float64
Test:
 1    0.5003
0    0.4997
Name: subreddit, dtype: float64


In [9]:
data.isnull().sum()

subreddit     0
title         0
selftext     10
author        0
dtype: int64

### Logistic Regression

In [13]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression())
])

pipe_params = {
    'cvec__max_features': [2000, 3000, 4000],
    'cvec__stop_words': [None, 'english'],
    'cvec__min_df': [1, 2, 4],
    'cvec__max_df': [1.0, .5],
    'lr__C': [1.0, 0.1],
    'lr__penalty': ['l2', 'none']
}

gs = GridSearchCV(pipe,
                 param_grid=pipe_params,
                 n_jobs = -1)

In [14]:
 gs.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('lr', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'cvec__max_df': [1.0, 0.5],
                         'cvec__max_features': [2000, 3000, 4000],
                         'cvec__min_df': [1, 2, 4],
                         'cvec__stop_words': [None, 'english'],
                         'lr__C': [1.0, 0.1], 'lr__penalty': ['l2', 'none']})

In [15]:
print(gs.best_score_)
gs.best_params_

0.6658666297705195


{'cvec__max_df': 1.0,
 'cvec__max_features': 4000,
 'cvec__min_df': 2,
 'cvec__stop_words': 'english',
 'lr__C': 0.1,
 'lr__penalty': 'l2'}

In [16]:
pred = gs.predict(X_test)

In [17]:
gs.score(X_train, y_train)

0.7477152958441732

In [18]:
gs.score(X_test, y_test)

0.6641985191114669

### Random Forest + Gridsearch

In [19]:
# #Instantiate the Count Vectorizer
cvec= CountVectorizer(stop_words = 'english')

X_train_rf = X_train
X_test_rf = X_test

# #Fit the model
cvec.fit(X_train_rf)

X_train_rf= cvec.transform(X_train_rf)

# #Transform the test set
X_test= cvec.transform(X_test_rf)

# #Visualize the train data
#train_df = pd.DataFrame(X_train.todense(), columns = cvec.get_feature_names_out())

# train_df.head()

In [20]:
rf= RandomForestClassifier()
rf_params = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 1, 5],
    'max_features' : [1, 3, 5],
    'n_jobs': [-1]
}

gs = GridSearchCV(rf, param_grid= rf_params, cv = 3, n_jobs = -1)

gs.fit(X_train_rf, y_train)
print(gs.best_score_)
gs.best_params_

0.6631979187512508


{'max_depth': None, 'max_features': 5, 'n_estimators': 150, 'n_jobs': -1}

In [21]:
gs.score(X_train_rf, y_train)

0.9869254886265092

In [22]:
gs.score(X_test, y_test)

0.6712027216329798

## EDA

In [28]:
cvec2 = CountVectorizer(stop_words = 'english')

vec_data = cvec2.fit_transform(data['title'])

#Convert to DataFrame 
vec_data = pd.DataFrame(vec_data.toarray(), columns= cvec2.get_feature_names())

#Concat with original DataFrame
vec_data = pd.concat([data, vec_data])



In [29]:
vec_data[vec_data['subreddit'] == 0].sum().sort_values(ascending= False).head(30)

  vec_data[vec_data['subreddit'] == 0].sum().sort_values(ascending= False).head(30)


trump          1132.0
biden           850.0
democrats       350.0
gop             342.0
republicans     318.0
fbi             317.0
new             272.0
says            271.0
house           236.0
president       218.0
election        211.0
joe             205.0
abortion        205.0
republican      204.0
lago            191.0
mar             191.0
senate          185.0
court           171.0
desantis        162.0
poll            148.0
just            145.0
state           143.0
white           143.0
people          139.0
america         135.0
raid            133.0
video           133.0
florida         131.0
news            129.0
inflation       121.0
dtype: float64

In [30]:
vec_data[vec_data['subreddit'] == 1].sum().sort_values(ascending= False).head(30)

  vec_data[vec_data['subreddit'] == 1].sum().sort_values(ascending= False).head(30)


subreddit               3255.0
political                  1.0
gov                        1.0
politics                   1.0
receiving                  1.0
democratic                 1.0
immigration                1.0
approve                    1.0
blockloanforgiveness       1.0
biden                      1.0
loan                       1.0
greg                       1.0
karma                      1.0
kicked                     1.0
plan                       1.0
joke                       1.0
programmed                 1.0
new                        1.0
humor                      1.0
little                     1.0
help                       1.0
abbott                     1.0
piss                       1.0
proof                      1.0
stop                       1.0
birds                      1.0
response                   1.0
student                    1.0
protest                    1.0
make                       1.0
dtype: float64

In [31]:
vec_data[vec_data['subreddit'] == 1]

Unnamed: 0,subreddit,title,selftext,000,02,03,06,07,08,09,...,влиянии,на,немного,психику,родителей,сдвг,хабр,司馬南版,敦促蔡英文及其軍政首腦投降書,蔡英文
0,1,report: elite colorado resorts panic as migran...,,,,,,,,,...,,,,,,,,,,
1,1,dow ends nearly 460 points lower as nasdaq plu...,,,,,,,,,...,,,,,,,,,,
2,1,late gop rep jackie walorski’s brother respond...,,,,,,,,,...,,,,,,,,,,
3,1,the ‘sticky’ high prices unlikely to come down...,,,,,,,,,...,,,,,,,,,,
4,1,‘apparent sabotage’: nordic states probe russi...,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325,1,0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1183,1,0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1292,1,0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2834,1,0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
tempdf = pd.DataFrame(X_train_rf.toarray(), columns= cvec.get_feature_names())



In [34]:
tempdf[tempdf['subreddit']== 1].sum().sort_values(ascending = False)

subreddit    3
response     1
stop         1
biden        1
plan         1
            ..
failure      0
fails        0
failing      0
failed       0
蔡英文          0
Length: 8596, dtype: int64

##### Republican Posts

In [39]:
rep_vec = CountVectorizer(stop_words = 'english')

repdata['title'] = repdata['title'].str.lower()

X_rep= repdata['title']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repdata['title'] = repdata['title'].str.lower()


In [40]:
X_rep= rep_vec.fit_transform(X_rep)

repdf = pd.DataFrame(X_rep.todense(), columns = rep_vec.get_feature_names_out())

In [44]:
repdf.sum().sort_values(ascending = False).head(50)

trump           504
biden           489
fbi             247
new             145
republicans     134
says            125
joe             111
raid            106
president       105
republican      100
gop             100
election         99
video            98
democrats        97
desantis         96
just             87
lago             86
mar              86
house            85
inflation        80
people           79
hunter           77
america          74
white            69
florida          68
poll             67
news             66
doj              65
democrat         65
report           64
state            64
americans        60
red              58
covid            58
like             57
2024             57
media            53
american         53
watch            50
donald           49
senate           48
make             47
claims           47
gov              47
court            46
conservative     46
maga             46
amp              45
police           45
speech           45


### Dem Posts

In [45]:
dem_vec = CountVectorizer(stop_words = 'english')

demdata['title'] = demdata['title'].str.lower()

X_dem= demdata['title']

X_dem= dem_vec.fit_transform(X_dem)

demdf = pd.DataFrame(X_dem.todense(), columns = dem_vec.get_feature_names_out())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demdata['title'] = demdata['title'].str.lower()


In [46]:
demdf.sum().sort_values(ascending = False).head(50)

trump          628
biden          362
democrats      253
gop            242
republicans    184
abortion       179
house          151
says           146
senate         137
new            128
court          125
president      113
election       112
lago           105
mar            105
republican     104
joe             94
judge           86
supreme         84
texas           83
poll            81
state           79
vote            76
white           74
fbi             70
donald          67
desantis        66
manchin         66
news            63
documents       63
florida         63
right           63
america         61
democratic      61
special         61
people          60
rights          59
climate         59
roe             58
just            58
party           56
race            56
act             55
voters          55
jan             55
help            55
say             52
2022            50
maga            50
doj             50
dtype: int64