### Problem Statement

As a data scientist working on the next presidential campaign I need to get an understanding of what the voting base cares whether they are Republican or Democrat so that we can build a successful and flexible campaign. In this exercise I will collect user posts from Democrat and Republican subreddits and utilize NLP to identify what topics are top of mind and how each voter base is feeling about these topics utlizing Sentiment Analysis.

In [1]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import plot_confusion_matrix, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
%store -r demdata
%store -r repdata

In [3]:
data= pd.concat([repdata, demdata])

In [4]:
#Encode the target variable as numerical. 1 for Republican 0 for Dem
data['subreddit']= data['subreddit'].map({'Republican': 1, 'democrats':0})

In [5]:
data['title'] = data['title'].str.lower()

In [6]:
data.head()

Unnamed: 0,subreddit,title,author,num_comments,title_word_count
0,1,'extremely concerning': elon musk on child por...,BroSteveWinter,1,10
3,1,florida gov. desantis says majority of post-hu...,tbburns2017,1,12
4,1,louden county virginia man arrested on child s...,tbburns2017,1,13
6,1,twitter shares soar on report elon musk agrees...,PinkClouds20,1,13
7,1,"hollywood turns on kamala harris, ends vp’s po...",Patriots-United,1,12


In [7]:
data.isnull().sum()

subreddit           0
title               0
author              0
num_comments        0
title_word_count    0
dtype: int64

In [8]:
#Define X and y
X= data['title']
y= data['subreddit']

#train test split
X_train, X_test, y_train, y_test= train_test_split(X, y, random_state= 42, stratify= y)

In [30]:
y_train.value_counts(normalize = True)

1    0.550396
0    0.449604
Name: subreddit, dtype: float64

In [31]:
y_test.value_counts(normalize = True)

1    0.550633
0    0.449367
Name: subreddit, dtype: float64

In [9]:
#Check distribution of target variable on train and test sets
print(f'Train:\n',y_train.value_counts(normalize=True))
print(f'Test:\n', y_test.value_counts(normalize=True))

Train:
 1    0.550396
0    0.449604
Name: subreddit, dtype: float64
Test:
 1    0.550633
0    0.449367
Name: subreddit, dtype: float64


### Logistic Regression

In [10]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression())
])

pipe_params = {
    'cvec__max_features': [2000, 3000, 4000],
    'cvec__stop_words': [None, 'english'],
    'cvec__min_df': [1, 2, 4],
    'cvec__max_df': [1.0, .5],
    'lr__C': [1.0, 0.1],
    'lr__penalty': ['l2', 'none']
}

gs = GridSearchCV(pipe,
                 param_grid=pipe_params,
                 n_jobs = -1)

In [11]:
 gs.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('lr', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'cvec__max_df': [1.0, 0.5],
                         'cvec__max_features': [2000, 3000, 4000],
                         'cvec__min_df': [1, 2, 4],
                         'cvec__stop_words': [None, 'english'],
                         'lr__C': [1.0, 0.1], 'lr__penalty': ['l2', 'none']})

In [12]:
print(gs.best_score_)
gs.best_params_

0.7160949868073879


{'cvec__max_df': 1.0,
 'cvec__max_features': 4000,
 'cvec__min_df': 2,
 'cvec__stop_words': 'english',
 'lr__C': 1.0,
 'lr__penalty': 'l2'}

In [13]:
pred = gs.predict(X_test)

In [14]:
gs.score(X_train, y_train)

0.9197889182058048

In [15]:
gs.score(X_test, y_test)

0.7162447257383966

### Random Forest + Gridsearch

In [16]:
# #Instantiate the Count Vectorizer
cvec= CountVectorizer(stop_words = 'english')

X_train_rf = X_train
X_test_rf = X_test

# #Fit the model
cvec.fit(X_train_rf)

X_train_rf= cvec.transform(X_train_rf)

# #Transform the test set
X_test= cvec.transform(X_test_rf)

# #Visualize the train data
#train_df = pd.DataFrame(X_train.todense(), columns = cvec.get_feature_names_out())

# train_df.head()

In [17]:
rf= RandomForestClassifier()
rf_params = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 1, 5],
    'max_features' : [1, 3, 5],
    'n_jobs': [-1]
}

gs = GridSearchCV(rf, param_grid= rf_params, cv = 3, n_jobs = -1)

gs.fit(X_train_rf, y_train)
print(gs.best_score_)
gs.best_params_

0.7257695690413368


{'max_depth': None, 'max_features': 5, 'n_estimators': 200, 'n_jobs': -1}

In [18]:
gs.score(X_train_rf, y_train)

0.9980650835532102

In [19]:
gs.score(X_test, y_test)

0.7399789029535865

## Boosting Model

In [20]:
#Ada Boost
ada= AdaBoostClassifier(base_estimator = DecisionTreeClassifier())

ada_params= {
    'n_estimators': [50, 100, 150],
    'base_estimator__max_depth':[1,2],
    'learning_rate':[0.6, 1.0]
}

gs = GridSearchCV(ada, param_grid = ada_params, cv = 3)
gs.fit(X_train_rf, y_train)

GridSearchCV(cv=3,
             estimator=AdaBoostClassifier(base_estimator=DecisionTreeClassifier()),
             param_grid={'base_estimator__max_depth': [1, 2],
                         'learning_rate': [0.6, 1.0],
                         'n_estimators': [50, 100, 150]})

In [21]:
gs.score(X_train_rf, y_train)

0.7759014951627089

In [22]:
gs.score(X_test, y_test)

0.6877637130801688

In [23]:
#Graident Boost
gboost= GradientBoostingClassifier()

gboost_params = {
    'max_depth': [2, 3, 4],
    'n_estimators': [100, 125, 150],
    'learning_rate': [0.8, 1.0, 0.1]
}

gb_gs = GridSearchCV(gboost, param_grid= gboost_params, cv =3)
gb_gs.fit(X_train_rf, y_train)
print(gb_gs.best_score_)
gb_gs.score(X_test, y_test)

0.6863676341248901


0.6751054852320675

In [24]:
#Stacking Models

In [25]:
level1_estimators = [
    ('random_forest', RandomForestClassifier()),
    ('lr', LogisticRegression()),
    ('boost', AdaBoostClassifier())
]

stacked_model = StackingClassifier(estimators=level1_estimators,
                                 final_estimator = LogisticRegression())

In [26]:
# Fit
stacked_model.fit(X_train_rf, y_train)

StackingClassifier(estimators=[('random_forest', RandomForestClassifier()),
                               ('lr', LogisticRegression()),
                               ('boost', AdaBoostClassifier())],
                   final_estimator=LogisticRegression())

In [27]:
# Train score
stacked_model.score(X_train_rf, y_train)

0.9919085312225154

In [28]:
# Test score
stacked_model.score(X_test, y_test)

0.7331223628691983