In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel

In [12]:
df = pd.read_csv('../data/data_modelingV2.csv')

## Train/Test split 

In [13]:
X_major = df.drop(columns='sub_class')
y_major = df['sub_class']
X_train, X_val, y_train, y_val = train_test_split(X_major, y_major, random_state=42, stratify=y_major)

## Model: Random Forest

In [14]:
features_to_drop = ['body', 'score', 'id', 'word_count', 'count_char',
       'word_count_sqrt', 'count_char_sqrt', 'neg', 'neu', 'pos']

X_train_rf = X_train.drop(columns=features_to_drop)
X_val_rf = X_val.drop(columns=features_to_drop)

y_train_rf = y_train
y_val_rf = y_val

### Baseline Accuracy

In [15]:
df['sub_class'].value_counts(normalize=True)

1    0.543779
0    0.456221
Name: sub_class, dtype: float64

In [16]:
df.columns

Index(['body', 'sub_class', 'score', 'id', 'word_count', 'count_char',
       'word_count_sqrt', 'count_char_sqrt', 'abandon', 'abide',
       ...
       'zealand', 'zero', 'zoomers', 'zuckerberg', 'đây', 'đế', 'đồng', 'neg',
       'neu', 'compound'],
      dtype='object', length=4630)

### Model implementation

In [17]:
X_train_rf.columns

Index(['abandon', 'abide', 'ability', 'able', 'abroad', 'absence', 'absolute',
       'absolutely', 'abstract', 'absurd',
       ...
       'youtube', 'yugoslavia', 'zealand', 'zero', 'zoomers', 'zuckerberg',
       'đây', 'đế', 'đồng', 'compound'],
      dtype='object', length=4619)

In [18]:
rf = RandomForestClassifier()

In [19]:
rf_params = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 1, 2, 3, 4, 5],
}
gs = GridSearchCV(rf, param_grid=rf_params, cv=5)
gs.fit(X_train_rf, y_train_rf)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [None, 1, 2, 3, 4, 5],
                         'n_estimators': [100, 150, 200]})

In [20]:
gs.score(X_train_rf, y_train_rf)

0.9849726775956285

In [21]:
gs.score(X_val_rf, y_val_rf)

0.7116564417177914

In [22]:
gs.set_params

<bound method BaseEstimator.set_params of GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [None, 1, 2, 3, 4, 5],
                         'n_estimators': [100, 150, 200]})>

## Model: Logisitic Regression on numerical data

In [23]:
features = ['score', 'word_count', 'count_char',
            'word_count_sqrt', 'count_char_sqrt',
            'neg','neu', 'pos', 'compound' ]

X_train_logreg = X_train[features]
X_val_logreg = X_val[features]

y_train_logreg = y_train
y_val_logreg = y_val

In [24]:
X_train_logreg.dtypes

score                int64
word_count           int64
count_char           int64
word_count_sqrt    float64
count_char_sqrt    float64
neg                float64
neu                float64
pos                  int64
compound           float64
dtype: object

In [25]:
X_train_logreg.isna().sum()

score              0
word_count         0
count_char         0
word_count_sqrt    0
count_char_sqrt    0
neg                0
neu                0
pos                0
compound           0
dtype: int64

In [26]:
params= {
    "C":[0.001,0.01,0.1,1,10,100],
    'solver':['newton-cg', 'lbfgs', 'sag'], 
    "penalty":['l2']
}
logreg = LogisticRegression( max_iter=8_000)
logreg_cv = GridSearchCV(logreg,params,cv=5)
logreg_cv.fit(X_train_logreg,y_train_logreg)



GridSearchCV(cv=5, estimator=LogisticRegression(max_iter=8000),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l2'],
                         'solver': ['newton-cg', 'lbfgs', 'sag']})

In [27]:
logreg_cv.best_params_

{'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}

In [28]:
logreg_cv.score(X_train_logreg, y_train_logreg)

0.671448087431694

In [29]:
logreg_cv.score(X_val_logreg, y_val_logreg)

0.6523517382413088

## Model: Logisitic Regression on vectorized text

In [30]:
X_train_logreg2 = X_train_rf
X_val_logreg2 = X_val_rf 

y_train_logreg2 = y_train_rf 
y_val_logreg2 = y_val_rf 

In [31]:
params= {
    "C":[0.001,0.01,0.1,1,10,100],
    'solver':['newton-cg'], 
    "penalty":['l2']
}
logreg2 = LogisticRegression(max_iter=4_000)
logreg2_cv = GridSearchCV(logreg2,params,cv=3)
logreg2_cv.fit(X_train_logreg2 ,y_train_logreg2)

GridSearchCV(cv=3, estimator=LogisticRegression(max_iter=4000),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l2'],
                         'solver': ['newton-cg']})

In [32]:
logreg2_cv.score(X_train_logreg2, y_train_logreg2)

0.907103825136612

In [33]:
logreg2_cv.score(X_val_logreg2, y_val_logreg2)

0.7382413087934561

In [34]:
logreg2_cv.best_params_

{'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}

## Model: SVM on vectorized text data

In [35]:
X_train_svm = X_train_rf
X_val_svm = X_val_rf 

y_train_svm = y_train_rf 
y_val_svm = y_val_rf 

In [36]:
svm = SVC(C=1.5)
svm.fit(X_train_svm, y_train_svm)

SVC(C=1.5)

In [37]:
svm.score(X_train_svm, y_train_svm)

0.9241803278688525

In [38]:
svm.score(X_val_svm, y_val_svm)

0.7157464212678937

## Model: SVM on vectorized text data + Feature Selection [DINT]

In [39]:
svm_f = Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, max_iter=4_000))),
  ('classification', SVC())
])
svm_f.fit(X_train_svm, y_train_svm)

Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False, max_iter=4000,
                                                     penalty='l1'))),
                ('classification', SVC())])

In [40]:
svm_f.score(X_train_svm, y_train_svm)

0.9269125683060109

In [41]:
svm_f.score(X_val_svm, y_val_svm)

0.6932515337423313

## Model: Randorm Forest vectorized text + VADER

In [72]:
features_to_drop = ['body', 'id', 'word_count', 'count_char']

X_train_rfc = X_train.drop(columns=features_to_drop)
X_val_rfc = X_val.drop(columns=features_to_drop)

y_train_rfc = y_train
y_val_rfc = y_val

In [77]:
rfc = RandomForestClassifier()
rf_params = {
    'n_estimators':[250, 300, 350, 400],
    'max_depth': [None, 1, 2, 3, 4, 5],
    'max_features': ['auto', 'sqrt', 'log2']
}
gs = GridSearchCV(rfc, param_grid=rf_params, cv=5)
gs.fit(X_train_rfc, y_train_rfc)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [None, 1, 2, 3, 4, 5],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [250, 300, 350, 400]})

In [78]:
gs.score(X_train_rfc, y_train_rfc)

0.9979508196721312

In [79]:
gs.score(X_val_rfc, y_val_rfc)

0.7668711656441718

In [81]:
gs.best_estimator_

RandomForestClassifier(max_features='log2', n_estimators=300)

## Model: SVM on vectorized text data + VADER 

In [47]:
features_to_drop = ['body', 'id', 'word_count', 'count_char']

X_train_svm_vader = X_train.drop(columns=features_to_drop)
X_val_svm_vader = X_val.drop(columns=features_to_drop)

y_train_svm_vader = y_train
y_val_svm_vader = y_val

In [61]:
svm_v = SVC(C=2)
svm_v.fit(X_train_svm_vader, y_train_svm_vader)

SVC(C=2)

In [62]:
svm_v.score(X_train_svm_vader, y_train_svm_vader)

0.7206284153005464

In [63]:
svm_v.score(X_val_svm_vader, y_val_svm_vader)

0.6666666666666666

## Model: Logistic regression on vectorized text data + VADER

In [65]:
features_to_drop = ['body', 'id', 'word_count', 'count_char']
X_train_logreg2_vader = X_train.drop(columns=features_to_drop)
X_val_logreg2_vader = X_val.drop(columns=features_to_drop) 

y_train_logreg2_vader = y_train 
y_val_logreg2_vader = y_val 

In [69]:
params_v= {
    "C":[0.00001,0.0001,0.001,0.01,0.1,1,10,100],
    'solver':['newton-cg'], 
    "penalty":['l2']
}
logreg2_vader = LogisticRegression(max_iter=4_000)
logreg2_cv_vader = GridSearchCV(logreg2_vader,params_v,cv=3)
logreg2_cv_vader.fit(X_train_logreg2_vader ,y_train_logreg2_vader)

GridSearchCV(cv=3, estimator=LogisticRegression(max_iter=4000),
             param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                         'penalty': ['l2'], 'solver': ['newton-cg']})

In [70]:
logreg2_cv_vader.score(X_train_logreg2_vader, y_train_logreg2_vader)

0.9098360655737705

In [71]:
logreg2_cv_vader.score(X_val_logreg2_vader, y_val_logreg2_vader)

0.7525562372188139

In [83]:
logreg2_cv_vader.best_params_

{'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}

## Ensemble model Implementation

In [90]:
# Reference for Voting Classifier taken from 
# Lesson 6.04 

vote = VotingClassifier(estimators=[
        ('logreg', LogisticRegression(max_iter=4_000)), ('rf_vader', RandomForestClassifier())], voting='soft')

vote_params = {
    'rf_vader__max_features':['log2'], 
    'rf_vader__n_estimators':[300],
    'logreg__C':[0.1],
    'logreg__penalty': ['l2'],
    'logreg__solver': ['newton-cg']
    
}
gs = GridSearchCV(vote, param_grid=vote_params, cv=3)

gs.fit(X_train_logreg2_vader, y_train_logreg2_vader)

GridSearchCV(cv=3,
             estimator=VotingClassifier(estimators=[('logreg',
                                                     LogisticRegression(max_iter=4000)),
                                                    ('rf_vader',
                                                     RandomForestClassifier())],
                                        voting='soft'),
             param_grid={'logreg__C': [0.1], 'logreg__penalty': ['l2'],
                         'logreg__solver': ['newton-cg'],
                         'rf_vader__max_features': ['log2'],
                         'rf_vader__n_estimators': [300]})

In [91]:
gs.score(X_train_logreg2_vader, y_train_logreg2_vader)

0.9938524590163934

In [92]:
gs.score(X_val_logreg2_vader, y_val_logreg2_vader)

0.7689161554192229