In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel

In [2]:
df = pd.read_csv('../data/data_modelingV3.csv')

## Train/Test split 

In [3]:
X_major = df.drop(columns='sub_class')
y_major = df['sub_class']
X_train, X_val, y_train, y_val = train_test_split(X_major, y_major, random_state=42, stratify=y_major)

## Model: Random Forest

In [4]:
features_to_drop = ['body', 'score', 'id', 'word_count', 'count_char',
       'word_count_sqrt', 'count_char_sqrt', 'neg', 'neu', 'pos']

X_train_rf = X_train.drop(columns=features_to_drop)
X_val_rf = X_val.drop(columns=features_to_drop)

y_train_rf = y_train
y_val_rf = y_val

### Baseline Accuracy

In [5]:
df['sub_class'].value_counts(normalize=True)

1    0.543779
0    0.456221
Name: sub_class, dtype: float64

### Model implementation

In [6]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, stratify=y)

In [7]:
rf = RandomForestClassifier()

In [8]:
rf_params = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 1, 2, 3, 4, 5],
}
gs = GridSearchCV(rf, param_grid=rf_params, cv=5)
gs.fit(X_train_rf, y_train_rf)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [None, 1, 2, 3, 4, 5],
                         'n_estimators': [100, 150, 200]})

In [9]:
gs.score(X_train_rf, y_train_rf)

0.9760928961748634

In [10]:
gs.score(X_val_rf, y_val_rf)

0.6646216768916156

In [11]:
gs.set_params

<bound method BaseEstimator.set_params of GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [None, 1, 2, 3, 4, 5],
                         'n_estimators': [100, 150, 200]})>

## Model: Logisitic Regression on numerical data

In [12]:
features = ['score', 'word_count', 'count_char',
            'word_count_sqrt', 'count_char_sqrt',
            'neg','neu', 'pos', 'compound' ]

X_train_logreg = X_train[features]
X_val_logreg = X_val[features]

y_train_logreg = y_train
y_val_logreg = y_val

In [13]:
X_train_logreg.dtypes

score                int64
word_count           int64
count_char           int64
word_count_sqrt    float64
count_char_sqrt    float64
neg                float64
neu                float64
pos                float64
compound           float64
dtype: object

In [14]:
X_train_logreg.isna().sum()

score              0
word_count         0
count_char         0
word_count_sqrt    0
count_char_sqrt    0
neg                0
neu                0
pos                0
compound           0
dtype: int64

In [15]:
params= {
    "C":[0.001,0.01,0.1,1,10,100],
    'solver':['newton-cg', 'lbfgs', 'sag'], 
    "penalty":['l2']
}
logreg = LogisticRegression( max_iter=8_000)
logreg_cv = GridSearchCV(logreg,params,cv=5)
logreg_cv.fit(X_train_logreg,y_train_logreg)

GridSearchCV(cv=5, estimator=LogisticRegression(max_iter=8000),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l2'],
                         'solver': ['newton-cg', 'lbfgs', 'sag']})

In [16]:
logreg_cv.best_params_

{'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}

In [17]:
logreg_cv.score(X_train_logreg, y_train_logreg)

0.6823770491803278

In [18]:
logreg_cv.score(X_val_logreg, y_val_logreg)

0.6421267893660532

## Model: Logisitic Regression on vectorized text

In [19]:
X_train_logreg2 = X_train_rf
X_val_logreg2 = X_val_rf 

y_train_logreg2 = y_train_rf 
y_val_logreg2 = y_val_rf 

In [20]:
params= {
    "C":[0.001,0.01,0.1,1,10,100],
    'solver':['newton-cg'], 
    "penalty":['l2']
}
logreg2 = LogisticRegression(max_iter=4_000)
logreg2_cv = GridSearchCV(logreg2,params,cv=3)
logreg2_cv.fit(X_train_logreg2 ,y_train_logreg2)

GridSearchCV(cv=3, estimator=LogisticRegression(max_iter=4000),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l2'],
                         'solver': ['newton-cg']})

In [21]:
logreg2_cv.score(X_train_logreg2, y_train_logreg2)

0.8709016393442623

In [22]:
logreg2_cv.score(X_val_logreg2, y_val_logreg2)

0.7075664621676891

In [23]:
logreg2_cv.best_params_

{'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}

## Model: SVM on vectorized text data

In [24]:
X_train_svm = X_train_rf
X_val_svm = X_val_rf 

y_train_svm = y_train_rf 
y_val_svm = y_val_rf 

In [25]:
svm = SVC(C=1.5)
svm.fit(X_train_svm, y_train_svm)

SVC(C=1.5)

In [26]:
svm.score(X_train_svm, y_train_svm)

0.9064207650273224

In [27]:
svm.score(X_val_svm, y_val_svm)

0.6789366053169734

## Model: SVM on vectorized text data + Feature Selection [DINT]

In [28]:
svm_f = Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, max_iter=4_000))),
  ('classification', SVC())
])
svm_f.fit(X_train_svm, y_train_svm)

Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False, max_iter=4000,
                                                     penalty='l1'))),
                ('classification', SVC())])

In [29]:
svm_f.score(X_train_svm, y_train_svm)

0.89275956284153

In [30]:
svm_f.score(X_val_svm, y_val_svm)

0.6687116564417178

## Model: Randorm Forest vectorized text + VADER

In [31]:
features_to_drop = ['body', 'id', 'word_count', 'count_char']

X_train_rfc = X_train.drop(columns=features_to_drop)
X_val_rfc = X_val.drop(columns=features_to_drop)

y_train_rfc = y_train
y_val_rfc = y_val

In [32]:
rfc = RandomForestClassifier()
rf_params = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 1, 2, 3, 4, 5],
}
gs = GridSearchCV(rfc, param_grid=rf_params, cv=5)
gs.fit(X_train_rfc, y_train_rfc)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [None, 1, 2, 3, 4, 5],
                         'n_estimators': [100, 150, 200]})

In [33]:
gs.score(X_train_rfc, y_train_rfc)

0.9972677595628415

In [34]:
gs.score(X_val_rfc, y_val_rfc)

0.7321063394683026