In [41]:
import operator
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import seaborn as sns
from tqdm import tqdm

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

In [42]:
training_path = Path().resolve().parents[0] / 'data' / 'clean_training_data.csv'

In [43]:
df = pd.read_csv(training_path)

In [44]:
df.columns

Index(['Unnamed: 0', 'original_text', 'label', 'text_length',
       'complexity_scores', 'token_count', 'stopword_count', 'sum_cs',
       'avg_cs'],
      dtype='object')

In [45]:
df = df.drop(['Unnamed: 0', 'original_text'], axis=1)
df = df.dropna()

# Experimental
df['stopword_ratio'] = df['stopword_count'] / df['token_count']
df_small = df.sample(frac=0.1)

## Random Forest Classifier Tuning

In [None]:
X = df_small[['text_length', 'token_count', 'stopword_count', 'sum_cs', 'avg_cs', 'stopword_ratio']]
y = df_small['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [7]:
random_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [int(x) for x in np.linspace(10, 110, 11)] + [None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

In [8]:
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(
    estimator=rf, 
    param_distributions=random_grid, 
    n_iter=20, 
    cv=3, 
    verbose=2, 
    random_state=100, 
    n_jobs=-1
)
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=20,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [50, 100, 200, 300, 400,
                                                         500]},
                   random_state=100, verbose=2)

In [9]:
rf_random.best_params_

"""
0.6408 Accuracy / 0.66 F1
{'n_estimators': 200,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 20,
 'bootstrap': True}
"""

"\n0.6408 Accuracy / 0.66 F1\n{'n_estimators': 200,\n 'min_samples_split': 10,\n 'min_samples_leaf': 4,\n 'max_features': 'sqrt',\n 'max_depth': 20,\n 'bootstrap': True}\n"

In [10]:
accuracy = accuracy_score(rf_random.predict(X_test), y_test)
print(accuracy)

0.617940999397953


In [11]:
f1 = f1_score(rf_random.predict(X_test), y_test)
print(f1)

0.643763332210621


## Tuned Random Forest Full Data

In [None]:
X = df[['text_length', 'token_count', 'stopword_count', 'sum_cs', 'avg_cs', 'stopword_ratio']]
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [12]:
params = dict(
    n_estimators=200,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features='sqrt',
    max_depth=20,
    bootstrap=True
)
rfc = RandomForestClassifier(**params).fit(X_train, y_train)

In [13]:
predictions = rfc.predict(X_test)

accuracy = accuracy_score(predictions, y_test)
print(accuracy)

f1 = f1_score(predictions, y_test)
print(f1)

0.5986754966887418
0.6139233175026063


## AdaBoost Tuning

In [27]:
X = df_small[['text_length', 'token_count', 'stopword_count', 'sum_cs', 'avg_cs', 'stopword_ratio']]
y = df_small['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [14]:
parameters = {
    'base_estimator__max_depth': [2, 5, 10],
    'base_estimator__min_samples_leaf': [5, 10, 25],
    'n_estimators': [50, 250, 500],
    'learning_rate': [0.001, 0.01, 0.1]
}

In [15]:
abc = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
abc_gridsearch = GridSearchCV(abc, parameters, cv=3, verbose=3, n_jobs=-1, scoring='f1')
abc_gridsearch.fit(X_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


GridSearchCV(cv=3,
             estimator=AdaBoostClassifier(base_estimator=DecisionTreeClassifier()),
             n_jobs=-1,
             param_grid={'base_estimator__max_depth': [2, 5, 10],
                         'base_estimator__min_samples_leaf': [5, 10, 25],
                         'learning_rate': [0.001, 0.01, 0.1],
                         'n_estimators': [50, 250, 500]},
             scoring='f1', verbose=3)

In [16]:
abc_gridsearch.best_params_

{'base_estimator__max_depth': 2,
 'base_estimator__min_samples_leaf': 5,
 'learning_rate': 0.001,
 'n_estimators': 50}

In [17]:
accuracy = accuracy_score(abc_gridsearch.predict(X_test), y_test)
print(accuracy)

0.589885611077664


In [18]:
f1 = f1_score(abc_gridsearch.predict(X_test), y_test)
print(f1)

0.6730658475715109


## Tuned AdaBoost Full Data

In [49]:
X = df[['text_length', 'token_count', 'stopword_count', 'sum_cs', 'avg_cs', 'stopword_ratio']]
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [50]:
dtc = DecisionTreeClassifier(max_depth=2, min_samples_leaf=5)
abc = AdaBoostClassifier(n_estimators=50, learning_rate=0.001).fit(X_train, y_train)

In [51]:
predictions = abc.predict(X_test)

accuracy = accuracy_score(predictions, y_test)
print(accuracy)

f1 = f1_score(predictions, y_test)
print(f1)

0.6005322543742398
0.6761745785378902


## 5 Base Model Performance Check

In [None]:
F1_score_df = pd.DataFrame(columns = ['model','text_length', 'token_count', 'stopword_count', 'sum_cs', 'avg_cs'])
models = ['LinearSVC','GaussianNB','MultinomialNB','RandomForestClassifier','AdaBoostClassifier']
F1_score_df['model'] = models

In [8]:
SVC = LinearSVC()
GNB = GaussianNB()
MNB = MultinomialNB()
RFC = RandomForestClassifier()
ABC = AdaBoostClassifier()

trainers = [SVC, GNB, MNB, RFC, ABC]
features = ['text_length', 'token_count', 'stopword_count', 'sum_cs', 'avg_cs']

index_count=0

for trainer in tqdm(trainers):
    for feature in features:
        trainer.fit(X_train[feature].values.reshape(-1,1), y_train)
        predictions = trainer.predict(X_test[feature].values.reshape(-1,1))
        F1_score_df.at[index_count,feature] = f1_score(y_test, predictions)
        #print(index_count, feature, f1_score(y_test, predictions))
    index_count+=1
    

index_count = 0    
for trainer in tqdm(trainers):
    trainer.fit(X_train, y_train)
    predictions = trainer.predict(X_test)
    F1_score_df.at[index_count,'all'] = f1_score(y_test, predictions)
    index_count+=1


100%|██████████| 5/5 [07:07<00:00, 85.53s/it] 
100%|██████████| 5/5 [01:44<00:00, 20.88s/it]


In [9]:
F1_score_df

Unnamed: 0,model,text_length,token_count,stopword_count,sum_cs,avg_cs,all
0,LinearSVC,0.345473,0.620011,0.550464,0.67341,0.667378,0.001009
1,GaussianNB,0.527426,0.514891,0.511717,0.491069,0.634302,0.54292
2,MultinomialNB,0.667378,0.667378,0.667378,0.667378,0.667378,0.593224
3,RandomForestClassifier,0.659501,0.619978,0.663055,0.619196,0.567369,0.615517
4,AdaBoostClassifier,0.659501,0.620011,0.663068,0.598456,0.634175,0.634751


In [40]:
df = pd.DataFrame({
    'Base Model': ['LinearSCV', 'GaussianNB', 'MultinomialNB', 'RandomForestClassifier', 'AdaBoostClassifier'],
    'F1 Score': [0.001009, 0.542920, 0.593224, 0.615517, 0.634751]
})
df.head()

Unnamed: 0,Base Model,F1 Score
0,LinearSCV,0.001009
1,GaussianNB,0.54292
2,MultinomialNB,0.593224
3,RandomForestClassifier,0.615517
4,AdaBoostClassifier,0.634751


## Test Logistic Regression

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [35]:
X = df[['text_length', 'token_count', 'stopword_count', 'sum_cs', 'avg_cs', 'stopword_ratio']]
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [36]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression()

In [38]:
y_pred = lr.predict(X_test)

accuracy = accuracy_score(y_pred, y_test)
print(f"Accuracy: {accuracy}")

f1 = f1_score(y_pred, y_test)
print(f"F1 Score: {f1}")

roc_auc = roc_auc_score(y_pred, y_test)
print(f"ROC AUC Score: {roc_auc}")

Accuracy: 0.6077935527377383
F1 Score: 0.5829812296740161
ROC AUC Score: 0.6094334929716576


In [58]:
# Check Balance
print(f"y=0: {df[df.label == 0].label.shape[0]}")
print(f"y=1: {df[df.label == 1].label.shape[0]}")

y=0: 207511
y=1: 207700
