In [15]:
import operator
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import seaborn as sns
from tqdm import tqdm

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
training_path = Path().resolve().parents[0] / 'data' / 'clean_training_data.csv'

In [3]:
df = pd.read_csv(training_path)

In [4]:
df.columns

Index(['Unnamed: 0', 'original_text', 'label', 'text_length',
       'complexity_scores', 'token_count', 'stopword_count', 'sum_cs',
       'avg_cs'],
      dtype='object')

In [5]:
df = df.drop(['Unnamed: 0', 'original_text'], axis=1)
df = df.dropna()

# Experimental
df['stopword_ratio'] = df['stopword_count'] / df['token_count']
df = df.sample(frac=0.5)

In [6]:
# X = df[['text_length', 'token_count', 'stopword_count', 'sum_cs', 'avg_cs']]
X = df[['stopword_ratio', 'sum_cs']]
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Random Forest Classifier

In [8]:
random_grid = {
    'n_estimators': [int(x) for x in np.linspace(200, 1200, 10)],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [int(x) for x in np.linspace(10, 110, 11)] + [None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

In [10]:
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(
    estimator=rf, 
    param_distributions=random_grid, 
    n_iter=20, 
    cv=3, 
    verbose=2, 
    random_state=100, 
    n_jobs=-1
)
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


 0.59392832        nan 0.59984102        nan 0.60951083 0.60053944
 0.59530112        nan 0.59958812 0.6081621         nan        nan
 0.5966619         nan]


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=20,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=100, verbose=2)

In [11]:
rf_random.best_params_

"""
0.6408 Accuracy / 0.66 F1
{'n_estimators': 200,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 20,
 'bootstrap': True}
"""

"\n0.6408 Accuracy / 0.66 F1\n{'n_estimators': 200,\n 'min_samples_split': 10,\n 'min_samples_leaf': 4,\n 'max_features': 'sqrt',\n 'max_depth': 20,\n 'bootstrap': True}\n"

In [12]:
accuracy = accuracy_score(rf_random.predict(X_test), y_test)
print(accuracy)

0.6239583835075382


In [None]:
f1 = f1_score(rf_random.predict(X_test), y_test)
print(f1)

In [7]:
params = dict(
    n_estimators=800,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features='sqrt',
    max_depth=20,
    bootstrap=True
)
rfc = RandomForestClassifier(**params).fit(X_train, y_train)

In [8]:
predictions = rfc.predict(X_test)

accuracy = accuracy_score(predictions, y_test)
print(accuracy)

f1 = f1_score(predictions, y_test)
print(f1)

0.6206589277973122
0.6429397229841542


## AdaBoost

In [9]:
random_grid = {
    'dtc__criterion': ['gini', 'entropy'],
    'dtc__splitter': ['best', 'random'],
    'abc__n_estimators': [int(x) for x in np.linspace(50, 500, 50)],
    'abc__learning_rate': [0.001, 0.01, 0.1, 0.2, 0.5]
}

In [10]:
dtc = DecisionTreeClassifier(random_state=100, max_features='auto', class_weight='auto', max_depth = None)
abc = AdaBoostClassifier(base_estimator=dtc)
abc_random = RandomizedSearchCV(
    estimator=abc, 
    param_distributions=random_grid, 
    n_iter=20, 
    cv=3, 
    verbose=2, 
    random_state=100, 
    n_jobs=-1,
    scoring='roc_auc'
)
abc_random.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


RandomizedSearchCV(cv=3, estimator=AdaBoostClassifier(), n_iter=20, n_jobs=-1,
                   param_distributions={'learning_rate': [0.001, 0.01, 0.1, 0.2,
                                                          0.5],
                                        'n_estimators': [50, 59, 68, 77, 86, 95,
                                                         105, 114, 123, 132,
                                                         141, 151, 160, 169,
                                                         178, 187, 196, 206,
                                                         215, 224, 233, 242,
                                                         252, 261, 270, 279,
                                                         288, 297, 307, 316, ...]},
                   random_state=100, verbose=2)

In [11]:
abc_random.best_params_

{'n_estimators': 380, 'learning_rate': 0.5}

In [12]:
accuracy = accuracy_score(abc_random.predict(X_test), y_test)
print(accuracy)

0.6056548335821974


In [13]:
f1 = f1_score(abc_random.predict(X_test), y_test)
print(f1)

0.6287243208924765


## 5 Base Model Performance Check

In [None]:
F1_score_df = pd.DataFrame(columns = ['model','text_length', 'token_count', 'stopword_count', 'sum_cs', 'avg_cs'])
models = ['LinearSVC','GaussianNB','MultinomialNB','RandomForestClassifier','AdaBoostClassifier']
F1_score_df['model'] = models

In [8]:
SVC = LinearSVC()
GNB = GaussianNB()
MNB = MultinomialNB()
RFC = RandomForestClassifier()
ABC = AdaBoostClassifier()

trainers = [SVC, GNB, MNB, RFC, ABC]
features = ['text_length', 'token_count', 'stopword_count', 'sum_cs', 'avg_cs']

index_count=0

for trainer in tqdm(trainers):
    for feature in features:
        trainer.fit(X_train[feature].values.reshape(-1,1), y_train)
        predictions = trainer.predict(X_test[feature].values.reshape(-1,1))
        F1_score_df.at[index_count,feature] = f1_score(y_test, predictions)
        #print(index_count, feature, f1_score(y_test, predictions))
    index_count+=1
    

index_count = 0    
for trainer in tqdm(trainers):
    trainer.fit(X_train, y_train)
    predictions = trainer.predict(X_test)
    F1_score_df.at[index_count,'all'] = f1_score(y_test, predictions)
    index_count+=1


100%|██████████| 5/5 [07:07<00:00, 85.53s/it] 
100%|██████████| 5/5 [01:44<00:00, 20.88s/it]


In [9]:
F1_score_df

Unnamed: 0,model,text_length,token_count,stopword_count,sum_cs,avg_cs,all
0,LinearSVC,0.345473,0.620011,0.550464,0.67341,0.667378,0.001009
1,GaussianNB,0.527426,0.514891,0.511717,0.491069,0.634302,0.54292
2,MultinomialNB,0.667378,0.667378,0.667378,0.667378,0.667378,0.593224
3,RandomForestClassifier,0.659501,0.619978,0.663055,0.619196,0.567369,0.615517
4,AdaBoostClassifier,0.659501,0.620011,0.663068,0.598456,0.634175,0.634751
