# Modeling Baseline

## Setup

In [1]:
# import libraries
import time; full_run_time_start = time.time() # start timing exec right away
import pandas as pd
import numpy as np
import pickle
from scipy import sparse
import re
import os

# scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, f1_score,\
    accuracy_score, precision_score, recall_score, confusion_matrix

# display all df columns (default is 20)
pd.options.display.max_columns = None

## Utility functions for testing models and tracking results

In [2]:
# Results table: starts empty, with one column per field of a test result.
RESULT_COLUMNS = [
    'model_name',
    'model_params',
    'data_desc',
    'data_size',
    'features_no',
    'f1',
    'acc',
    'recall',
    'prec',
    'roc_auc',
    'cf_matrix',
    'train_time',
    'notes',
]
test_results = pd.DataFrame(columns=RESULT_COLUMNS)

def test_model(model, model_name, model_params, data_desc, X, y, notes=''):
    '''
    test_model(model, model_params, data_desc, X, y, notes='')
    
    Parameters:
    -----------
    model: instance of model to test
    model_name: name of model
    model_params: dict of (hyper)parameters passed to model
    data_desc: description of dataset (preprocessing steps etc.)
    X: feature array 
    y: target/label array
    notes: additional notes (default: empty string)
    '''

    # Split data using default of 75% for train, 25% for test.
    # Make sure test data has same toxic/nontoxic ratio as train data by
    # using stratify parameter.
    X_train, X_test, y_train, y_test =\
        train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    
    # train model and time execution
    train_time_start = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - train_time_start
    train_time_str = f'{int(train_time // 60)}m {round(train_time % 60)}s'

    # Make predictions on test set
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:,1]

    return {'model_name': model_name,
            'model_params': model_params,
            'data_desc': data_desc,
            'data_size': X.shape[0],
            'features_no': X.shape[1],
            'f1': round(f1_score(y_test, y_pred), 5),
            'acc': round(accuracy_score(y_test, y_pred), 5),
            'recall': round(recall_score(y_test, y_pred), 5),
            'prec': round(precision_score(y_test, y_pred), 5),
            'roc_auc': round(roc_auc_score(y_test, y_pred_proba), 5),
            'cf_matrix': confusion_matrix(y_test, y_pred),
            'train_time': train_time_str,
            'notes': notes}

In [3]:
def store_test_result(result):
    '''Add `result` as a new row at the end of the global test_results table.'''
    # Current row count is used as the label of the new row.
    next_row = len(test_results)
    test_results.loc[next_row] = result

## Load data (final data file)

In [4]:
# Read the data file into a DataFrame and show its (rows, columns) shape.
data_path = 'data/data_usampl_60_40_cleaned.csv'
df = pd.read_csv(data_path)
df.shape

(398434, 6)

In [5]:
# Report missing values per column, then drop incomplete rows and
# renumber the index so it is contiguous again.
print("Checking for NaN's ...")
print(df.isna().sum())

rows_before = len(df)
print("\nRows before dropping:", rows_before)

df = df.dropna().reset_index(drop=True)

rows_after = len(df)
print('Rows after:', rows_after)
print('Rows dropped:', rows_before - rows_after)

Checking for NaN's ...
raw                    0
clean                  0
clean_pp               0
clean_pp_lemma         0
clean_pp_lemma_stop    0
toxic                  0
dtype: int64

Rows before dropping: 398434
Rows after: 398434
Rows dropped: 0


In [6]:
# Preview the first rows to eyeball the text columns and the label.
df.head()

Unnamed: 0,raw,clean,clean_pp,clean_pp_lemma,clean_pp_lemma_stop,toxic
0,Trudeau with a brain? I assume you are taking...,Trudeau with a brain? I assume you are taking ...,trudeau with a brain i assume you are taking a...,trudeau with a brain i assume you be take abou...,trudeau brain assume take pierre imagine,1
1,The Jones Act was immediately lifted to help T...,The Jones Act was immediately lifted to help T...,the jones act was immediately lifted to help t...,the jones act be immediately lift to help texa...,jones act immediately lift help texas florida ...,1
2,As long as the Church keeps preventing the Lor...,As long as the Church keeps preventing the Lor...,as long as the church keeps preventing the lor...,as long as the church keep prevent the lord fr...,long church keep prevent lord call woman sacra...,0
3,"Climate change, in the sense discussed in the ...","Climate change, in the sense discussed in the ...",climate change in the sense discussed in the p...,climate change in the sense discuss in the pop...,climate change sense discuss pope encyclical d...,0
4,Fake news...now she is lying. figures....she i...,Fake news...now she is lying. figures....she i...,fake news now she is lying figures she is maki...,fake news now she be lie figure she be make he...,fake news lie figure make million gosh darn de...,1


## Optional: Create smaller sample from data to speed up experiments

In [7]:
# Total number of rows to sample; None means "use the full data".
sample_size = None

# uncomment to create sample of desired size
#sample_size = 50_000

if sample_size is not None:
    # ratio toxic/nontoxic: 40 % toxic, the remaining 60 % nontoxic
    tox_perc = 0.4

    # number of toxic/nontoxic rows
    sample_size_tox = int(sample_size * tox_perc)
    # Use the remainder for the nontoxic part so both parts always add up
    # to sample_size (truncating both products separately can lose a row).
    sample_size_nontox = sample_size - sample_size_tox

    # Fixed random_state keeps the drawn sample reproducible.
    sample_tox = df[df['toxic'] == 1].sample(sample_size_tox,
                                             random_state=42)
    sample_nontox = df[df['toxic'] == 0].sample(sample_size_nontox,
                                                random_state=42)

    df = pd.concat([sample_tox, sample_nontox])
    print(f'Using sample ({df.shape[0]} rows).')

else:
    print(f'Using full data ({df.shape[0]} rows).')

Using full data (398434 rows).


## Create label/target variable and check for imbalance

In [9]:
# Label column: compared against 1 (toxic) and 0 (nontoxic) elsewhere in
# this notebook.
target = df['toxic']

In [10]:
# Count the rows per class and report each class's share in percent.
value_counts = target.value_counts()
nontoxic_count, toxic_count = value_counts[0], value_counts[1]
total_count = nontoxic_count + toxic_count

nontoxic_perc = round((nontoxic_count / total_count) * 100, 1)
toxic_perc = round((toxic_count / total_count) * 100, 1)

print(f'Nontoxic (0): {nontoxic_count} ({nontoxic_perc} %)')
print(f'Toxic (1): {toxic_count} ({toxic_perc} %)')

Nontoxic (0): 238652 (59.9 %)
Toxic (1): 159782 (40.1 %)


## Function for bag of words

In [11]:
def bow(data):
    '''Fit a default CountVectorizer on `data` and return the transformed data.'''
    return CountVectorizer().fit_transform(data)

## Run baseline model (logistic regression) on different data cols

In [12]:
# parameters for model
params = {'max_iter': 2_000}

# load model with parameters
lr = LogisticRegression(**params)

# Text columns to compare, in the order the results should be listed.
text_cols = ['raw', 'clean', 'clean_pp', 'clean_pp_lemma',
             'clean_pp_lemma_stop']

# Same model, parameters and target for every run; only the text column
# that the bag of words is built from changes.
for col in text_cols:
    test_result = test_model(lr, 'BASELINE (logistic regression)', params,
                             f'bag of words on col "{col}"', bow(df[col]),
                             target)
    store_test_result(test_result)

## Show test results + total exec time

In [13]:
# Display the collected results, one row per stored test result.
test_results

Unnamed: 0,model_name,model_params,data_desc,data_size,features_no,f1,acc,recall,prec,roc_auc,cf_matrix,train_time,notes
0,BASELINE (logistic regression),{'max_iter': 2000},"bag of words on col ""raw""",398434,144000,0.82622,0.86685,0.78927,0.86679,0.92837,"[[43855, 3876], [6734, 25222]]",3m 1s,
1,BASELINE (logistic regression),{'max_iter': 2000},"bag of words on col ""clean""",398434,129114,0.82617,0.86682,0.78921,0.86676,0.92869,"[[43854, 3877], [6736, 25220]]",3m 20s,
2,BASELINE (logistic regression),{'max_iter': 2000},"bag of words on col ""clean_pp""",398434,128741,0.82622,0.86687,0.78921,0.86688,0.92867,"[[43858, 3873], [6736, 25220]]",3m 20s,
3,BASELINE (logistic regression),{'max_iter': 2000},"bag of words on col ""clean_pp_lemma""",398434,115213,0.82544,0.86648,0.78724,0.86754,0.92954,"[[43890, 3841], [6799, 25157]]",3m 29s,
4,BASELINE (logistic regression),{'max_iter': 2000},"bag of words on col ""clean_pp_lemma_stop""",398434,115174,0.82363,0.86521,0.7848,0.8665,0.92815,"[[43867, 3864], [6877, 25079]]",0m 46s,


In [14]:
full_run_time = time.time() - full_run_time_start
# Round the total before splitting it into minutes and seconds; rounding
# only the remainder could print e.g. "0m 60s".
run_minutes, run_seconds = divmod(round(full_run_time), 60)
print(f'Full run time: {run_minutes}m {run_seconds}s')

Full run time: 18m 54s


## Baseline Result

The best result achieved with the LogisticRegression baseline was an F1 score of about 0.826 (accuracy ≈ 0.867, ROC AUC ≈ 0.93; see the results table above). This will serve as the baseline reference for further modeling attempts.

## Other stuff

### Calculate average comment length on cleaned data (before preproc)

In [15]:
# Length of every comment in characters, averaged over all comments.
comm_len_chars = df['clean'].map(len)
avg_comm_len_chars = comm_len_chars.sum() / comm_len_chars.size

# Rough word count: every run of non-whitespace characters is one word.
word_pattern = re.compile(r'\S+')
comm_len_words = df['clean'].map(
    lambda text: len(word_pattern.findall(text)))
avg_comm_len_words = comm_len_words.sum() / comm_len_words.size

print('Average comment length:')
print(round(avg_comm_len_chars), 'characters')
print(round(avg_comm_len_words), 'words')

Average comment length:
287 characters
50 words
