authors: Noah Jones and Daniel Low



## Imports

In [None]:
# !pip install -q lightgbm==3.3.3
# !pip install -q contractions==0.1.73
# !pip install --user -U nltk==3.8.1

In [2]:
import datetime
import os
import random
import re

import contractions
import nltk
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from lightgbm import LGBMClassifier
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import (
    auc,
    average_precision_score,
    precision_recall_curve,
    roc_auc_score,
    f1_score,
    confusion_matrix,
)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV,StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight

nltk.download('stopwords')


ModuleNotFoundError: No module named 'lightgbm'

## Load Datasets and set parameters

In [None]:
# Never truncate columns when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Set to True when the notebook runs on Google Colab with the data on Drive.
on_colab = False

if not on_colab:
    # Local run: paths are relative to the notebook's working directory.
    input_dir = './data/input/final_datasets/'
    output_dir = './data/output/performance/'
else:
    # Colab run: mount Drive first, then point at the project folders on it.
    from google.colab import drive
    project_name = 'project_name'
    drive.mount('/content/drive')
    input_dir = '/content/drive/MyDrive/datum/rallypoint_suicide_detection/data/input/'
    output_dir = '/content/drive/MyDrive/datum/rallypoint_suicide_detection/data/output/'


In [None]:
# Load the pre-split datasets; the first CSV column holds the row index.
train = pd.read_csv(input_dir+'train.csv', index_col=0)
val = pd.read_csv(input_dir+'val.csv', index_col=0)
test = pd.read_csv(input_dir+'test.csv', index_col=0)
print(train.shape, test.shape)

# Hyperparameters are tuned with cross-validation, so the separate validation
# split is folded back into the training data.
# (DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.)
train = pd.concat([train, val]).reset_index(drop=True)
# Shuffle once with a fixed seed: the CV folds used later are contiguous and
# unshuffled, so fold membership is decided entirely by this row order.
# 10 is the same value as SEED_VALUE, which is only defined in a later cell.
train = train.sample(frac=1, random_state=10).reset_index(drop=True)

# Columns fed to the text + metadata model.
metadata_columns = ['type_tag_content','contact_size','reputation','type']

X_train_text = train['content']
X_train_metadata = train[metadata_columns]
y_train = train['label'].values

X_test_text = test['content']
X_test_metadata = test[metadata_columns]
y_test = test['label'].values

print(X_train_text.shape, X_train_metadata.shape, y_train.shape, X_test_text.shape, X_test_metadata.shape, y_test.shape)



## Helper Functions

In [None]:
# Single seed reused for NumPy's global RNG and for the random_state of the
# over-sampler and LightGBM models built further down.
SEED_VALUE = 10
np.random.seed(SEED_VALUE)

# 5-fold cross-validation without shuffling; the splitter itself adds no
# randomness, so the folds follow the row order of the data passed to split().
kf = KFold(n_splits=5, shuffle=False, random_state=None)


In [None]:
import metrics_report # local script
from sklearn import metrics
def save_results_df(pipeline_gridsearch, X_train, y_train, X_test, y_test, model_name, output_dir= './', ts = None, results = None):
    """Retrain the best pipeline from a fitted grid search and evaluate it on the test set.

    Parameters
    ----------
    pipeline_gridsearch : fitted search object (e.g. GridSearchCV)
        Must expose ``estimator`` (the un-fitted pipeline) and ``best_params_``.
    X_train, y_train :
        Data used to retrain the pipeline with the best hyperparameters.
    X_test, y_test :
        Held-out data used for evaluation.
    model_name : str
        Key under which the results are stored; also forwarded to
        ``metrics_report`` together with ``output_dir`` and ``ts``.
    output_dir : str, default './'
        Forwarded to ``metrics_report.cm`` and ``metrics_report.classification_report``.
    ts : str or None
        Run identifier forwarded to ``metrics_report``.
    results : dict or None
        Existing results dict to extend. A new dict is created when omitted.

    Returns
    -------
    dict
        ``results`` with ``results[model_name]`` holding the classification
        reports and the confusion-matrix frames.
    """
    # A ``{}`` default would be created once and shared by every call.
    if results is None:
        results = {}

    # Rebuild the un-fitted pipeline with the winning hyperparameters and fit it
    # once. Calling ``fit`` on the search object itself would re-run the whole
    # hyperparameter search just to arrive at the same configuration.
    best_pipeline = clone(pipeline_gridsearch.estimator)
    best_pipeline.set_params(**pipeline_gridsearch.best_params_)

    print('training with best params...')
    best_pipeline.fit(X_train, y_train)
    print('done. evaluating on test set')
    y_pred_proba = best_pipeline.predict_proba(X_test)

    # Second column is used as the positive-class score.
    y_pred_proba_1 = y_pred_proba[:,1]
    # Hard prediction = column index with the highest probability (labels are
    # presumably 0/1 so index and label coincide — verify if labels change).
    y_pred = list(np.argmax(y_pred_proba, axis=1))

    clf_report_sklearn = metrics.classification_report(y_test,y_pred, output_dict=False)
    cm_df_meaning, cm_df, cm_df_norm = metrics_report.cm(y_test, y_pred, output_dir, model_name, ts, save=True)
    clf_report = metrics_report.classification_report(y_test,y_pred,y_pred_proba_1, output_dir, model_name, ts)

    results[model_name] = {
        'clf_report': clf_report,
        'cm_df_meaning': cm_df_meaning,
        'cm_df': cm_df,
        'cm_df_norm': cm_df_norm,
        'clf_report_sklearn': clf_report_sklearn
    }

    return results




def preprocess(string):
    """Clean one raw document before tokenization.

    Removes the literal '|body|' marker, deletes anything matching the URL
    patterns below, then expands contractions (slang expansion disabled).
    Returns the cleaned string.
    """
    cleaned = string.replace('|body|',"")
    # NOTE(review): r"www+" only removes the run of w's itself (e.g. "www"),
    # leaving the rest of the address in place — confirm this is intended.
    for pattern in (r"http\S+", r"www+"):
        cleaned = re.sub(pattern, "", cleaned)
    return contractions.fix(cleaned,slang=False)

def custom_tokenizer(string):
    """Split a string into word tokens (maximal runs of ``\\w`` characters).

    Returns a list of tokens in order of appearance; punctuation and
    whitespace are dropped. An empty string yields an empty list.

    Uses ``re.findall`` directly, which returns the same tokens as
    ``RegexpTokenizer(r'\\w+').tokenize`` without building a new tokenizer
    object for every document.
    """
    return re.findall(r'\w+', string)




# Hyperparameter tuning values

In [None]:
# When True, the next cell builds a much smaller search grid for quick smoke tests.
toy = False

In [None]:

# Search space for the text-only pipeline. Keys follow the
# '<pipeline step>__<parameter>' convention.
if not toy:
    # Full search: 3 values for each of 5 parameters.
    param_grid = {
        'vectorizer__max_features': [256, 2048, None],
        'lgbm__colsample_bytree': [1, 0.5, 0.1],
        'lgbm__max_depth': [-1, 10, 20],  # -1 is the default and means no max depth
        'lgbm__min_child_weight': [0.01, 0.001, 0.0001],
        'lgbm__min_child_samples': [10, 20, 40],  # alias: min_data_in_leaf
    }
else:
    # Reduced 2 x 2 grid for quick runs.
    print('WARNING, running toy version')
    param_grid = {
        'vectorizer__max_features': [2000, None],
        'lgbm__min_child_samples': [10, 20],  # alias: min_data_in_leaf
    }

# Text model

In [None]:
# Display the grid that will be searched for the text-only model.
param_grid

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek #chosen to reduce amount of false negatives by increasing false positives
#"The combination was shown to provide a reduction in false negatives at the cost of an increase in false positives for a binary classification task."
from imblearn.pipeline import Pipeline as imb_Pipeline


In [None]:
%%time
text_features = 'content'

ts_i = datetime.datetime.utcnow().strftime('%y-%m-%dT%H-%M-%S')


vectorizer = TfidfVectorizer(analyzer='word', binary=False,
                 decode_error='strict',
                 encoding='utf-8', input='content',
                 lowercase=True, max_df=0.8, max_features=None,
                 min_df=3, ngram_range=(1,2), norm='l2',
                 preprocessor=preprocess, smooth_idf=True,
                 stop_words=stopwords.words('english'), strip_accents='unicode',
                 sublinear_tf=True,
                 token_pattern='(?u)\\b\\w\\w+\\b',
                 tokenizer=custom_tokenizer, use_idf=True,
                 vocabulary=None)


model = LGBMClassifier(class_weight="balanced",random_state=SEED_VALUE)


# ros = RandomOverSampler(random_state=0)
# X_resampled, y_resampled = ros.fit_resample(X, y)
t_clf = imb_Pipeline([
     ('vectorizer', vectorizer),
    ('balancer', RandomOverSampler(random_state=SEED_VALUE)), # Doing this after TFIDF to not bias TFIDF weights
     ('lgbm', model), # this is class_weight='balanced'
    ])
                      
# see all parameters: https://lightgbm.readthedocs.io/en/latest/Parameters.html
# https://datascience.stackexchange.com/questions/108233/recommendations-for-tuning-xgboost-hyperparams

# get cross validation split indexes
iterable_train_val_indexes = []
for train_i, val_i in kf.split(X_train_text):
    iterable_train_val_indexes.append((train_i,val_i))

text_grid_search = GridSearchCV(t_clf, param_grid, cv=iterable_train_val_indexes, scoring=["f1"],refit="f1", n_jobs=-1)
text_grid_search.fit(X_train_text,y_train)


print(text_grid_search.best_score_, text_grid_search.best_params_)

# need to add estimator to parameter names 

# retrain with best params and evaluate on test set
results = save_results_df(text_grid_search,X_train_text,y_train,X_test_text,y_test,
                          model_name='lgbm_text',
                         output_dir=output_dir_i, ts = ts_i)

for k, v in results.get('lgbm_text').items():
    print(k) 
    display(v)
    print()

In [None]:
# Best mean cross-validated F1 and the hyperparameters that achieved it (text-only model).
print(text_grid_search.best_score_, text_grid_search.best_params_)

# Text + Metadata model

In [None]:
# When True, search a reduced grid for quick smoke tests.
toy = False

# Search space for the text + metadata pipeline. The TF-IDF vectorizer sits
# inside the 'text' branch of the 'preprocessor' step, hence the long key.
if not toy:
    param_grid = {
        'preprocessor__text__vectorizer__max_features': [2000, None],
        'lgbm__colsample_bytree': [1, 0.5, 0.1],
        'lgbm__max_depth': [-1, 10, 20],  # -1 is the default and means no max depth
        'lgbm__min_child_weight': [0.01, 0.001, 0.0001],
        'lgbm__min_child_samples': [10, 20, 40],  # alias: min_data_in_leaf
    }
else:
    print('WARNING, running toy version')
    param_grid = {
        'preprocessor__text__vectorizer__max_features': [2000, None],
        'lgbm__min_child_samples': [10, 20, 40],  # alias: min_data_in_leaf
    }

In [None]:
%%time


ts_i = datetime.datetime.utcnow().strftime('%y-%m-%dT%H-%M-%S')

numeric_features = ['contact_size','reputation','type']
text_features = 'type_tag_content'

# get cross validation split indexes
iterable_train_val_indexes = []
for train_i, val_i in kf.split(X_train_metadata):
    iterable_train_val_indexes.append((train_i,val_i))
    
# Define pipeline
numeric_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler())])

text_transformer = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(analyzer='word', binary=False,
                 decode_error='strict',
                 encoding='utf-8', input='content',
                 lowercase=True, max_df=0.8, max_features=20000,
                 min_df=3, ngram_range=(1, 2), norm='l2',
                 preprocessor=preprocess, smooth_idf=True,
                 stop_words=stopwords.words('english'), strip_accents='unicode',
                 sublinear_tf=True,
                 token_pattern='(?u)\\b\\w\\w+\\b',
                 tokenizer=custom_tokenizer, use_idf=True,
                 vocabulary=None))])
    
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('text', text_transformer, text_features)])
    
mnt_clf = Pipeline(steps=[('preprocessor', preprocessor),
('balancer', RandomOverSampler(random_state=SEED_VALUE)), # Doing this after TFIDF to not bias TFIDF weights
                      ('lgbm', LGBMClassifier(class_weight="balanced",random_state=SEED_VALUE, max_depth=20, min_child_samples=20, num_leaves=31, colsample_bytree=.95))
                     ])





meta_num_text_grid_search = GridSearchCV(mnt_clf, param_grid, cv=iterable_train_val_indexes, scoring=["f1"],refit="f1", n_jobs=-1)
meta_num_text_grid_search.fit(X_train_metadata,y_train)

# need to add estimator to parameter names 
best_params = {}
for k,v in meta_num_text_grid_search.best_params_.items():
    best_params['estimator__'+k] = v


# retrain with best params and evaluate on test set
results = save_results_df(meta_num_text_grid_search,best_params,X_train_metadata,y_train,X_test_metadata,y_test,
                          model_name='lgbm_metadata',
                         output_dir=output_dir_i, ts = ts_i)

for k, v in results.get('lgbm_metadata').items():
    print(k) 
    display(v)
    print()