## Python Notebook to train XGBoost model on both the original and augmented dataset

Here, we use the xgboost package but use the classifier built in the package that is built to integrate with sklearn at the cost of some functionality.
[Source](https://www.datacamp.com/tutorial/xgboost-in-python)

In [7]:
#Import necessary modules
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack, csr_matrix, vstack
import pandas as pd
import numpy as np
import nltk
import xgboost as xgb
import matplotlib.pylab as pl
import lime
import lime.lime_tabular

## Configure NLTK if applicable

In [None]:
# nltk.download('averaged_perceptron_tagger')
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('vader_lexicon')

In [21]:
# # obtained from https://gist.github.com/susanli2016/d35def30b99f0e2f56c0e01e19ad0878
# def gettop_n_bigram(corpus, n=None):
#     vec = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
#     bag_of_words = vec.transform(corpus)
#     sum_words = bag_of_words.sum(axis=0)
#     words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
#     words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
#     return words_freq[:n]

In [3]:
def get_len(row):
    return len(row['Text'])

def get_len_aug(row):
    return len(row['text'])

In [4]:
# Perform feature engineering on the dataset
def feature_engineering(dataset, aug):
    if (aug):
        dataset['los'] = dataset.apply(get_len_aug, axis=1)
    else:
        dataset['los'] = dataset.apply(get_len, axis=1)

    # bow_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 1))
    # col_name = ""
    # if aug:
    #     col_name = 'text'
    # else:
    #     col_name = 'Text'
    # text_col = dataset[col_name]
    # new_col = bow_vectorizer.fit_transform(text_col)
    # dataset['bowvec'] = new_col
    return dataset

In [5]:
# Add the features has_swear_word, severity, topic to the augmented train set

def add_aug_features(dataframe, x_train):
    # Prepare a df with only the features we want to OHE
    ohe_df = dataframe[['has_swear_word', 'severity', 'topic']]
    encoded_df = pd.get_dummies(ohe_df)
    encoded_df[:10]

In [3]:
train_original = pd.read_csv('dataset/fulltrain.csv')
train_augment = pd.read_csv('dataset/merged_final_df_with_topics_new.csv')
test_original = pd.read_csv('dataset/balancedtest.csv')
test_augment = pd.read_csv('dataset/test_final_with_topics_new.csv')

bow_vectorizer_ori = CountVectorizer(stop_words='english', ngram_range=(1, 1))
bow_vectorizer_aug = CountVectorizer(stop_words='english', ngram_range=(1, 1))
# X_ori = bow_vectorizer_ori.fit_transform(train_original['Text'])
# X_aug = bow_vectorizer_aug.fit_transform(train_augment['text'])

# y_ori = train_original['Label']
# y_aug = train_augment['label']

X_train_ori = bow_vectorizer_ori.fit_transform(train_original['Text'])
X_train_aug = bow_vectorizer_aug.fit_transform(train_augment['text'])

y_train_ori = train_original['Label']
y_train_aug = train_augment['label']

X_test_ori = bow_vectorizer_ori.transform(test_original['Text'])
X_test_aug = bow_vectorizer_aug.transform(test_augment['text'])

y_test_ori = test_original['Label']
y_test_aug = test_augment['label']

# X_train_ori, X_test_ori, y_train_ori, y_test_ori = train_test_split(X_ori, y_ori, test_size=0.20, random_state=42)
# X_train_aug, X_test_aug, y_train_aug, y_test_aug = train_test_split(X_aug, y_aug, test_size=0.20, random_state=42)
train_augment['label']

0        1
1        1
2        1
3        1
4        1
        ..
59790    4
59791    4
59792    4
59793    4
59794    4
Name: label, Length: 59795, dtype: int64

In [4]:
# ohe_df = train_augment[['has_swear_word', 'severity', 'topic']]
encoded_df = pd.get_dummies(train_augment, columns = ['has_swear_word', 'severity', 'topic'])
dropped_df = encoded_df.drop(['label', 'text', 'processed_text'], axis=1)
sparse = csr_matrix(dropped_df)
sparse_2 = csr_matrix(X_train_aug)
X_train_combined = hstack([sparse_2, sparse])

encoded_df = pd.get_dummies(test_augment, columns = ['has_swear_word', 'severity', 'topic'])
dropped_df = encoded_df.drop(['label', 'text', 'processed_text'], axis=1)
sparse = csr_matrix(dropped_df)
sparse_2 = csr_matrix(X_test_aug)
X_test_combined = hstack([sparse_2, sparse])

def pad_columns(matrix1, matrix2):
    matrix1_rows = matrix1.shape[0]
    matrix2_rows = matrix2.shape[0]
    matrix1_cols = matrix1.shape[1]
    matrix2_cols = matrix2.shape[1]
    diff = matrix1_cols - matrix2_cols
    if (diff < 0):
        # Need to pad columns to matrix 1
        diff = diff * -1
        zero_matrix = csr_matrix((matrix1_rows, diff))
        matrix1 = hstack([matrix1, zero_matrix])
    elif (diff > 0):
        # Need to pad columns to matrix 2
        zero_matrix = csr_matrix((matrix2_rows, diff))
        matrix2 = hstack([matrix2, zero_matrix])
    return (matrix1, matrix2)

X_train_combined, X_test_combined = pad_columns(X_train_combined, X_test_combined)
display(X_train_combined)
display(X_test_combined)

<59795x228982 sparse matrix of type '<class 'numpy.int64'>'
	with 10614690 stored elements in Compressed Sparse Row format>

<3000x228982 sparse matrix of type '<class 'numpy.float64'>'
	with 549286 stored elements in Compressed Sparse Row format>

In [164]:
#Run this if you need to modify X_train again for some reason
# X_train = train_augment['topic']
# print(dropped_df)
display(X_test_aug.shape)
display(X_train_aug.shape)
# print(X_test_ori.shape)
# print(train_original[:5])
# print(test_original[:5])

(3000, 228864)

(59795, 228864)

In [7]:
# Hyperparameter tuning for xgb model

# First, we do a train_test_split on the original dataset
bow_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 1))
X_ori = bow_vectorizer.fit_transform(train_original['Text'])
y_ori = train_original['Label']
X_train, X_test, y_train, y_test = train_test_split(X_ori, y_ori, test_size=0.20, random_state=42)

xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', tree_method='hist', enable_categorical=True, max_depth=3, n_estimators=500)
# params = {'n_estimators': [1, 10, 25, 50, 100, 200], 'eta': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], 'max_depth': [2, 3, 4, 5, 10, 20]}
params = {'n_estimators': [700, 800 ,900]}

le = LabelEncoder()
y_train = le.fit_transform(y_train)

# Yes this is probably not enough, but the dataset is large, and I don't have all day.
cv = RepeatedKFold(n_splits=2, n_repeats=1)

clf = GridSearchCV(xgb_classifier, params, cv=cv, verbose=2)
xgb_opt = clf.fit(X_train, y_train)

print("optimal_param: ", xgb_opt.best_estimator_.get_params()['n_estimators'])
# print("optimal_param: ", xgb_opt.best_estimator_.get_params()['eta'])

KeyboardInterrupt: 

### Hyperparameter tuning notes
For the XGBoost model, I decided to stick to tree ensemble methods (might need to justify, but it is the default so it is probably the more general one). Hence, there are 3 main hyperparameters to tune:  
1. n_estimators: Number of estimators used in the ensemble model.
1. eta: The learning rate.
1. max_depth: The maximum depth of each individual tree model.

Hyperparameter tuning was done on the training data with an 80:20 test split. Sklearn's GridSearchCV was used to automate the process.
In the end, the values of the hyperparameters we arrived at was:
1. n_estimators: 700
1. eta: 0.5
1. max_depth: 5

In [5]:
# Tuned model
xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', tree_method='hist', enable_categorical=True, max_depth = 5, n_estimators=700, eta=0.5)

# The label encoder is necessary as XGBClassifier expects labels [0,1,2,3] but we have [1,2,3,4]
le = LabelEncoder()
y_train_ori = le.fit_transform(y_train_ori)
y_test_ori = le.fit_transform(y_test_ori)
y_train_aug = le.fit_transform(y_train_aug)
y_test_aug = le.fit_transform(y_test_aug)

# Original dataset
xgb_classifier.fit(X_train_ori, y_train_ori)
y_pred_ori = xgb_classifier.predict(X_test_ori)
print('Original dataset:\n')
print(classification_report(y_test_ori, y_pred_ori))

# Augmented dataset
xgb_classifier.fit(X_train_combined, y_train_aug)
y_pred_aug = xgb_classifier.predict(X_test_combined)
print('Augmented dataset:\n')
print(classification_report(y_test_aug, y_pred_aug))

Original dataset:

              precision    recall  f1-score   support

           0       0.72      0.69      0.70       750
           1       0.71      0.52      0.60       750
           2       0.55      0.50      0.52       750
           3       0.66      0.92      0.77       750

    accuracy                           0.66      3000
   macro avg       0.66      0.66      0.65      3000
weighted avg       0.66      0.66      0.65      3000

Augmented dataset:

              precision    recall  f1-score   support

           0       0.76      0.68      0.72       750
           1       0.70      0.55      0.61       750
           2       0.48      0.38      0.42       750
           3       0.60      0.92      0.73       750

    accuracy                           0.63      3000
   macro avg       0.64      0.63      0.62      3000
weighted avg       0.64      0.63      0.62      3000



### Notes

XGBoost classifier (Sklearn version)
As a baseline for the "random" f1 score for the original dataset, I trained the XGBoost model on an X_train that was just the length of the text string. This f1 score turned out to be 0.06729. This is expected, and it just means that any meaningful features will produce an F1 score higher than this.<br>
Doing the same for the augmented dataset yields an F1 score of 0.1285. This improvement does not necessarily mean that the augmented dataset is "better", but rather that this is the base that any meaningful feature needs to beat.

We tried converting each of the text into a bag of words vector and training the XGB classifier on it. The f1 score obtained was 0.8765 for the original dataset, with an accuracy of 0.8851 and precision of 0.8935. Doing the same for the augmented dataset, the f1 score obtained was 0.8731, with accuracy 0.8759 and precision 0.8742. Although the model seems to do poorer on the augmented dataset, the discrepancy is minimal.<br>
Here, I think it is safe to conclude that when it comes to this particular feature, adding the new rows to the dataset does not affect the performance of the XGBoost classifier model.

In [152]:
# Generate CSV

test_original = pd.read_csv('dataset/balancedtest.csv')
test_augment = pd.read_csv('dataset/test_final_with_topics_new.csv')

test_original['predicted_label'] = y_pred_ori
test_augment['predicted_label'] = y_pred_aug

test_augment_dropped = test_augment.drop(['has_swear_word', 'severity', 'processed_text', 'topic'], axis=1)

test_original.to_csv('XGBoost_original.csv', index=False)
test_augment_dropped.to_csv('XGBoost_augment.csv', index=False)

display(test_augment_dropped)

# print(len(y_pred_ori))
# print(len(y_pred_aug))

Unnamed: 0,label,text,predicted_label
0,1,When so many actors seem content to churn out ...,0
1,1,In what football insiders are calling an unex...,0
2,1,In a freak accident following Game 3 of the N....,0
3,1,North Koreas official news agency announced to...,0
4,1,The former Alaska Governor Sarah Palin would b...,3
...,...,...,...
2995,4,The Air Force mistakenly gave rival companies ...,3
2996,4,The United Nations climate chief on Friday cha...,3
2997,4,River Plate midfielder Diego Buonanotte has un...,3
2998,4,Lawmakers were on the brink Tuesday of exempti...,3


In [6]:
# Downsample because shap was crashing my kernel
X_train_part = X_train_ori[:1000]

In [11]:
# Interpretability of the model using LIME

errors = y_pred_ori - y_test_ori
sorted_errors = np.argsort(abs(errors))
worse_5 = sorted_errors[-5:]
best_5 = sorted_errors[:5]
display(worse_5)
display(best_5)

array([2716, 2701, 2970, 2997, 2918])

array([12, 32, 33, 34, 35])