## Python Notebook to train a gradient boosting model (scikit-learn `GradientBoostingClassifier`) on both the original and augmented dataset

In [1]:
#Import necessary modules
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
import nltk

## Configure NLTK if applicable

In [None]:
# nltk.download('averaged_perceptron_tagger')
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('vader_lexicon')

In [2]:
# obtained from https://gist.github.com/susanli2016/d35def30b99f0e2f56c0e01e19ad0878
def gettop_n_bigram(corpus, n=None):
    """Return the `n` most frequent bigrams in `corpus`, most frequent first.

    Bigrams are counted with a ``CountVectorizer`` restricted to
    ``ngram_range=(2, 2)`` and using its built-in English stop-word list.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count bigrams over. It is consumed exactly once, so
        one-shot iterables (e.g. generators) are counted correctly.
    n : int or None, optional
        Number of leading entries to return; ``None`` returns every bigram.

    Returns
    -------
    list of (str, count) tuples
        Each bigram with its total count over the corpus, sorted by
        descending count.
    """
    vec = CountVectorizer(ngram_range=(2, 2), stop_words='english')
    # Learn the vocabulary and build the document-term matrix in a single
    # pass; a separate fit() followed by transform() would iterate the corpus
    # twice and see nothing the second time for a one-shot iterable.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums: corpus-wide count per vocabulary entry (one row).
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [
        (word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()
    ]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [3]:
def get_len(row):
    """Return ``len()`` of the value stored under the ``'text'`` key of `row`."""
    text = row['text']
    return len(text)

In [4]:
# Perform feature engineering on the dataset
def feature_engineering(dataset, column_name):
    """Add a ``'los'`` (length of string) column to `dataset` and return it.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to augment. It is modified in place: the ``'los'`` column is
        added (or overwritten) on the object that was passed in.
    column_name : str
        Name of the column whose per-row ``len()`` is stored in ``'los'``.
        If `dataset` has no such column, the ``'text'`` column is used
        instead, which keeps older calls that passed a placeholder working.

    Returns
    -------
    pandas.DataFrame
        The same `dataset` object, now carrying the ``'los'`` column.
    """
    # Honour the requested column; fall back to the historical default.
    source_column = column_name if column_name in dataset.columns else 'text'
    # Column-wise map instead of a row-wise apply: same lengths, and it also
    # behaves sensibly on an empty frame.
    dataset['los'] = dataset[source_column].map(len)
    return dataset

In [16]:
import csv

# Directory holding the project CSVs; edit this one line to relocate the data.
DATA_DIR = 'D:/!Education/CS4248/Project'

# Original training set and the augmented training set.
train_original = pd.read_csv(f'{DATA_DIR}/fulltrain.csv')
train_augment = pd.read_csv(f'{DATA_DIR}/merged_final_df_with_topics.csv')

# Single-feature design matrix: the length ('los') of each row's 'text'.
# Note that feature_engineering adds the 'los' column to train_augment itself.
X_train = feature_engineering(train_augment, 'text')[['los']]

# # Define the header
# header = ['Label']

# # Read the contents of the file
# with open('D:/!Education/CS4248/Project/fulltrain.csv', 'r', newline='') as csvfile1:
#     rows = list(csv.reader(csvfile1))

# # Add the header
# rows.insert(0, header)

# # Write back to the CSV file
# with open('D:/!Education/CS4248/Project/fulltrain.csv', 'w', newline='') as csvfile1:
#     csvwriter = csv.writer(csvfile1)
#     csvwriter.writerows(rows)



# # Read the contents of the file
# with open('D:/!Education/CS4248/Project/merged_final_df_with_topics.csv', 'r', newline='') as csvfile2:
#     rows = list(csv.reader(csvfile2))

# # Add the header
# rows.insert(0, header)

# # Write back to the CSV file
# with open('D:/!Education/CS4248/Project/merged_final_df_with_topics.csv', 'w', newline='') as csvfile2:
#     csvwriter = csv.writer(csvfile2)
#     csvwriter.writerows(rows)


# Keep the two label vectors under distinct names so that one does not
# silently overwrite the other (the files spell the header differently:
# 'Label' in the original set, 'label' in the augmented set).
y_train_original = train_original['Label']
# X_train is derived from train_augment, so y_train must be its labels.
y_train = train_augment['label']
train_augment[:100]

Unnamed: 0,label,text,has_swear_word,severity,processed_text,topic,los
0,1,"A little less than a decade ago, hockey fans w...",False,0.0,"['little', 'less', 'decade', 'ago', 'hockey', ...",0,873
1,1,The writers of the HBO series The Sopranos too...,False,0.0,"['writers', 'hbo', 'series', 'sopranos', 'took...",0,715
2,1,Despite claims from the TV news outlet to offe...,False,0.0,"['despite', 'claims', 'tv', 'news', 'outlet', ...",0,4443
3,1,After receiving 'subpar' service and experienc...,False,0.0,"['receiving', 'subpar', 'service', 'experienci...",4,3913
4,1,After watching his beloved Seattle Mariners pr...,False,0.0,"['watching', 'beloved', 'seattle', 'mariners',...",0,1058
...,...,...,...,...,...,...,...
95,1,"An estimated 300 naked women, including actres...",True,1.0,"['estimated', '300', 'naked', 'women', 'includ...",0,654
96,1,"Across the U.S., ceremonies have already begun...",False,0.0,"['across', 'us', 'ceremonies', 'already', 'beg...",0,520
97,1,Describing himself as a complete anomaly withi...,False,0.0,"['describing', 'complete', 'anomaly', 'within'...",0,866
98,1,In what some economists believe to be a sign t...,False,0.0,"['economists', 'believe', 'sign', 'us', 'could...",4,3884


In [17]:
# Optional: uncomment the next line to replace the feature with the 'topic'
# column before re-running the training cell below.
# X_train = train_augment['topic']
X_train

        los
0       873
1       715
2      4443
3      3913
4      1058
...     ...
59790  1996
59791  1146
59792  1665
59793  1138
59794  3296

[59795 rows x 1 columns]


In [18]:
# Fixed seed so that re-running the notebook reproduces the same model.
model = GradientBoostingClassifier(random_state=0)

# scikit-learn expects a 2-D (n_samples, n_features) array. Only a 1-D input
# (e.g. a single Series) needs reshaping into one column; an input that is
# already 2-D is kept as is, so a multi-column X_train is not flattened.
X_train_formatted = np.asarray(X_train)
if X_train_formatted.ndim == 1:
    X_train_formatted = X_train_formatted.reshape(-1, 1)

model.fit(X_train_formatted, y_train)
# NOTE: predictions are made on the same rows the model was fitted on, so the
# value below is a training-set macro F1, not a held-out estimate.
y_pred = model.predict(X_train_formatted)
f1_score(y_train, y_pred, average='macro')

0.5086141748958363

### Notes

As a baseline approximating a "random" F1 score on the original dataset, I trained the gradient boosting model (scikit-learn's `GradientBoostingClassifier`) on an `X_train` that consisted only of the length of each text string. The resulting F1 score was 0.5234, which is surprisingly high and not much lower than the F1 score reported in the paper.
Doing the same for the augmented dataset yields an F1 score of 0.5086. This is lower, which is expected, since the length of the text string should theoretically have no correlation with the label.