In [81]:
import os
import pickle
import re

import gensim
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors, Word2Vec
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [2]:
# Load the in-domain training split: a tab-separated file without a header
# row, so the four column names are supplied explicitly.
df = pd.read_csv(
    'data/tokenized/in_domain_train.tsv',
    sep='\t',
    header=None,
    names=['sentence_source', 'label', 'label_notes', 'sentence'],
)

In [3]:
# Preview the first rows to sanity-check that the columns were assigned as intended.
df.head()

Unnamed: 0,sentence_source,label,label_notes,sentence
0,gj04,1,,"our friends wo n't buy this analysis , let alo..."
1,gj04,1,,one more pseudo generalization and i 'm giving...
2,gj04,1,,one more pseudo generalization or i 'm giving ...
3,gj04,1,,"the more we study verbs , the crazier they get ."
4,gj04,1,,day by day the facts are getting murkier .


In [4]:
# Normalize the text: every run of characters that is not an ASCII letter or
# digit collapses into a single space. Apostrophes are removed as well, so a
# token such as "n't" ends up as "n t".
df['sentence'] = df['sentence'].apply(lambda text: re.sub(r"[^a-zA-Z0-9]+", ' ', text))

# Tokenize on whitespace. A bare split() is used instead of strip().split(' ')
# so that a sentence left empty by the cleaning step yields [] rather than
# [''], and so this column agrees with the .split() calls used elsewhere in
# the notebook.
df['tokens'] = df['sentence'].apply(lambda text: text.split())

# Feature Engineering

In [65]:
def featureVecMethod(words, model, num_features):
    """Average the embedding vectors of the in-vocabulary words of one sentence.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single sentence.
    model : trained embedding model or keyed vectors
        Either an object exposing its vectors as ``model.wv`` (e.g. a gensim
        ``Word2Vec``) or the keyed-vectors object itself. It must support
        ``word in vectors`` and ``vectors[word]``. The parameter was previously
        misspelled ``mdoel``, which made the function ignore its argument and
        read a global named ``model`` instead.
    num_features : int
        Dimensionality of the embedding vectors.

    Returns
    -------
    numpy.ndarray
        ``float32`` vector of length ``num_features`` holding the mean of the
        known words' vectors. If none of the words is in the vocabulary
        (including an empty ``words``), a zero vector is returned instead of
        the NaN vector that dividing by zero would produce.
    """
    # Accept both a full model (vectors live under .wv) and bare keyed vectors.
    vectors = getattr(model, "wv", model)

    feature_vec = np.zeros(num_features, dtype="float32")
    n_known = 0

    for word in words:
        # Membership is checked on the keyed vectors directly; this avoids
        # rebuilding a vocabulary set on every call.
        if word in vectors:
            feature_vec += vectors[word]
            n_known += 1

    # Only divide when at least one word contributed, to avoid 0/0 -> NaN.
    if n_known:
        feature_vec /= n_known
    return feature_vec

def getAvgFeatures(reviews, model, num_features):
    """Build a feature matrix with one averaged embedding vector per review.

    Each element of ``reviews`` is passed, together with ``model`` and
    ``num_features``, to ``featureVecMethod``; the resulting vector is stored
    in the row that matches the element's position.

    Returns a ``float32`` array of shape ``(len(reviews), num_features)``.
    """
    feature_matrix = np.zeros((len(reviews), num_features), dtype="float32")

    for row_idx, tokens in enumerate(reviews):
        feature_matrix[row_idx] = featureVecMethod(tokens, model, num_features)

    return feature_matrix

In [54]:
# Corpus for the embedding model: one whitespace-split token list per sentence.
# NOTE(review): this is built from the full `df`, so it also contains the rows
# that train_test_split later puts in the held-out set.
sentences = [text.split() for text in df['sentence'].values]

In [55]:
# Fit word embeddings on the tokenized corpus. The vector size (300) has to
# match the num_features value passed to getAvgFeatures further down.
# NOTE(review): `size=` is the gensim 3.x spelling of this argument — confirm
# the pinned gensim version before upgrading.
model = Word2Vec(
    sentences=sentences,
    size=300,
    sample=1e-3,
    workers=4,
)

# Per the gensim docs, init_sims(replace=True) L2-normalizes the vectors and
# keeps only the normalized copies.
model.init_sims(replace=True)

In [11]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=2021,
)

In [62]:
# Whitespace-tokenize every training sentence; the result is a NumPy object
# array holding one token list per row.
clean_train_data = train_df['sentence'].map(str.split).values

In [72]:
# Targets come straight from the label column; features are the per-sentence
# averaged embeddings (300 dimensions, matching the Word2Vec vector size).
y_train = train_df['label'].values
X_train = getAvgFeatures(clean_train_data, model, num_features=300)

  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()


In [85]:
# Construct and fit the gradient-boosted classifier in one expression.
# NOTE(review): no eval_set is passed to fit(), so verbose=True presumably has
# nothing to report — confirm against the xgboost docs.
xbg_cls = (
    XGBClassifier(n_estimators=5000, max_depth=9)
    .fit(X_train, y_train, verbose=True)
)

# Evaluate Model

In [86]:
# Featurize the held-out sentences the same way as the training ones
# (whitespace tokens -> averaged 300-d embeddings), then predict their labels.
clean_test_data = test_df['sentence'].map(str.split)
x_test = getAvgFeatures(clean_test_data, model, num_features=300)
y_pred = xbg_cls.predict(x_test)

  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()


In [87]:
# Ground-truth labels of the held-out rows, in the same row order as the
# sentences that were featurized to produce y_pred.
y_true = test_df['label'].values

In [88]:
# Per-class precision / recall / F1 on the held-out split. In the recorded
# output below, class 0 is recalled far less often than class 1 (0.21 vs 0.85),
# so the headline accuracy of 0.68 mostly reflects the majority class.
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.35      0.21      0.26       470
           1       0.74      0.85      0.79      1241

    accuracy                           0.68      1711
   macro avg       0.55      0.53      0.53      1711
weighted avg       0.63      0.68      0.65      1711



In [89]:
# Persist the fitted classifier with pickle.
# The output directory is created first so that a fresh checkout without a
# `model/` folder does not fail with FileNotFoundError.
# NOTE(review): only the classifier is saved here; the Word2Vec `model` that
# produced its input features is not persisted in the visible cells.
os.makedirs('model', exist_ok=True)
with open('model/xgb.pickle', 'wb') as fh:
    pickle.dump(xbg_cls, fh)