<a href="https://colab.research.google.com/github/feliciahf/data_science_exam/blob/main/XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# XGBoost Model

From this article: https://suatatan.com/posts/sklearn_xgboost_tc/

In [1]:
# import relevant packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from xgboost import XGBClassifier

# evaluation metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef

## The data

In [2]:
# import csv file as dataframe (from GitHub repo)
url = 'https://raw.githubusercontent.com/feliciahf/data_science_exam/main/hippoCorpusV2.csv'
df = pd.read_csv(url, encoding='latin1', delimiter=",")

# drop the 'retold' rows (only imagined and recalled stories are used);
# .copy() makes df an independent frame, so columns can be assigned to it
# afterwards without pandas raising SettingWithCopyWarning on versions that
# track slice-derived frames
df = df[df.memType != 'retold'].copy()

In [3]:
# encode the story type as a categorical column and expose its integer
# category codes as the numeric 'label' column
df['memType'] = pd.Categorical(df['memType'])
df['label'] = df['memType'].cat.codes

# show which story type each numeric label stands for
print(f"Label 0: {df.loc[df['label'] == 0,'memType'].unique()}")
print(f"Label 1: {df.loc[df['label'] == 1,'memType'].unique()}")

Label 0: ['imagined']
Categories (1, object): ['imagined']
Label 1: ['recalled']
Categories (1, object): ['recalled']


In [4]:
# preprocessing: n-gram count features (1- to 3-grams), vocabulary capped
# at 5000 features
cv = CountVectorizer(max_features=5000, encoding="utf-8",
      ngram_range = (1,3),
      # raw string so that \d reaches the regex engine as written instead of
      # being parsed as an (invalid) string escape sequence
      token_pattern = r"[A-Za-z_][A-Za-z\d_]*")

# split into story features (X) and categories (y)
X = cv.fit_transform(df.story).toarray()
y = df['label']

# split into train and test data (80/20); fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y,
      test_size=0.20,
      random_state=0)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on versions that predate it
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
count_df = pd.DataFrame(X_train, columns=feature_names)

# y_train keeps the row labels of df, while count_df has a fresh 0..n-1 index;
# assign by position so each label lands on its own row instead of being
# aligned on (mismatched) index values
# NOTE(review): if the vocabulary itself contains the token 'label', this
# overwrites that count column
count_df['label'] = y_train.to_numpy()

In [5]:
# train an XGBoost classifier (constructor defaults) on the training split
model = XGBClassifier()
model.fit(X_train, y_train)

# predict on both splits, rounding every predicted value to an integer label
yhat = model.predict(X_train)
train_pred = list(map(round, yhat))
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))

# accuracy on the data the model was fitted on
acc_train = accuracy_score(y_train, train_pred)
print("Accuracy on train data: %.2f%%" % (acc_train * 100.0))

# accuracy on the held-out test data
accuracy = accuracy_score(y_test, predictions)
print("Accuracy on test data: %.2f%%" % (accuracy * 100.0))

Accuracy on train data: 81.01%
Accuracy on test data: 69.65%


In [6]:
# overall test-set scores: each row is (printed heading, metric function,
# keyword options); precision, recall and F1 are weighted averages over classes
metric_rows = [
    ('Accuracy: ', accuracy_score, {}),
    ('Precision: ', precision_score, {'average': 'weighted', 'zero_division': 1}),
    ('Recall: ', recall_score, {'average': 'weighted', 'zero_division': 1}),
    ('F1:', f1_score, {'average': 'weighted'}),
]
for heading, metric, options in metric_rows:
    print(heading, metric(y_test, predictions, **options))

Accuracy:  0.6964769647696477
Precision:  0.6975740718848265
Recall:  0.6964769647696477
F1: 0.6959199816523208


In [7]:
# per-class precision, recall, F1 and support on the test data
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.68      0.74      0.71       557
           1       0.71      0.65      0.68       550

    accuracy                           0.70      1107
   macro avg       0.70      0.70      0.70      1107
weighted avg       0.70      0.70      0.70      1107



In [8]:
# Matthews correlation coefficient between true and predicted test labels
matthews_corrcoef(y_true=y_test, y_pred=predictions)

0.39387216194769925