# XGBoost Model to Classify Authors

In [1]:
from xgboost import XGBClassifier
import pandas as pd
import numpy as np

In [2]:
# Load prebuilt featuresets
train_featureset = np.load('../data/05a_Train_Set_Features.npy', allow_pickle=True)
test_featureset = np.load('../data/05c_Test_Set_Features.npy', allow_pickle=True)

In [3]:
# Load original data to extract labels
train_data = pd.read_pickle('../data/05a_Authors_Train_Set.pkl')
test_data = pd.read_pickle('../data/05c_Authors_Test_Set.pkl')

In [4]:
# Extract labels
train_labels = list(train_data['label'])
test_labels = list(test_data['label'])

In [14]:
# define classifier
# are those parameters okay? binary seems not good
bst = XGBClassifier(n_estimators=10, max_depth=10, learning_rate=0.1, objective='multi:softmax', num_class=42) 
# fit model
bst.fit(train_featureset, train_labels)
# make predictions
preds = bst.predict(test_featureset)

In [18]:
from collections import Counter

Counter(preds)

Counter({0: 1157,
         23: 87,
         33: 188,
         8: 124,
         2: 388,
         39: 47,
         35: 654,
         30: 728,
         29: 1335,
         27: 612,
         36: 135,
         17: 131,
         40: 108,
         26: 1184,
         25: 17,
         41: 343,
         28: 21,
         22: 32,
         15: 93,
         1: 130,
         16: 340,
         38: 408,
         32: 418,
         14: 357,
         21: 122,
         18: 24,
         12: 821,
         20: 23,
         31: 47,
         19: 14,
         4: 8,
         34: 129,
         3: 5,
         10: 92,
         6: 44,
         11: 7,
         13: 18,
         37: 10,
         9: 2,
         24: 4,
         5: 1})

In [19]:
Counter(test_labels)

Counter({0.0: 192,
         1.0: 200,
         2.0: 223,
         3.0: 192,
         4.0: 195,
         5.0: 157,
         6.0: 188,
         7.0: 277,
         8.0: 409,
         9.0: 212,
         10.0: 209,
         11.0: 209,
         12.0: 363,
         13.0: 165,
         14.0: 454,
         15.0: 152,
         16.0: 151,
         17.0: 444,
         18.0: 180,
         19.0: 184,
         20.0: 231,
         21.0: 335,
         22.0: 189,
         23.0: 757,
         24.0: 163,
         25.0: 270,
         26.0: 272,
         27.0: 244,
         28.0: 192,
         29.0: 243,
         30.0: 226,
         31.0: 194,
         32.0: 184,
         33.0: 279,
         34.0: 391,
         35.0: 147,
         36.0: 164,
         37.0: 138,
         38.0: 266,
         39.0: 452,
         40.0: 162,
         41.0: 153})

In [15]:
from sklearn import metrics

accuracy = metrics.accuracy_score(test_labels, preds)
precision = metrics.precision_score(test_labels, preds, average='micro')
recall = metrics.recall_score(test_labels, preds, average='micro')
f_measure = metrics.f1_score(test_labels, preds, average='micro')
homogeneity = metrics.homogeneity_score(test_labels, preds)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F-Measure: {f_measure}")
print(f"Homogeneity: {homogeneity}")

Accuracy: 0.14008455034588776
Precision: 0.14008455034588776
Recall: 0.14008455034588776
F-Measure: 0.14008455034588776
Homogeneity: 0.23471354776933648
