### Creating Train & Test 

In [1]:
#Libraries that will be used by both fastText & XGBoost
import pandas as pd
import time
import sys
sys.path.append("..")
import matplotlib.pyplot as plt

In [2]:
# Read the Reuters R52 train/test splits. Both files are tab-separated and
# carry no header row, so the two columns are named explicitly afterwards.
train = pd.read_csv(
    '../Data/train/Reuters/r52-train-all-terms.txt',
    header=None,
    sep='\t',
)
test = pd.read_csv(
    '../Data/test/Reuters/r52-test-all-terms.txt',
    header=None,
    sep='\t',
)
for frame in (train, test):
    frame.columns = ['label', 'content']

# Preview of the test split (last expression -> rendered as the cell output)
test.head()

Unnamed: 0,label,content
0,trade,asian exporters fear damage from u s japan rif...
1,grain,china daily says vermin eat pct grain stocks a...
2,ship,australian foreign ship ban ends but nsw ports...
3,gold,western mining to open new gold mine in austra...
4,acq,sumitomo bank aims at quick recovery from merg...


### fastText (lr=1.0)

In [3]:
#Importing libs that are useful to fastText

import os
from FasttextClassifier.FasttextClassifier import FasttextClassifier
from sklearn.model_selection import train_test_split
import random
import csv
from pathlib import Path

# Put the labels into fastText format: every label carries a '__label__' prefix.
# The prefix is only added where it is still missing, so re-running this cell
# does not produce '__label____label__...' labels.
for frame in (train, test):
    has_prefix = frame['label'].str.startswith('__label__', na=False)
    frame['label'] = frame['label'].where(has_prefix, '__label__' + frame['label'])


# Save both splits as tab-separated files without header row or index column;
# these are the files handed to FasttextClassifier below.
train.to_csv('trainFT.txt', sep='\t', header=False, index=False)
test.to_csv('testFT.txt', sep='\t', header=False, index=False)


# Start time measurement (covers model creation AND the test run below)
start_time = time.time()

# Create the model from the training file, then evaluate it on the test file
ft_model = FasttextClassifier(train_data='trainFT.txt')
fastText_test = ft_model.fasttext_test('testFT.txt')
# Element 1 of the test result is reported as the accuracy
# NOTE(review): presumably precision@1 - confirm against FasttextClassifier.fasttext_test
fastText_accuracy = fastText_test[1]
print(fastText_accuracy)
fastText_execution_time = time.time() - start_time
print("--- %s seconds ---" % fastText_execution_time)

0.9256230529595015
--- 4.2410008907318115 seconds ---


In [None]:
# Sweep the epoch count from 1 to 99 with no explicit learning rate passed to
# FasttextClassifier. For each setting, record how long building the model took
# and the accuracy it reaches on the test file.
execTime_FT, accFT_lst = [], []
for epoch in range(1, 100):
    # Only the model construction is timed; the evaluation is excluded
    start_time = time.time()
    ft_model = FasttextClassifier(train_data='trainFT.txt', epoch=epoch)
    fastText_execution_time = time.time() - start_time
    execTime_FT.append(fastText_execution_time)

    # Element 1 of the test result is used as the accuracy
    fastText_test = ft_model.fasttext_test('testFT.txt')
    fastText_accuracy = fastText_test[1]
    accFT_lst.append(fastText_accuracy)

    print(fastText_accuracy)
    print("--- %s seconds ---" % fastText_execution_time)

0.7710280373831776
--- 0.5479910373687744 seconds ---
0.8376168224299065
--- 0.652385950088501 seconds ---
0.8621495327102804
--- 0.4676952362060547 seconds ---
0.8820093457943925
--- 0.5669848918914795 seconds ---
0.889018691588785
--- 0.6625957489013672 seconds ---
0.8987538940809969
--- 0.6346862316131592 seconds ---
0.9014797507788161
--- 0.7328293323516846 seconds ---
0.9073208722741433
--- 0.8395016193389893 seconds ---
0.9131619937694704
--- 1.060866117477417 seconds ---
0.9158878504672897
--- 1.1804468631744385 seconds ---
0.919392523364486
--- 1.2246370315551758 seconds ---
0.9244548286604362
--- 1.3702881336212158 seconds ---
0.9283489096573209
--- 1.4455649852752686 seconds ---
0.9283489096573209
--- 1.5571932792663574 seconds ---
0.92601246105919
--- 1.544337272644043 seconds ---
0.9256230529595015
--- 1.7472760677337646 seconds ---
0.9310747663551402
--- 1.8911521434783936 seconds ---
0.9287383177570093
--- 1.966303825378418 seconds ---
0.9264018691588785
--- 2.06567120552

### fastText (lr=0.3)

In [None]:
# Same epoch sweep as the previous fastText section, but with lr=0.3 passed to
# FasttextClassifier. Timings and accuracies go into their own lists so both
# runs can be plotted side by side.
# (A stray evaluation of the model left over from the previous cell was removed:
# its result was overwritten by the first loop iteration and it made this cell
# depend on leaked kernel state.)
execTime_FT03 = []
accFT_lst03 = []
for epoch in range(1, 100):
    # Only the model construction is timed; the evaluation is excluded
    start_time = time.time()
    ft_model = FasttextClassifier(train_data='trainFT.txt', epoch=epoch, lr=0.3)
    fastText_execution_time03 = time.time() - start_time
    execTime_FT03.append(fastText_execution_time03)

    # Element 1 of the test result is used as the accuracy
    fastText_test = ft_model.fasttext_test('testFT.txt')
    fastText_accuracy03 = fastText_test[1]
    accFT_lst03.append(fastText_accuracy03)

    print(fastText_accuracy03)
    print("--- %s seconds ---" % fastText_execution_time03)

### XGBoost (lr=1.0)

In [None]:
#Importing libs that are useful to XGBoost

import xgboost as xgb
from glove.glovevectorizer import GloveVectorizer
from sklearn.metrics import mean_squared_error, accuracy_score

In [None]:
# Load word vectors: build the GloVe vectorizer and run fit_transform on the
# training text. The result is the feature input used for XGBoost below.
vectorizer = GloveVectorizer()
Xtrain = vectorizer.fit_transform(train['content'])


In [None]:
# Start time measurement for the whole XGBoost baseline
# (label encoding, training and prediction all fall inside this window)
start_time = time.time()

# NOTE(review): xg_reg is not read anywhere in the visible cells; training below
# goes through xgb.train instead.
xg_reg = xgb.XGBRegressor(
    objective='reg:linear',
    colsample_bytree=0.3,
    n_estimators=10,
)

# Distinct training labels in first-seen order; a label's position in this list
# is used as its integer class id.
trainLabelLst = [*dict.fromkeys(train['label'])]
def get_label(label_str_lst, label_lst):
    """Map each label string to its integer position in ``label_lst``.

    Args:
        label_str_lst: iterable of labels to encode (e.g. a DataFrame column).
        label_lst: sequence of known labels; a label's index in this sequence
            is its class id. If a label occurs more than once, the first
            position is used.

    Returns:
        A list with one integer per element of ``label_str_lst``, in the same
        order.

    Raises:
        ValueError: if a label in ``label_str_lst`` does not occur in
            ``label_lst``. Dropping it silently would shorten the result and
            misalign it with the rows it is assigned to.
    """
    # One pass to build label -> index, instead of rescanning label_lst for
    # every input label.
    index_of = {}
    for position, label in enumerate(label_lst):
        index_of.setdefault(label, position)

    return_lst = []
    for s1 in label_str_lst:
        if s1 not in index_of:
            raise ValueError("label %r does not occur in label_lst" % (s1,))
        return_lst.append(index_of[s1])
    return return_lst


# Encode the string labels as integer class ids (position in trainLabelLst)
train['label_idx'] = get_label(train['label'], trainLabelLst)
Ytrain = train.label_idx

data_dmmatrix = xgb.DMatrix(data=Xtrain, label=Ytrain)
param = {
    'max_depth': 10,
    'eta': 1.0,
    'objective': 'multi:softmax',
    # One class per distinct training label, derived rather than hard-coded
    # (52 for the Reuters R52 training split)
    'num_class': len(trainLabelLst)}
# Passed to xgb.train as its third positional argument (number of boosting rounds)
epochs = 1

model = xgb.train(param, data_dmmatrix, epochs)

# Test features come from transform only; labels reuse the training label order
Xtest = vectorizer.transform(test.content)
test['label_idx'] = get_label(test['label'], trainLabelLst)
Ytest = test.label_idx
xgb_test = xgb.DMatrix(Xtest, label=Ytest)

predictions = model.predict(xgb_test)
XGBoost_accuracy = accuracy_score(Ytest, predictions)
print(XGBoost_accuracy)
# A second, unused model.predict call (stored as XGBoost_time) was removed here:
# it only inflated the measured time.
XGBoost_execution_time = time.time() - start_time
print("--- %s seconds ---" % XGBoost_execution_time)

In [None]:
# Sweep the third argument of xgb.train (number of boosting rounds) from 1 to 99
# with the current param dict. Each iteration trains a fresh model; only the
# training call is timed, and accuracy is measured on the test DMatrix.
execTime_XG, accXG_lst = [], []
for epochs in range(1, 100):
    start_time = time.time()
    model = xgb.train(param, data_dmmatrix, epochs)
    XGBoost_execution_time = time.time() - start_time
    execTime_XG.append(XGBoost_execution_time)

    predictions = model.predict(xgb_test)
    XGBoost_accuracy = accuracy_score(Ytest, predictions)
    accXG_lst.append(XGBoost_accuracy)

    print(XGBoost_accuracy)
    print("--- %s seconds ---" % XGBoost_execution_time)

### XGBoost (lr=0.3)

In [None]:
# NOTE(review): xg_reg is not read anywhere in the visible cells; training below
# goes through xgb.train instead.
xg_reg = xgb.XGBRegressor(
    objective='reg:linear',
    colsample_bytree=0.3,
    n_estimators=10,
)

# Distinct training labels in first-seen order; a label's position in this list
# is used as its integer class id.
trainLabelLst = [*dict.fromkeys(train['label'])]
def get_label(label_str_lst, label_lst):
    """Map each label string to its integer position in ``label_lst``.

    Args:
        label_str_lst: iterable of labels to encode (e.g. a DataFrame column).
        label_lst: sequence of known labels; a label's index in this sequence
            is its class id. If a label occurs more than once, the first
            position is used.

    Returns:
        A list with one integer per element of ``label_str_lst``, in the same
        order.

    Raises:
        ValueError: if a label in ``label_str_lst`` does not occur in
            ``label_lst``. Dropping it silently would shorten the result and
            misalign it with the rows it is assigned to.
    """
    # One pass to build label -> index, instead of rescanning label_lst for
    # every input label.
    index_of = {}
    for position, label in enumerate(label_lst):
        index_of.setdefault(label, position)

    return_lst = []
    for s1 in label_str_lst:
        if s1 not in index_of:
            raise ValueError("label %r does not occur in label_lst" % (s1,))
        return_lst.append(index_of[s1])
    return return_lst


# Encode the string labels as integer class ids (position in trainLabelLst)
train['label_idx'] = get_label(train['label'], trainLabelLst)
Ytrain = train.label_idx

data_dmmatrix = xgb.DMatrix(data=Xtrain, label=Ytrain)
param = {
    'max_depth': 10,
    'eta': 0.3,
    'objective': 'multi:softmax',
    # One class per distinct training label, derived rather than hard-coded
    # (52 for the Reuters R52 training split)
    'num_class': len(trainLabelLst)}

# Test features come from transform only; labels reuse the training label order
Xtest = vectorizer.transform(test.content)
test['label_idx'] = get_label(test['label'], trainLabelLst)
Ytest = test.label_idx
xgb_test = xgb.DMatrix(Xtest, label=Ytest)

# Sweep the number of boosting rounds (third argument of xgb.train) from 1 to 99.
# The former one-round training before the loop was dropped: both `epochs` and
# `model` are reassigned by the first iteration, so its result was never used.
execTime_XG_lr03 = []
accXG_lst_lr03 = []
for epochs in range(1, 100):
    # Only the training call is timed; prediction is excluded
    start_time = time.time()
    model = xgb.train(param, data_dmmatrix, epochs)
    XGBoost_execution_time = time.time() - start_time
    execTime_XG_lr03.append(XGBoost_execution_time)

    predictions = model.predict(xgb_test)
    XGBoost_accuracy = accuracy_score(Ytest, predictions)
    accXG_lst_lr03.append(XGBoost_accuracy)

    print(XGBoost_accuracy)
    print("--- %s seconds ---" % XGBoost_execution_time)

### Random Decision Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

# Reload the raw splits so the forest is trained on freshly read label strings
# (the fastText section rewrote the 'label' column of train/test in place).
train = pd.read_csv('../Data/train/Reuters/r52-train-all-terms.txt', header=None, sep='\t')
test = pd.read_csv('../Data/test/Reuters/r52-test-all-terms.txt', header=None, sep='\t')
train.columns = ['label', 'content']
test.columns = ['label', 'content']

vectorizer = GloveVectorizer()

Xtrain = vectorizer.fit_transform(train.content)
Ytrain = train.label
# Only transform the test text: the vectorizer must not be (re)fitted on test
# data. This matches how the XGBoost cells build their test features.
Xtest = vectorizer.transform(test.content)
Ytest = test.label

# create the model, train it, print scores
# NOTE(review): an earlier RandomForestClassifier(n_estimators=100,
# random_state=42, max_features='sqrt', verbose=1) was constructed here and then
# overwritten before use; it has been removed. The model actually trained is the
# unseeded single-tree forest below - confirm that is the intended baseline.
model = RandomForestClassifier(n_estimators=1)
model.fit(Xtrain, Ytrain)
print("train score:", model.score(Xtrain, Ytrain))
print("test score:", model.score(Xtest, Ytest))

In [None]:
# Note: the number of estimators is not the same thing as a number of epochs.
# Sweep n_estimators from 1 to 99. Construction plus fit is timed; scoring on
# the test split is not.
execTime_RF, accRF_lst = [], []
for estimators in range(1, 100):
    start_time = time.time()
    model = RandomForestClassifier(n_estimators=estimators)
    model.fit(Xtrain, Ytrain)
    Random_Forest_execution_time = time.time() - start_time
    execTime_RF.append(Random_Forest_execution_time)

    Random_Forest_accuracy = model.score(Xtest, Ytest)
    accRF_lst.append(Random_Forest_accuracy)

    print(Random_Forest_accuracy)
    print("--- %s seconds ---" % Random_Forest_execution_time)

### Plots

In [None]:
# Accuracy against training time, one line per model configuration.
xg = plt.plot(execTime_XG, accXG_lst, label='XGBoost(lr=1)')
xg_lr03 = plt.plot(execTime_XG_lr03, accXG_lst_lr03, label='XGBoost(lr=0.3)')
ftext = plt.plot(execTime_FT, accFT_lst, label="FastText(lr=1)")
ftext03 = plt.plot(execTime_FT03, accFT_lst03, label="FastText (lr=0.3)")
rf = plt.plot(execTime_RF, accRF_lst, label='Random Forest')
plt.legend()
# The recorded accuracies are fractions in [0, 1] (e.g. 0.92), not percentages,
# so the axis label carries no "%" unit.
plt.ylabel("Accuracy")
plt.xlabel("Time (s)")
plt.title("Accuracy vs. Training Time")
#plt.savefig('accuracy_training.eps', format='eps')
plt.show()

In [None]:
# Same data as a scatter plot: one point per trained model.
markersize_set = 8  # marker area passed to plt.scatter via `s`
xg = plt.scatter(execTime_XG, accXG_lst, label='XGBoost(lr=1)', s=markersize_set)
xg_lr03 = plt.scatter(execTime_XG_lr03, accXG_lst_lr03, label='XGBoost(lr=0.3)', s=markersize_set)
ftext = plt.scatter(execTime_FT, accFT_lst, label="FastText(lr=1)", s=markersize_set)
ftext03 = plt.scatter(execTime_FT03, accFT_lst03, label="FastText (lr=0.3)", s=markersize_set)
rf = plt.scatter(execTime_RF, accRF_lst, label='Random Forest', s=markersize_set)
plt.legend()
# The recorded accuracies are fractions in [0, 1] (e.g. 0.92), not percentages,
# so the axis label carries no "%" unit.
plt.ylabel("Accuracy")
plt.xlabel("Time (s)")
plt.title("Accuracy vs. Training Time")
#plt.savefig('acctrain.eps', format='eps')
plt.show()


In [None]:
import numpy as np

# (training times, accuracies, legend label) for every experiment above
series = [
    (execTime_XG, accXG_lst, 'XGBoost(lr=1)'),
    (execTime_XG_lr03, accXG_lst_lr03, 'XGBoost(lr=0.3)'),
    (execTime_FT, accFT_lst, "FastText(lr=1)"),
    (execTime_FT03, accFT_lst03, "FastText(lr=0.3)"),
    (execTime_RF, accRF_lst, "Random Forest"),
]

# The recorded accuracies are fractions in [0, 1], hence no "%" in the label
plt.ylabel("Accuracy")
plt.xlabel("Time (s)")
plt.title("Accuracy vs. Training Time")

for times, accs, name in series:
    # Cubic least-squares trend through the (time, accuracy) points
    trend = np.poly1d(np.polyfit(times, accs, 3))

    # Measured times are not sorted, so the first/last entries are not the
    # extremes; evaluate the trend over the true min..max range instead.
    grid = np.linspace(min(times), max(times), 500)

    # Points and curve are drawn in separate calls: keyword arguments of a
    # single plt.plot call apply to every line it creates, which would label
    # both and list each series twice in the legend.
    points, = plt.plot(times, accs, 'o', markersize=3, label=name)
    plt.plot(grid, trend(grid), color=points.get_color())

plt.legend()
plt.show()