# Proceedings classification

### Setup

In [1]:
%run -i 'random_state.py'
from packages import *
from clean_functions import *
from tokenizer import *
from fit_models import *

Dictionary to store results

In [2]:
results={}
results['lstm']={}
results['mlp']={}
results['xgboost']={}

In [3]:
hypers=[]

#pd.DataFrame(index=index, columns=['modelo','neurons', 'lamb1', 'lamb2'])
#hypers_nn=pd.DataFrame({'modelo':[None],'neurons':[None], 'lamb1':[None], 'lamb2':[None]})

# LSTM

## W2V/CNN/LSTM

In [4]:
X, y=np.load('data/X_w2v.npy'),np.load('data/y_w2v.npy')

In [5]:
encode={'H:Arquivado': 1,'H:Ativo': 2,'H:Suspenso': 3}
decode={1:'H:Arquivado',2:'H:Ativo',3:'H:Suspenso'}
#
for i in range(len(y)):
    y[i]=encode[y[i]]

In [6]:
y=np.array(y.astype(int))-1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_seed)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=2/3, random_state=random_seed)

y_train2=np.array(pd.get_dummies(y_train))
y_val2=np.array(pd.get_dummies(y_val))
y_test2=np.array(pd.get_dummies(y_test))

np.shape(X_train),np.shape(y_train)

((4514, 5, 70, 100), (4514,))

In [7]:
hyper=pd.read_csv('hyper/hyper_lstm_w2v')
hyper=hyper.iloc[np.argsort(hyper.loc[:,'score']),:].tail(1).reset_index()

In [8]:
hypers.append(['lstm_w2v', hyper.loc[:,'ks'][0], hyper.loc[:,'neurons'][0], hyper.loc[:,'lamb1'][0], hyper.loc[:,'lamb2'][0],'-','-','-','-'])

In [9]:
num_classes=3
Adam=optimizers.Adam(learning_rate=0.005, beta_1=0.9, beta_2=0.999, amsgrad=True)

#Cleaning session
tensorflow.keras.backend.clear_session()
 
#Hyper
k=hyper.loc[0,'ks']
neuron=hyper.loc[0,'neurons']
lamb1=hyper.loc[0,'lamb1']
lamb2=hyper.loc[0,'lamb2']

#Cleaning session
tensorflow.keras.backend.clear_session()

#Model for features extraction
inputs = Input(shape=np.shape(X_train)[1:])
conv = TimeDistributed(Conv1D(k, 1, activation='linear', kernel_constraint=unit_norm(axis=1), use_bias=False))(inputs)
pool = TimeDistributed(GlobalMaxPooling1D())(conv)
model_feat = Model(inputs, pool)

#Model for classification
pooled_inputs = Input(shape=(5, k))
lstm = LSTM(neuron, kernel_regularizer=regularizers.l1_l2(lamb1, lamb2))(pooled_inputs)
soft = Dense(num_classes, activation='softmax', kernel_regularizer=regularizers.l1_l2(lamb1, lamb2))(lstm)
model_classific = Model(pooled_inputs, soft)

#Final model
outputs = model_classific(model_feat(inputs))
model_cnn = Model(inputs, outputs)

#Compiling
model_cnn.compile(loss='categorical_crossentropy', optimizer=Adam, metrics=['accuracy'])

#Running
modelo=model_cnn.fit(X_train, y_train2, epochs=50,
                                              batch_size=500,
                                              shuffle=True,
                                              verbose=False,
                                              validation_data=(X_val, y_val2))

In [10]:
model_cnn.save('models/model_lstm_w2v.h5')
model_feat.save('models/model_feat_lstm_w2v.h5')
model_classific.save('models/model_classific_lstm_w2v.h5')

In [11]:
results['lstm']['w2v']=bootstrap(X_test, y_test, model_cnn)

100%|██████████| 100/100 [00:53<00:00,  1.88it/s]


## BERT/LSTM

In [12]:
X, y=np.load('data/X_bert.npy'),np.load('data/y_bert.npy')

In [13]:
encode={'H:Arquivado': 1,'H:Ativo': 2,'H:Suspenso': 3}
decode={1:'H:Arquivado',2:'H:Ativo',3:'H:Suspenso'}

for i in range(len(y)):
    y[i]=encode[y[i]]

In [14]:
y=np.array(y.astype(int))-1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_seed)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=2/3, random_state=random_seed)

y_train2=np.array(pd.get_dummies(y_train))
y_val2=np.array(pd.get_dummies(y_val))
y_test2=np.array(pd.get_dummies(y_test))

np.shape(X_train),np.shape(y_train)

((4514, 5, 768), (4514,))

In [15]:
hyper=pd.read_csv('hyper/hyper_lstm_bert')
hyper=hyper.iloc[np.argsort(hyper.loc[:,'score']),:].tail(1).reset_index()
model=model_lstm(hyper, X_train, y_train2, X_val, y_val2)

In [16]:
hypers.append(['lstm_bert', '-', hyper.loc[:,'neurons'][0], hyper.loc[:,'lamb1'][0], hyper.loc[:,'lamb2'][0],'-','-','-','-'])

In [17]:
results['lstm']['bert']=bootstrap(X_test, y_test, model)

100%|██████████| 100/100 [00:13<00:00,  7.20it/s]


## Doc2Vec/LSTM

In [18]:
X, y=np.load('data/X_d2v.npy'),np.load('data/y_d2v.npy')

In [19]:
encode={'H:Arquivado': 1,'H:Ativo': 2,'H:Suspenso': 3}
decode={1:'H:Arquivado',2:'H:Ativo',3:'H:Suspenso'}

for i in range(len(y)):
    y[i]=encode[y[i]]

In [20]:
y=np.array(y.astype(int))-1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_seed)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=2/3, random_state=random_seed)

y_train2=np.array(pd.get_dummies(y_train))
y_val2=np.array(pd.get_dummies(y_val))
y_test2=np.array(pd.get_dummies(y_test))

np.shape(X_train),np.shape(y_train)

((4514, 5, 100), (4514,))

In [21]:
hyper=pd.read_csv('hyper/hyper_lstm_d2v')
hyper=hyper.iloc[np.argsort(hyper.loc[:,'score']),:].tail(1).reset_index()
model=model_lstm(hyper, X_train, y_train2, X_val, y_val2)

In [22]:
hypers.append(['lstm_d2v',  '-', hyper.loc[:,'neurons'][0], hyper.loc[:,'lamb1'][0], hyper.loc[:,'lamb2'][0],'-','-','-','-'])

In [23]:
results['lstm']['d2v']=bootstrap(X_test, y_test, model)

100%|██████████| 100/100 [00:07<00:00, 13.32it/s]


## TFIDF/LSTM

In [24]:
X, y=np.load('data/X_tfidf.npy'),np.load('data/y_tfidf.npy')

In [25]:
encode={'H:Arquivado': 1,'H:Ativo': 2,'H:Suspenso': 3}
decode={1:'H:Arquivado',2:'H:Ativo',3:'H:Suspenso'}

for i in range(len(y)):
    y[i]=encode[y[i]]

In [26]:
y=np.array(y.astype(int))-1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_seed)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=2/3, random_state=random_seed)

y_train2=np.array(pd.get_dummies(y_train))
y_val2=np.array(pd.get_dummies(y_val))
y_test2=np.array(pd.get_dummies(y_test))

np.shape(X_train),np.shape(y_train)

((4514, 5, 4000), (4514,))

In [27]:
hyper=pd.read_csv('hyper/hyper_lstm_tfidf')
hyper=hyper.iloc[np.argsort(hyper.loc[:,'score']),:].tail(1).reset_index()
model=model_lstm(hyper, X_train, y_train2, X_val, y_val2)

In [28]:
hypers.append(['lstm_tfidf',  '-', hyper.loc[:,'neurons'][0], hyper.loc[:,'lamb1'][0], hyper.loc[:,'lamb2'][0],'-','-','-','-'])

In [29]:
results['lstm']['tfidf']=bootstrap(X_test, y_test, model)

100%|██████████| 100/100 [00:46<00:00,  2.15it/s]


# MLP

## W2V/MLP

In [30]:
X, y=np.load('data/X_w2v.npy'),np.load('data/y_w2v.npy')

In [31]:
N=np.sum(X[:,:,:,0]!=0, axis=2)
N=np.expand_dims(N, axis=2)
N[N==0]=1
X=np.sum(X, axis=2)
X=X/N
X=X.reshape((X.shape[0],-1))

In [32]:
encode={'H:Arquivado': 1,'H:Ativo': 2,'H:Suspenso': 3}
decode={1:'H:Arquivado',2:'H:Ativo',3:'H:Suspenso'}

for i in range(len(y)):
    y[i]=encode[y[i]]

In [33]:
y=np.array(y.astype(int))-1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_seed)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=2/3, random_state=random_seed)

y_train2=np.array(pd.get_dummies(y_train))
y_val2=np.array(pd.get_dummies(y_val))
y_test2=np.array(pd.get_dummies(y_test))

np.shape(X_train),np.shape(y_train)

((4514, 500), (4514,))

In [34]:
hyper=pd.read_csv('hyper/hyper_mlp_w2v')
hyper=hyper.iloc[np.argsort(hyper.loc[:,'score']),:].tail(1).reset_index()
model=model_mlp(hyper, X_train, y_train2, X_val, y_val2)

In [35]:
hypers.append(['mlp_w2v',  '-', hyper.loc[:,'neurons'][0], hyper.loc[:,'lamb1'][0], hyper.loc[:,'lamb2'][0],'-','-','-','-'])

In [36]:
results['mlp']['w2v']=bootstrap(X_test, y_test, model)

100%|██████████| 100/100 [00:05<00:00, 19.38it/s]


## BERT/MLP

In [37]:
X, y = np.load('data/X_bert.npy'),np.load('data/y_bert.npy')

In [38]:
X=X.reshape((X.shape[0],-1))

In [39]:
encode={'H:Arquivado': 1,'H:Ativo': 2,'H:Suspenso': 3}
decode={1:'H:Arquivado',2:'H:Ativo',3:'H:Suspenso'}

for i in range(len(y)):
    y[i]=encode[y[i]]

In [40]:
y=np.array(y.astype(int))-1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_seed)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=2/3, random_state=random_seed)

y_train2=np.array(pd.get_dummies(y_train))
y_val2=np.array(pd.get_dummies(y_val))
y_test2=np.array(pd.get_dummies(y_test))

np.shape(X_train),np.shape(y_train)

((4514, 3840), (4514,))

In [41]:
hyper=pd.read_csv('hyper/hyper_mlp_bert')
hyper=hyper.iloc[np.argsort(hyper.loc[:,'score']),:].tail(1).reset_index()
model=model_mlp(hyper, X_train, y_train2, X_val, y_val2)

In [42]:
hypers.append(['mlp_bert',  '-', hyper.loc[:,'neurons'][0], hyper.loc[:,'lamb1'][0], hyper.loc[:,'lamb2'][0],'-','-','-','-'])

In [43]:
results['mlp']['bert']=bootstrap(X_test, y_test, model)

100%|██████████| 100/100 [00:11<00:00,  8.35it/s]


## Doc2Vec/MLP

In [44]:
X, y=np.load('data/X_d2v.npy'),np.load('data/y_d2v.npy')

In [45]:
X=X.reshape((X.shape[0],-1))

In [46]:
encode={'H:Arquivado': 1,'H:Ativo': 2,'H:Suspenso': 3}
decode={1:'H:Arquivado',2:'H:Ativo',3:'H:Suspenso'}

for i in range(len(y)):
    y[i]=encode[y[i]]

In [47]:
y=np.array(y.astype(int))-1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_seed)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=2/3, random_state=random_seed)

y_train2=np.array(pd.get_dummies(y_train))
y_val2=np.array(pd.get_dummies(y_val))
y_test2=np.array(pd.get_dummies(y_test))

np.shape(X_train),np.shape(y_train)

((4514, 500), (4514,))

In [48]:
hyper=pd.read_csv('hyper/hyper_mlp_d2v')
hyper=hyper.iloc[np.argsort(hyper.loc[:,'score']),:].tail(1).reset_index()
model=model_mlp(hyper, X_train, y_train2, X_val, y_val2)

In [49]:
hypers.append(['mlp_d2v',  '-', hyper.loc[:,'neurons'][0], hyper.loc[:,'lamb1'][0], hyper.loc[:,'lamb2'][0],'-','-','-','-'])

In [50]:
results['mlp']['d2v']=bootstrap(X_test, y_test, model)

100%|██████████| 100/100 [00:05<00:00, 19.51it/s]


## TFIDF/MLP

In [51]:
X, y=np.load('data/X_tfidf.npy'),np.load('data/y_tfidf.npy')

In [52]:
X=X.reshape((X.shape[0],-1))

In [53]:
encode={'H:Arquivado': 1,'H:Ativo': 2,'H:Suspenso': 3}
decode={1:'H:Arquivado',2:'H:Ativo',3:'H:Suspenso'}

for i in range(len(y)):
    y[i]=encode[y[i]]

In [54]:
y=np.array(y.astype(int))-1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_seed)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=2/3, random_state=random_seed)

y_train2=np.array(pd.get_dummies(y_train))
y_val2=np.array(pd.get_dummies(y_val))
y_test2=np.array(pd.get_dummies(y_test))

np.shape(X_train),np.shape(y_train)

((4514, 20000), (4514,))

In [55]:
hyper=pd.read_csv('hyper/hyper_mlp_tfidf')
hyper=hyper.iloc[np.argsort(hyper.loc[:,'score']),:].tail(1).reset_index()
model=model_mlp(hyper, X_train, y_train2, X_val, y_val2)

In [56]:
hypers.append(['mlp_tfidf',  '-', hyper.loc[:,'neurons'][0], hyper.loc[:,'lamb1'][0], hyper.loc[:,'lamb2'][0],'-','-','-','-'])

In [57]:
results['mlp']['tfidf']=bootstrap(X_test, y_test, model)

100%|██████████| 100/100 [00:31<00:00,  3.22it/s]


# XGBoost

## W2V/XGBoost


In [58]:
X, y=np.load('data/X_w2v.npy'),np.load('data/y_w2v.npy')

In [59]:
N=np.sum(X[:,:,:,0]!=0, axis=2)
N=np.expand_dims(N, axis=2)
N[N==0]=1
X=np.sum(X, axis=2)
X=X/N
X=X.reshape((X.shape[0],-1))

In [60]:
encode={'H:Arquivado': 1,'H:Ativo': 2,'H:Suspenso': 3}
decode={1:'H:Arquivado',2:'H:Ativo',3:'H:Suspenso'}

for i in range(len(y)):
    y[i]=encode[y[i]]

In [61]:
y=np.array(y.astype(int))-1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_seed)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=2/3, random_state=random_seed)

np.shape(X_train),np.shape(y_train)

((4514, 500), (4514,))

In [62]:
hyper=pd.read_csv('hyper/hyper_xgboost_w2v')
hyper=hyper.iloc[np.argsort(hyper.loc[:,'score']),:].tail(1).reset_index()
model=model_xgboost(hyper, X_train, y_train, X_val, y_val, random_seed)

In [63]:
hypers.append(['xgboost_w2v', '-', '-','-','-', hyper.loc[:,'max_depth'][0], 
               hyper.loc[:,'learning_rate'][0], hyper.loc[:,'gamma'][0], hyper.loc[:,'reg_lambda'][0]])

In [64]:
results['xgboost']['w2v']=bootstrap(X_test, y_test, model, nn=False)

100%|██████████| 100/100 [00:00<00:00, 113.07it/s]


## BERT/XGBoost


In [65]:
X, y = np.load('data/X_bert.npy'),np.load('data/y_bert.npy')

In [66]:
X=X.reshape((X.shape[0],-1))

In [67]:
encode={'H:Arquivado': 1,'H:Ativo': 2,'H:Suspenso': 3}
decode={1:'H:Arquivado',2:'H:Ativo',3:'H:Suspenso'}

for i in range(len(y)):
    y[i]=encode[y[i]]

In [68]:
y=np.array(y.astype(int))-1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_seed)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=2/3, random_state=random_seed)

np.shape(X_train),np.shape(y_train)

((4514, 3840), (4514,))

In [69]:
hyper=pd.read_csv('hyper/hyper_xgboost_bert')
hyper=hyper.iloc[np.argsort(hyper.loc[:,'score']),:].tail(1).reset_index()
model=model_xgboost(hyper, X_train, y_train, X_val, y_val, random_seed)

In [70]:
hypers.append(['xgboost_bert', '-', '-','-','-', hyper.loc[:,'max_depth'][0], 
               hyper.loc[:,'learning_rate'][0], hyper.loc[:,'gamma'][0], hyper.loc[:,'reg_lambda'][0]])

In [71]:
results['xgboost']['bert']=bootstrap(X_test, y_test, model, nn=False)

100%|██████████| 100/100 [00:06<00:00, 16.28it/s]


## Doc2Vec/XGBoost

In [72]:
X, y=np.load('data/X_d2v.npy'),np.load('data/y_d2v.npy')

In [73]:
X=X.reshape((X.shape[0],-1))

In [74]:
encode={'H:Arquivado': 1,'H:Ativo': 2,'H:Suspenso': 3}
decode={1:'H:Arquivado',2:'H:Ativo',3:'H:Suspenso'}

for i in range(len(y)):
    y[i]=encode[y[i]]

In [75]:
y=np.array(y.astype(int))-1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_seed)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=2/3, random_state=random_seed)

np.shape(X_train),np.shape(y_train)

((4514, 500), (4514,))

In [76]:
hyper=pd.read_csv('hyper/hyper_xgboost_d2v')
hyper=hyper.iloc[np.argsort(hyper.loc[:,'score']),:].tail(1).reset_index()
model=model_xgboost(hyper, X_train, y_train, X_val, y_val, random_seed)

In [77]:
hypers.append(['xgboost_d2v', '-', '-','-','-', hyper.loc[:,'max_depth'][0], 
               hyper.loc[:,'learning_rate'][0], hyper.loc[:,'gamma'][0], hyper.loc[:,'reg_lambda'][0]])

In [78]:
results['xgboost']['d2v']=bootstrap(X_test, y_test, model, nn=False)

100%|██████████| 100/100 [00:00<00:00, 110.58it/s]



## TFIDF/XGBoost

In [79]:
X, y=np.load('data/X_tfidf.npy'),np.load('data/y_tfidf.npy')

In [80]:
X=X.reshape((X.shape[0],-1))

In [81]:
encode={'H:Arquivado': 1,'H:Ativo': 2,'H:Suspenso': 3}
decode={1:'H:Arquivado',2:'H:Ativo',3:'H:Suspenso'}

for i in range(len(y)):
    y[i]=encode[y[i]]

In [82]:
y=np.array(y.astype(int))-1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_seed)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=2/3, random_state=random_seed)

np.shape(X_train),np.shape(y_train)

((4514, 20000), (4514,))

In [83]:
hyper=pd.read_csv('hyper/hyper_xgboost_tfidf')
hyper=hyper.iloc[np.argsort(hyper.loc[:,'score']),:].tail(1).reset_index()
model=model_xgboost(hyper, X_train, y_train, X_val, y_val, random_seed)

In [84]:
hypers.append(['xgboost_tfidf', '-', '-','-','-', hyper.loc[:,'max_depth'][0], 
               hyper.loc[:,'learning_rate'][0], hyper.loc[:,'gamma'][0], hyper.loc[:,'reg_lambda'][0]])

In [85]:
results['xgboost']['tfidf']=bootstrap(X_test, y_test, model, nn=False)

100%|██████████| 100/100 [00:31<00:00,  3.16it/s]


# Latex tables

Hyperparameters

In [88]:
hypers=pd.DataFrame(hypers, columns=['model','ks','neurons','lamb1','lamb2','max_depth','learning_rate','gamma','reg_lambda'])

In [89]:
print("\\begin{table*}[h] \n",
      "\centering \n",
      '\caption{Best values for hyperparameters} \n',
      ' \\begin{tabular}{c|c|cccc|cccc|c} \n',
      ' \\hline \n',
      ' Classifier & Feature extraction & Filters & Hidden Size & Reg l1 & Reg l2 & Max. depth & Learning rate & Gamma & Lambda & \\\ \n',
      ' \\hline')

i=0
for c in ['LSTM','MLP','XGboost']:
    print('\n\\multirow{4}{*}{'+c+'}')
    
    for f in ['W2V','Doc2Vec','TFIDF','BERT']:
        
        print('& '+f+' & '+str(hypers.iloc[i,1])+' &' +str(hypers.iloc[i,2])+' & '+str(hypers.iloc[i,3])+' & '+str(hypers.iloc[i,4])+' & '+str(hypers.iloc[i,5])+' & '+str(hypers.iloc[i,6])+' & '+str(hypers.iloc[i,7])+' & '+str(hypers.iloc[i,8]),"\\\\")
        
        i=i+1
    print('\\hline \n')
    
print(' \end{tabular}% \n',
      '\label{tab:hyper}% \n',
   '\end{table*}% \n')

\begin{table*}[h] 
 \centering 
 \caption{Best values for hyperparameters} 
  \begin{tabular}{c|c|cccc|cccc|c} 
  \hline 
  Classifier & Feature extraction & Filters & Hidden Size & Reg l1 & Reg l2 & Max. depth & Learning rate & Gamma & Lambda & \\ 
  \hline

\multirow{4}{*}{LSTM}
& W2V & 9 &100 & 0.0005 & 0.0001 & - & - & - & - \\
& Doc2Vec & - &25 & 5e-06 & 1e-05 & - & - & - & - \\
& TFIDF & - &25 & 0.0001 & 1e-06 & - & - & - & - \\
& BERT & - &150 & 0.001 & 0.0005 & - & - & - & - \\
\hline 


\multirow{4}{*}{MLP}
& W2V & - &200 & 0.0001 & 1e-05 & - & - & - & - \\
& Doc2Vec & - &200 & 0.0001 & 0.0001 & - & - & - & - \\
& TFIDF & - &200 & 0.0001 & 1e-06 & - & - & - & - \\
& BERT & - &50 & 0.0005 & 1e-05 & - & - & - & - \\
\hline 


\multirow{4}{*}{XGboost}
& W2V & - &- & - & - & 5 & 1.0 & 1e-05 & 300.0 \\
& Doc2Vec & - &- & - & - & 5 & 1.0 & 0.01 & 50.0 \\
& TFIDF & - &- & - & - & 7 & 1.0 & 0.01 & 300.0 \\
& BERT & - &- & - & - & 5 & 0.5 & 0.0001 & 10.0 \\
\hline 

 \end{tabular}% 
 \

Comparing models

In [86]:
for i in set(results):
    for j in set(results[i]):
        results[i][j]['accuracy'][0]=np.round(results[i][j]['accuracy'][0], 2)
        results[i][j]['accuracy'][1]=np.round(results[i][j]['accuracy'][1], 2)
        for k in ['macro avg', 'weighted avg']:
            results[i][j][k]['f1-score'][0]=np.round(results[i][j][k]['f1-score'][0], 2)
            results[i][j][k]['f1-score'][1]=np.round(results[i][j][k]['f1-score'][1], 2)
            results[i][j][k]['precision'][0]=np.round(results[i][j][k]['precision'][0], 2)
            results[i][j][k]['precision'][1]=np.round(results[i][j][k]['precision'][1], 2)
            results[i][j][k]['recall'][0]=np.round(results[i][j][k]['recall'][0], 2)
            results[i][j][k]['recall'][1]=np.round(results[i][j][k]['recall'][1], 2)

In [87]:
print("\\begin{table*}[h] \n",
      "\centering \n",
      '\caption{Aggregate analysis of evaluation metrics} \n',
      ' \\begin{tabular}{c|c|c|ccc|ccc|c} \n',
      ' \\hline \n',
      '\multicolumn{3}{c}{} & \multicolumn{3}{c}{Macro averaging} & \multicolumn{3}{c}{Micro averaging} & \multicolumn{1}{c}{} \\\ \n',
      '\\hline \n',
      ' Classifier & Feature extraction & Accuracy & F1 Score & Precision & Recall & F1 Score & Precision & Recall \\\ \n',
      ' \\hline \n',
      '\\multirow{4}{*}{LSTM}   & W2V/CNN &'+str(results['lstm']['w2v']['accuracy'][0])+' $ \pm $ '+str(results['lstm']['w2v']['accuracy'][1])+'&'+str(results['lstm']['w2v']['macro avg']['f1-score'][0])+' $ \pm $ '+str(results['lstm']['w2v']['macro avg']['f1-score'][1])+'&'+str(results['lstm']['w2v']['macro avg']['precision'][0])+' $ \pm $ '+str(results['lstm']['w2v']['macro avg']['precision'][1])+'&'+str(results['lstm']['w2v']['macro avg']['recall'][0])+' $ \pm $ '+str(results['lstm']['w2v']['macro avg']['recall'][1])+'&'+str(results['lstm']['w2v']['weighted avg']['f1-score'][0])+' $ \pm $ '+str(results['lstm']['w2v']['weighted avg']['f1-score'][1])+'&'+str(results['lstm']['w2v']['weighted avg']['precision'][0])+' $ \pm $ '+str(results['lstm']['w2v']['weighted avg']['precision'][1])+'&'+str(results['lstm']['w2v']['weighted avg']['recall'][0])+' $ \pm $ '+str(results['lstm']['w2v']['weighted avg']['recall'][1])+'\\\ \n',
                              ' & Doc2Vec &'+str(results['lstm']['d2v']['accuracy'][0])+' $ \pm $ '+str(results['lstm']['d2v']['accuracy'][1])+'&'+str(results['lstm']['d2v']['macro avg']['f1-score'][0])+' $ \pm $ '+str(results['lstm']['d2v']['macro avg']['f1-score'][1])+'&'+str(results['lstm']['d2v']['macro avg']['precision'][0])+' $ \pm $ '+str(results['lstm']['d2v']['macro avg']['precision'][1])+'&'+str(results['lstm']['d2v']['macro avg']['recall'][0])+' $ \pm $ '+str(results['lstm']['d2v']['macro avg']['recall'][1])+'&'+str(results['lstm']['d2v']['weighted avg']['f1-score'][0])+' $ \pm $ '+str(results['lstm']['d2v']['weighted avg']['f1-score'][1])+'&'+str(results['lstm']['d2v']['weighted avg']['precision'][0])+' $ \pm $ '+str(results['lstm']['d2v']['weighted avg']['precision'][1])+'&'+str(results['lstm']['d2v']['weighted avg']['recall'][0])+' $ \pm $ '+str(results['lstm']['d2v']['weighted avg']['recall'][1])+'\\\ \n',
                              ' & TFIDF &'+str(results['lstm']['tfidf']['accuracy'][0])+' $ \pm $ '+str(results['lstm']['tfidf']['accuracy'][1])+'&'+str(results['lstm']['tfidf']['macro avg']['f1-score'][0])+' $ \pm $ '+str(results['lstm']['tfidf']['macro avg']['f1-score'][1])+'&'+str(results['lstm']['tfidf']['macro avg']['precision'][0])+' $ \pm $ '+str(results['lstm']['tfidf']['macro avg']['precision'][1])+'&'+str(results['lstm']['tfidf']['macro avg']['recall'][0])+' $ \pm $ '+str(results['lstm']['tfidf']['macro avg']['recall'][1])+'&'+str(results['lstm']['tfidf']['weighted avg']['f1-score'][0])+' $ \pm $ '+str(results['lstm']['tfidf']['weighted avg']['f1-score'][1])+'&'+str(results['lstm']['tfidf']['weighted avg']['precision'][0])+' $ \pm $ '+str(results['lstm']['tfidf']['weighted avg']['precision'][1])+'&'+str(results['lstm']['tfidf']['weighted avg']['recall'][0])+' $ \pm $ '+str(results['lstm']['tfidf']['weighted avg']['recall'][1])+'\\\ \n',
                              ' & BERT &'+str(results['lstm']['bert']['accuracy'][0])+' $ \pm $ '+str(results['lstm']['bert']['accuracy'][1])+'&'+str(results['lstm']['bert']['macro avg']['f1-score'][0])+' $ \pm $ '+str(results['lstm']['bert']['macro avg']['f1-score'][1])+'&'+str(results['lstm']['bert']['macro avg']['precision'][0])+' $ \pm $ '+str(results['lstm']['bert']['macro avg']['precision'][1])+'&'+str(results['lstm']['bert']['macro avg']['recall'][0])+' $ \pm $ '+str(results['lstm']['bert']['macro avg']['recall'][1])+'&'+str(results['lstm']['bert']['weighted avg']['f1-score'][0])+' $ \pm $ '+str(results['lstm']['bert']['weighted avg']['f1-score'][1])+'&'+str(results['lstm']['bert']['weighted avg']['precision'][0])+' $ \pm $ '+str(results['lstm']['bert']['weighted avg']['precision'][1])+'&'+str(results['lstm']['bert']['weighted avg']['recall'][0])+' $ \pm $ '+str(results['lstm']['bert']['weighted avg']['recall'][1])+'\\\ \n',
      ' \\hline \n',
      '\\multirow{4}{*}{MLP}    & W2V &'+str(results['mlp']['w2v']['accuracy'][0])+' $ \pm $ '+str(results['mlp']['w2v']['accuracy'][1])+'&'+str(results['mlp']['w2v']['macro avg']['f1-score'][0])+' $ \pm $ '+str(results['lstm']['w2v']['macro avg']['f1-score'][1])+'&'+str(results['lstm']['w2v']['macro avg']['precision'][0])+' $ \pm $ '+str(results['mlp']['w2v']['macro avg']['precision'][1])+'&'+str(results['mlp']['w2v']['macro avg']['recall'][0])+' $ \pm $ '+str(results['mlp']['w2v']['macro avg']['recall'][1])+'&'+str(results['mlp']['w2v']['weighted avg']['f1-score'][0])+' $ \pm $ '+str(results['mlp']['w2v']['weighted avg']['f1-score'][1])+'&'+str(results['mlp']['w2v']['weighted avg']['precision'][0])+' $ \pm $ '+str(results['mlp']['w2v']['weighted avg']['precision'][1])+'&'+str(results['mlp']['w2v']['weighted avg']['recall'][0])+' $ \pm $ '+str(results['mlp']['w2v']['weighted avg']['recall'][1])+'\\\ \n',
                              ' & Doc2Vec &'+str(results['mlp']['d2v']['accuracy'][0])+' $ \pm $ '+str(results['mlp']['d2v']['accuracy'][1])+'&'+str(results['mlp']['d2v']['macro avg']['f1-score'][0])+' $ \pm $ '+str(results['mlp']['d2v']['macro avg']['f1-score'][1])+'&'+str(results['mlp']['d2v']['macro avg']['precision'][0])+' $ \pm $ '+str(results['mlp']['d2v']['macro avg']['precision'][1])+'&'+str(results['mlp']['d2v']['macro avg']['recall'][0])+' $ \pm $ '+str(results['mlp']['d2v']['macro avg']['recall'][1])+'&'+str(results['mlp']['d2v']['weighted avg']['f1-score'][0])+' $ \pm $ '+str(results['mlp']['d2v']['weighted avg']['f1-score'][1])+'&'+str(results['mlp']['d2v']['weighted avg']['precision'][0])+' $ \pm $ '+str(results['mlp']['d2v']['weighted avg']['precision'][1])+'&'+str(results['mlp']['d2v']['weighted avg']['recall'][0])+' $ \pm $ '+str(results['mlp']['d2v']['weighted avg']['recall'][1])+'\\\ \n',
                              ' & TFIDF &'+str(results['mlp']['tfidf']['accuracy'][0])+' $ \pm $ '+str(results['mlp']['tfidf']['accuracy'][1])+'&'+str(results['mlp']['tfidf']['macro avg']['f1-score'][0])+' $ \pm $ '+str(results['mlp']['tfidf']['macro avg']['f1-score'][1])+'&'+str(results['mlp']['tfidf']['macro avg']['precision'][0])+' $ \pm $ '+str(results['mlp']['tfidf']['macro avg']['precision'][1])+'&'+str(results['mlp']['tfidf']['macro avg']['recall'][0])+' $ \pm $ '+str(results['mlp']['tfidf']['macro avg']['recall'][1])+'&'+str(results['mlp']['tfidf']['weighted avg']['f1-score'][0])+' $ \pm $ '+str(results['mlp']['tfidf']['weighted avg']['f1-score'][1])+'&'+str(results['mlp']['tfidf']['weighted avg']['precision'][0])+' $ \pm $ '+str(results['mlp']['tfidf']['weighted avg']['precision'][1])+'&'+str(results['mlp']['tfidf']['weighted avg']['recall'][0])+' $ \pm $ '+str(results['mlp']['tfidf']['weighted avg']['recall'][1])+'\\\ \n',
                              ' & BERT &'+str(results['mlp']['bert']['accuracy'][0])+' $ \pm $ '+str(results['mlp']['bert']['accuracy'][1])+'&'+str(results['mlp']['bert']['macro avg']['f1-score'][0])+' $ \pm $ '+str(results['mlp']['bert']['macro avg']['f1-score'][1])+'&'+str(results['mlp']['bert']['macro avg']['precision'][0])+' $ \pm $ '+str(results['mlp']['bert']['macro avg']['precision'][1])+'&'+str(results['mlp']['bert']['macro avg']['recall'][0])+' $ \pm $ '+str(results['mlp']['bert']['macro avg']['recall'][1])+'&'+str(results['mlp']['bert']['weighted avg']['f1-score'][0])+' $ \pm $ '+str(results['mlp']['bert']['weighted avg']['f1-score'][1])+'&'+str(results['mlp']['bert']['weighted avg']['precision'][0])+' $ \pm $ '+str(results['mlp']['bert']['weighted avg']['precision'][1])+'&'+str(results['mlp']['bert']['weighted avg']['recall'][0])+' $ \pm $ '+str(results['mlp']['bert']['weighted avg']['recall'][1])+'\\\ \n',
      ' \\hline \n',
      '\\multirow{4}{*}{XGBoost}    & W2V &'+str(results['xgboost']['w2v']['accuracy'][0])+' $ \pm $ '+str(results['xgboost']['w2v']['accuracy'][1])+'&'+str(results['xgboost']['w2v']['macro avg']['f1-score'][0])+' $ \pm $ '+str(results['lstm']['w2v']['macro avg']['f1-score'][1])+'&'+str(results['lstm']['w2v']['macro avg']['precision'][0])+' $ \pm $ '+str(results['xgboost']['w2v']['macro avg']['precision'][1])+'&'+str(results['xgboost']['w2v']['macro avg']['recall'][0])+' $ \pm $ '+str(results['xgboost']['w2v']['macro avg']['recall'][1])+'&'+str(results['xgboost']['w2v']['weighted avg']['f1-score'][0])+' $ \pm $ '+str(results['xgboost']['w2v']['weighted avg']['f1-score'][1])+'&'+str(results['xgboost']['w2v']['weighted avg']['precision'][0])+' $ \pm $ '+str(results['xgboost']['w2v']['weighted avg']['precision'][1])+'&'+str(results['xgboost']['w2v']['weighted avg']['recall'][0])+' $ \pm $ '+str(results['xgboost']['w2v']['weighted avg']['recall'][1])+'\\\ \n',
                              ' & Doc2Vec &'+str(results['xgboost']['d2v']['accuracy'][0])+' $ \pm $ '+str(results['xgboost']['d2v']['accuracy'][1])+'&'+str(results['xgboost']['d2v']['macro avg']['f1-score'][0])+' $ \pm $ '+str(results['xgboost']['d2v']['macro avg']['f1-score'][1])+'&'+str(results['xgboost']['d2v']['macro avg']['precision'][0])+' $ \pm $ '+str(results['xgboost']['d2v']['macro avg']['precision'][1])+'&'+str(results['xgboost']['d2v']['macro avg']['recall'][0])+' $ \pm $ '+str(results['xgboost']['d2v']['macro avg']['recall'][1])+'&'+str(results['xgboost']['d2v']['weighted avg']['f1-score'][0])+' $ \pm $ '+str(results['xgboost']['d2v']['weighted avg']['f1-score'][1])+'&'+str(results['xgboost']['d2v']['weighted avg']['precision'][0])+' $ \pm $ '+str(results['xgboost']['d2v']['weighted avg']['precision'][1])+'&'+str(results['xgboost']['d2v']['weighted avg']['recall'][0])+' $ \pm $ '+str(results['xgboost']['d2v']['weighted avg']['recall'][1])+'\\\ \n',
                              ' & TFIDF &'+str(results['xgboost']['tfidf']['accuracy'][0])+' $ \pm $ '+str(results['xgboost']['tfidf']['accuracy'][1])+'&'+str(results['xgboost']['tfidf']['macro avg']['f1-score'][0])+' $ \pm $ '+str(results['xgboost']['tfidf']['macro avg']['f1-score'][1])+'&'+str(results['xgboost']['tfidf']['macro avg']['precision'][0])+' $ \pm $ '+str(results['xgboost']['tfidf']['macro avg']['precision'][1])+'&'+str(results['xgboost']['tfidf']['macro avg']['recall'][0])+' $ \pm $ '+str(results['xgboost']['tfidf']['macro avg']['recall'][1])+'&'+str(results['xgboost']['tfidf']['weighted avg']['f1-score'][0])+' $ \pm $ '+str(results['xgboost']['tfidf']['weighted avg']['f1-score'][1])+'&'+str(results['xgboost']['tfidf']['weighted avg']['precision'][0])+' $ \pm $ '+str(results['xgboost']['tfidf']['weighted avg']['precision'][1])+'&'+str(results['xgboost']['tfidf']['weighted avg']['recall'][0])+' $ \pm $ '+str(results['xgboost']['tfidf']['weighted avg']['recall'][1])+'\\\ \n',
                              ' & BERT &'+str(results['xgboost']['bert']['accuracy'][0])+' $ \pm $ '+str(results['xgboost']['bert']['accuracy'][1])+'&'+str(results['xgboost']['bert']['macro avg']['f1-score'][0])+' $ \pm $ '+str(results['xgboost']['bert']['macro avg']['f1-score'][1])+'&'+str(results['xgboost']['bert']['macro avg']['precision'][0])+' $ \pm $ '+str(results['xgboost']['bert']['macro avg']['precision'][1])+'&'+str(results['xgboost']['bert']['macro avg']['recall'][0])+' $ \pm $ '+str(results['xgboost']['bert']['macro avg']['recall'][1])+'&'+str(results['xgboost']['bert']['weighted avg']['f1-score'][0])+' $ \pm $ '+str(results['xgboost']['bert']['weighted avg']['f1-score'][1])+'&'+str(results['xgboost']['bert']['weighted avg']['precision'][0])+' $ \pm $ '+str(results['xgboost']['bert']['weighted avg']['precision'][1])+'&'+str(results['xgboost']['bert']['weighted avg']['recall'][0])+' $ \pm $ '+str(results['xgboost']['bert']['weighted avg']['recall'][1])+'\\\ \n',
      '\\hline \n', 
      ' \end{tabular}% \n',
      '\label{tab:lab}% \n',
   '\end{table*}% \n')

\begin{table*}[h] 
 \centering 
 \caption{Aggregate analysis of evaluation metrics} 
  \begin{tabular}{c|c|c|ccc|ccc|c} 
  \hline 
 \multicolumn{3}{c}{} & \multicolumn{3}{c}{Macro averaging} & \multicolumn{3}{c}{Micro averaging} & \multicolumn{1}{c}{} \\ 
 \hline 
  Classifier & Feature extraction & Accuracy & F1 Score & Precision & Recall & F1 Score & Precision & Recall \\ 
  \hline 
 \multirow{4}{*}{LSTM}   & W2V/CNN &0.93 $ \pm $ 0.01&0.88 $ \pm $ 0.01&0.92 $ \pm $ 0.01&0.85 $ \pm $ 0.02&0.92 $ \pm $ 0.01&0.93 $ \pm $ 0.01&0.93 $ \pm $ 0.01\\ 
  & Doc2Vec &0.82 $ \pm $ 0.01&0.76 $ \pm $ 0.02&0.77 $ \pm $ 0.02&0.75 $ \pm $ 0.02&0.82 $ \pm $ 0.01&0.82 $ \pm $ 0.01&0.82 $ \pm $ 0.01\\ 
  & TFIDF &0.9 $ \pm $ 0.01&0.85 $ \pm $ 0.01&0.85 $ \pm $ 0.01&0.85 $ \pm $ 0.02&0.9 $ \pm $ 0.01&0.9 $ \pm $ 0.01&0.9 $ \pm $ 0.01\\ 
  & BERT &0.93 $ \pm $ 0.01&0.89 $ \pm $ 0.01&0.92 $ \pm $ 0.01&0.87 $ \pm $ 0.02&0.93 $ \pm $ 0.01&0.93 $ \pm $ 0.01&0.93 $ \pm $ 0.01\\ 
  \hline 
 \multirow{4}{*}{MLP