# Tuning classifiers (XGBoost)

### Setup

In [1]:
%run -i 'random_state.py'
from packages import *
from clean_functions import *
from tokenizer import *
from tuners import *

Defining the general hyperparameter grid

In [2]:
# Search space reused by every embedding/XGBoost section of this notebook.
hyper_xgboost = dict(
    # Candidate values for the hyperparameters being tuned.
    max_depth=[3, 5, 7, 9],
    learning_rate=[0.1, 0.5, 1.0],
    gamma=[0, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    reg_lambda=[10.0, 50.0, 100.0, 300.0, 500.0, 1000.0],
    # Single-valued placeholder columns; presumably overwritten with the
    # results of each run by tune_xgboost -- TODO confirm in tuners.py.
    n_est=[0],
    score=[0],
    lower_ci=[0],
    upper_ci=[0],
)

# W2V/XGBoost

### Getting data ready

Loading data

In [3]:
# Feature array and label array for the W2V experiment.
X = np.load('data/X_w2v.npy')
y = np.load('data/y_w2v.npy')

In [4]:
# Average the vectors along axis 2, counting only the entries whose first
# component is non-zero (presumably all-zero rows are padding -- TODO confirm).
# keepdims leaves a trailing length-1 axis so N broadcasts against the summed X;
# the floor of 1 avoids dividing by zero where nothing was counted.
N = np.maximum(np.sum(X[:, :, :, 0] != 0, axis=2, keepdims=True), 1)

# Mean over axis 2, then flatten everything after the first axis into one
# feature row per sample.
X = (X.sum(axis=2) / N).reshape(len(X), -1)

Turning y into numeric:

In [5]:
# Integer codes for the three status labels. They are 1-based here; the split
# cell that follows subtracts 1 before the labels are used.
encode = {'H:Arquivado': 1, 'H:Ativo': 2, 'H:Suspenso': 3}
# Inverse mapping, derived from `encode` so the two cannot drift apart.
# NOTE(review): its keys are the 1-based codes, not the 0-based labels
# produced by the split cell -- add 1 before decoding predictions.
decode = {code: label for label, code in encode.items()}

# Build a fresh integer array rather than writing ints back into the loaded
# label array one element at a time (which relies on an implicit cast to the
# array's own dtype). An unknown label still raises KeyError.
y = np.array([encode[label] for label in y])

Splitting the dataset into train, validation and test sets:

In [6]:
# Shift the 1-based class codes down by one so the labels start at 0.
y = y.astype(int) - 1

# First split: hold out 30% of the samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=random_seed
)
# Second split: two thirds of the hold-out stay as the test set,
# the remaining third becomes the validation set.
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=2/3, random_state=random_seed
)

X_train.shape, y_train.shape

((4514, 500), (4514,))

### Tuning classification model

Grid

In [7]:
# Expand the shared search space into a table of configurations and fix the
# column order: tuned hyperparameters first, result placeholders last.
# (The recorded output has 50 rows, so expand_grid presumably samples from the
# full product using random_seed -- TODO confirm in tuners.py.)
hyper = expand_grid(hyper_xgboost.copy(), random_seed=random_seed)[[
    'max_depth', 'learning_rate', 'gamma', 'reg_lambda',
    'n_est', 'score', 'lower_ci', 'upper_ci',
]]

hyper.shape

(50, 8)

Search

In [8]:
%%time
# Run the search over the configurations in `hyper`, passing the training and
# validation splits plus the seed; the returned table replaces `hyper`.
# NOTE(review): presumably fills n_est/score/lower_ci/upper_ci per row -- confirm in tuners.py.
hyper=tune_xgboost(hyper, X_train, y_train, X_val, y_val, random_seed)
# Persist the results for the W2V features (the path has no file extension).
hyper.to_csv('hyper/hyper_xgboost_w2v')

100%|██████████| 50/50 [18:17<00:00, 21.96s/it]

CPU times: user 1h 46min 28s, sys: 17.3 s, total: 1h 46min 45s
Wall time: 18min 18s





In [9]:
# Reorder the rows by ascending `score` and show the last (highest-scoring) 20.
hyper.iloc[np.argsort(hyper['score'])].tail(20)

Unnamed: 0,max_depth,learning_rate,gamma,reg_lambda,n_est,score,lower_ci,upper_ci
43,9,0.5,0.0001,50.0,23,0.91938,0.898369,0.940391
28,9,0.1,0.0,100.0,97,0.91938,0.898369,0.940391
29,9,0.5,0.0,50.0,23,0.91938,0.898369,0.940391
6,9,1.0,0.1,1000.0,32,0.91938,0.898369,0.940391
17,3,0.1,0.1,50.0,127,0.91938,0.898369,0.940391
46,5,0.1,0.01,100.0,122,0.92093,0.900105,0.941755
0,7,0.5,0.0,1000.0,73,0.92093,0.900105,0.941755
27,5,1.0,0.1,300.0,41,0.92093,0.900105,0.941755
11,9,1.0,0.1,100.0,29,0.92093,0.900105,0.941755
7,7,1.0,0.0,10.0,34,0.92093,0.900105,0.941755


# BERT/XGBoost

### Getting data ready

Loading data:

In [10]:
# Feature array and label array for the BERT experiment.
X = np.load('data/X_bert.npy')
y = np.load('data/y_bert.npy')

In [11]:
# Flatten everything after the first axis so each sample is one feature row.
X = X.reshape(len(X), -1)

Turning y into numeric:

In [12]:
# Integer codes for the three status labels. They are 1-based here; the split
# cell that follows subtracts 1 before the labels are used.
encode = {'H:Arquivado': 1, 'H:Ativo': 2, 'H:Suspenso': 3}
# Inverse mapping, derived from `encode` so the two cannot drift apart.
# NOTE(review): its keys are the 1-based codes, not the 0-based labels
# produced by the split cell -- add 1 before decoding predictions.
decode = {code: label for label, code in encode.items()}

# Build a fresh integer array rather than writing ints back into the loaded
# label array one element at a time (which relies on an implicit cast to the
# array's own dtype). An unknown label still raises KeyError.
y = np.array([encode[label] for label in y])

Splitting the dataset into train, validation and test sets:

In [13]:
# Shift the 1-based class codes down by one so the labels start at 0.
y = y.astype(int) - 1

# First split: hold out 30% of the samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=random_seed
)
# Second split: two thirds of the hold-out stay as the test set,
# the remaining third becomes the validation set.
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=2/3, random_state=random_seed
)

X_train.shape, y_train.shape

((4514, 3840), (4514,))

### Tuning classification model

Grid

In [14]:
# Expand the shared search space into a table of configurations and fix the
# column order: tuned hyperparameters first, result placeholders last.
# (The recorded output has 50 rows, so expand_grid presumably samples from the
# full product using random_seed -- TODO confirm in tuners.py.)
hyper = expand_grid(hyper_xgboost.copy(), random_seed=random_seed)[[
    'max_depth', 'learning_rate', 'gamma', 'reg_lambda',
    'n_est', 'score', 'lower_ci', 'upper_ci',
]]

hyper.shape

(50, 8)

Search

In [15]:
# Run the search over the configurations in `hyper`, passing the training and
# validation splits plus the seed; the returned table replaces `hyper`.
# NOTE(review): presumably fills n_est/score/lower_ci/upper_ci per row -- confirm in tuners.py.
# NOTE(review): unlike the other search cells in this notebook, this one is not
# timed with %%time, so only the progress-bar duration is recorded.
hyper=tune_xgboost(hyper, X_train, y_train, X_val, y_val, random_seed)
# Persist the results for the BERT features (the path has no file extension).
hyper.to_csv('hyper/hyper_xgboost_bert')

100%|██████████| 50/50 [1:04:54<00:00, 77.89s/it] 


In [16]:
# Reorder the rows by ascending `score` and show the last (highest-scoring) 20.
hyper.iloc[np.argsort(hyper['score'])].tail(20)

Unnamed: 0,max_depth,learning_rate,gamma,reg_lambda,n_est,score,lower_ci,upper_ci
42,7,1.0,0.01,300.0,30,0.91938,0.898369,0.940391
23,9,1.0,0.0,500.0,10,0.92093,0.900105,0.941755
11,9,1.0,0.1,100.0,22,0.922481,0.901843,0.943119
15,7,0.5,0.0001,300.0,32,0.922481,0.901843,0.943119
43,9,0.5,0.0001,50.0,25,0.922481,0.901843,0.943119
35,9,1.0,0.0,300.0,22,0.922481,0.901843,0.943119
1,5,0.5,1e-05,500.0,59,0.922481,0.901843,0.943119
27,5,1.0,0.1,300.0,16,0.922481,0.901843,0.943119
41,9,0.5,1.0,10.0,11,0.924031,0.903584,0.944478
49,7,0.5,0.0001,50.0,26,0.924031,0.903584,0.944478


# Doc2Vec/XGBoost

### Getting data ready

Loading data

In [17]:
# Feature array and label array for the Doc2Vec experiment.
X = np.load('data/X_d2v.npy')
y = np.load('data/y_d2v.npy')

In [18]:
# Flatten everything after the first axis so each sample is one feature row.
X = X.reshape(len(X), -1)

Turning y into numeric:

In [19]:
# Integer codes for the three status labels. They are 1-based here; the split
# cell that follows subtracts 1 before the labels are used.
encode = {'H:Arquivado': 1, 'H:Ativo': 2, 'H:Suspenso': 3}
# Inverse mapping, derived from `encode` so the two cannot drift apart.
# NOTE(review): its keys are the 1-based codes, not the 0-based labels
# produced by the split cell -- add 1 before decoding predictions.
decode = {code: label for label, code in encode.items()}

# Build a fresh integer array rather than writing ints back into the loaded
# label array one element at a time (which relies on an implicit cast to the
# array's own dtype). An unknown label still raises KeyError.
y = np.array([encode[label] for label in y])

Splitting the dataset into train, validation and test sets:

In [20]:
# Shift the 1-based class codes down by one so the labels start at 0.
y = y.astype(int) - 1

# First split: hold out 30% of the samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=random_seed
)
# Second split: two thirds of the hold-out stay as the test set,
# the remaining third becomes the validation set.
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=2/3, random_state=random_seed
)

X_train.shape, y_train.shape

((4514, 500), (4514,))

### Tuning classification model

Grid

In [21]:
# Expand the shared search space into a table of configurations and fix the
# column order: tuned hyperparameters first, result placeholders last.
# (The recorded output has 50 rows, so expand_grid presumably samples from the
# full product using random_seed -- TODO confirm in tuners.py.)
hyper = expand_grid(hyper_xgboost.copy(), random_seed=random_seed)[[
    'max_depth', 'learning_rate', 'gamma', 'reg_lambda',
    'n_est', 'score', 'lower_ci', 'upper_ci',
]]

hyper.shape

(50, 8)

Search

In [22]:
%%time
# Run the search over the configurations in `hyper`, passing the training and
# validation splits plus the seed; the returned table replaces `hyper`.
# NOTE(review): presumably fills n_est/score/lower_ci/upper_ci per row -- confirm in tuners.py.
hyper=tune_xgboost(hyper, X_train, y_train, X_val, y_val, random_seed)
# Persist the results for the Doc2Vec features (the path has no file extension).
hyper.to_csv('hyper/hyper_xgboost_d2v')

100%|██████████| 50/50 [09:10<00:00, 11.02s/it]

CPU times: user 1h 12min 51s, sys: 958 ms, total: 1h 12min 52s
Wall time: 9min 10s





In [23]:
# Reorder the rows by ascending `score` and show the last (highest-scoring) 20.
hyper.iloc[np.argsort(hyper['score'])].tail(20)

Unnamed: 0,max_depth,learning_rate,gamma,reg_lambda,n_est,score,lower_ci,upper_ci
5,9,0.5,1.0,50.0,44,0.874419,0.848845,0.899993
11,9,1.0,0.1,100.0,16,0.877519,0.852218,0.90282
27,5,1.0,0.1,300.0,16,0.877519,0.852218,0.90282
12,5,0.1,0.0,50.0,89,0.877519,0.852218,0.90282
7,7,1.0,0.0,10.0,37,0.877519,0.852218,0.90282
35,9,1.0,0.0,300.0,27,0.87907,0.853907,0.904233
19,3,0.5,0.1,50.0,44,0.87907,0.853907,0.904233
9,5,0.1,0.1,50.0,94,0.87907,0.853907,0.904233
22,5,1.0,1e-05,300.0,38,0.882171,0.857289,0.907053
6,9,1.0,0.1,1000.0,82,0.882171,0.857289,0.907053


# TFIDF/XGBoost

### Getting data ready

Loading data:

In [24]:
# Feature array and label array for the TFIDF experiment.
X = np.load('data/X_tfidf.npy')
y = np.load('data/y_tfidf.npy')

In [25]:
# Flatten everything after the first axis so each sample is one feature row.
X = X.reshape(len(X), -1)

Turning y into numeric:

In [26]:
# Integer codes for the three status labels. They are 1-based here; the split
# cell that follows subtracts 1 before the labels are used.
encode = {'H:Arquivado': 1, 'H:Ativo': 2, 'H:Suspenso': 3}
# Inverse mapping, derived from `encode` so the two cannot drift apart.
# NOTE(review): its keys are the 1-based codes, not the 0-based labels
# produced by the split cell -- add 1 before decoding predictions.
decode = {code: label for label, code in encode.items()}

# Build a fresh integer array rather than writing ints back into the loaded
# label array one element at a time (which relies on an implicit cast to the
# array's own dtype). An unknown label still raises KeyError.
y = np.array([encode[label] for label in y])

Splitting the dataset into train, validation and test sets:

In [27]:
# Shift the 1-based class codes down by one so the labels start at 0.
y = y.astype(int) - 1

# First split: hold out 30% of the samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=random_seed
)
# Second split: two thirds of the hold-out stay as the test set,
# the remaining third becomes the validation set.
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=2/3, random_state=random_seed
)

X_train.shape, y_train.shape

((4514, 20000), (4514,))

### Tuning classification model

Grid

In [28]:
# Expand the shared search space into a table of configurations and fix the
# column order: tuned hyperparameters first, result placeholders last.
# (The recorded output has 50 rows, so expand_grid presumably samples from the
# full product using random_seed -- TODO confirm in tuners.py.)
hyper = expand_grid(hyper_xgboost.copy(), random_seed=random_seed)[[
    'max_depth', 'learning_rate', 'gamma', 'reg_lambda',
    'n_est', 'score', 'lower_ci', 'upper_ci',
]]

hyper.shape

(50, 8)

Search

In [29]:
%%time
# Run the search over the configurations in `hyper`, passing the training and
# validation splits plus the seed; the returned table replaces `hyper`.
# NOTE(review): presumably fills n_est/score/lower_ci/upper_ci per row -- confirm in tuners.py.
hyper=tune_xgboost(hyper, X_train, y_train, X_val, y_val, random_seed)
# Persist the results for the TFIDF features (the path has no file extension).
hyper.to_csv('hyper/hyper_xgboost_tfidf')

100%|██████████| 50/50 [1:28:35<00:00, 106.31s/it]

CPU times: user 11h 35min 49s, sys: 40.7 s, total: 11h 36min 30s
Wall time: 1h 28min 35s





In [30]:
# Reorder the rows by ascending `score` and show the last (highest-scoring) 20.
hyper.iloc[np.argsort(hyper['score'])].tail(20)

Unnamed: 0,max_depth,learning_rate,gamma,reg_lambda,n_est,score,lower_ci,upper_ci
36,9,0.5,1.0,100.0,15,0.906977,0.88456,0.929394
5,9,0.5,1.0,50.0,14,0.908527,0.886279,0.930775
38,7,0.1,0.1,10.0,50,0.908527,0.886279,0.930775
22,5,1.0,1e-05,300.0,53,0.911628,0.889723,0.933533
19,3,0.5,0.1,50.0,52,0.911628,0.889723,0.933533
45,5,1.0,0.01,50.0,21,0.911628,0.889723,0.933533
27,5,1.0,0.1,300.0,52,0.911628,0.889723,0.933533
43,9,0.5,0.0001,50.0,36,0.914729,0.893175,0.936283
2,9,1.0,0.0001,50.0,18,0.914729,0.893175,0.936283
29,9,0.5,0.0,50.0,34,0.916279,0.894904,0.937654
