In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from collections import Counter

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from xgboost import XGBClassifier

from preproc3 import na, encode, split, binarize, shuffle_in_unison, scale
from imblearn.over_sampling import SMOTE

In [26]:
# explicitly require this experimental feature
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
# now you can import normally from ensemble
from sklearn.ensemble import HistGradientBoostingClassifier

In [3]:
# Folder holding the raw competition CSV files, and the two files read from it.
DATA = 'ugrin2020-vehiculo-usado-multiclase/'
TRAIN, TEST = (DATA + filename for filename in ('train.csv', 'test.csv'))

# Output folders: PREPROCESSED_DATA is the target of the (currently disabled)
# np.savez_compressed cache; RESULTS is presumably for result files — it is
# not used in the visible cells.
PREPROCESSED_DATA = 'preprocessed_data/'
RESULTS = 'results/'

In [4]:
# Load the raw training and test tables.
train = pd.read_csv(TRAIN)
test = pd.read_csv(TEST)

# The id field must not be used as a predictor: keep the test ids aside in
# `test_ids` and remove the column from both tables.
test_ids = test.pop('id')
train = train.drop(columns=['id'])

# Rename the 'Año' column to the ASCII name 'Anio' so it can be handled
# without trouble (e.g. via attribute access).
year_rename = {'Año': 'Anio'}
train = train.rename(columns=year_rename)
test = test.rename(columns=year_rename)

In [5]:
# Detach the target column 'Precio_cat' from the predictors: pop() returns the
# column and removes it from `train` in a single step.
train_label = train.pop('Precio_cat')

In [6]:
# Hold out 25% of the labelled rows for validation. The split is stratified on
# the target and seeded so the partition is the same on every run.
split_parts = train_test_split(
    train,
    train_label,
    test_size=0.25,
    stratify=train_label,
    random_state=42,
)
train2, val, train2_label, val_label = split_parts

In [7]:
# Re-attach the target column to the training split (the preprocessing
# helpers below receive the frame with 'Precio_cat' included).
# assign() builds a new DataFrame instead of writing into the frame returned
# by train_test_split, which pandas flags with SettingWithCopyWarning.
train2 = train2.assign(Precio_cat=train2_label)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [8]:
# Preprocessing pipeline for the train/validation split. The helpers na,
# encode, split, binarize, shuffle_in_unison and scale come from the project
# module preproc3; their exact behaviour is not visible in this notebook.

# Missing-value handling (presumably imputation, judging by the helper's
# name — TODO confirm in preproc3).
train2, val = na(train2, val)

# Temporarily attach the labels so that row filtering keeps features and
# labels aligned. assign() returns a new frame, which avoids the
# SettingWithCopyWarning raised by writing a column into `val` directly.
val = val.assign(label=val_label)

# Drop validation rows with Combustible == 'Electric' and any row that still
# has missing values.
# NOTE(review): the reason for excluding 'Electric' is not stated — confirm.
val = val[val.Combustible != 'Electric'].dropna()

# Detach the (now filtered) labels again.
val_label = val.pop('label')

train2, val = encode(train2, val)
train2, train2_label, val = split(train2, val)
train2, val = binarize(train2, val)

# Oversample the training split only; the validation split is left untouched.
train2, train2_label = SMOTE(random_state=25).fit_resample(train2, train2_label)

# NOTE(review): the return value is discarded, so this only has an effect if
# the helper shuffles its arguments in place — confirm in preproc3.
shuffle_in_unison(train2, train2_label)

train2, val = scale(train2, val)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [9]:
#np.savez_compressed(PREPROCESSED_DATA+'binScale-val', train2, train2_label, val, val_label)

In [29]:
# Empty score table for the HistGradientBoosting sweep in the next cell:
# max_iter, max_leaf_nodes, learning_rate and validation accuracy.
score_columns = ['iter', 'leaf', 'lr', 'acc']
results = pd.DataFrame(columns=score_columns)

In [30]:
# Manual grid search for HistGradientBoostingClassifier: every combination is
# fitted on the resampled training split and scored by accuracy on the
# validation split.
param_grid = {'max_iter': [75, 100, 150, 200, 300], 'max_leaf_nodes': [27, 29, 31, 33], 'learning_rate': [0.08, 0.1, 0.12]}
result_columns = ['iter', 'leaf', 'lr', 'acc']

# Collect one row per configuration and build the frame once at the end:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and it
# copied the whole table on every iteration.
rows = []
for iters in param_grid['max_iter']:
    for leaf in param_grid['max_leaf_nodes']:
        for lr in param_grid['learning_rate']:
            print(iters, leaf, lr)
            model = HistGradientBoostingClassifier(max_iter=iters, max_leaf_nodes=leaf, learning_rate=lr)
            model.fit(train2, train2_label)
            rows.append([iters, leaf, lr, accuracy_score(val_label, model.predict(val))])

# Add the new rows to whatever `results` already holds (same accumulation
# behaviour as before, so repeated or partial runs keep stacking up).
new_rows = pd.DataFrame(rows, columns=result_columns)
results = new_rows if results.empty else pd.concat([results, new_rows], ignore_index=True)

75 27 0.08
75 27 0.1
75 27 0.12
75 29 0.08
75 29 0.1
75 29 0.12
75 31 0.08
75 31 0.1
75 31 0.12
75 33 0.08
75 33 0.1
75 33 0.12
100 27 0.08
100 27 0.1
100 27 0.12
100 29 0.08
100 29 0.1
100 29 0.12
100 31 0.08
100 31 0.1
100 31 0.12
100 33 0.08
100 33 0.1
100 33 0.12
150 27 0.08
150 27 0.1
150 27 0.12
150 29 0.08
150 29 0.1
150 29 0.12
150 31 0.08
150 31 0.1
150 31 0.12
150 33 0.08
150 33 0.1
150 33 0.12
200 27 0.08
200 27 0.1
200 27 0.12
200 29 0.08
200 29 0.1
200 29 0.12
200 31 0.08
200 31 0.1
200 31 0.12
200 33 0.08
200 33 0.1
200 33 0.12
300 27 0.08
300 27 0.1
300 27 0.12
300 29 0.08
300 29 0.1
300 29 0.12
300 31 0.08
300 31 0.1
300 31 0.12
300 33 0.08
300 33 0.1
300 33 0.12


In [31]:
# Show the current `results` table in full (no row/column truncation),
# ordered from best to worst validation accuracy.
ranked_view = results.sort_values(by='acc', ascending=False)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(ranked_view)

Unnamed: 0,iter,leaf,lr,acc
7,75,31,0.1,0.837186
11,75,33,0.12,0.834171
19,100,31,0.1,0.833166
16,100,29,0.1,0.833166
2,75,27,0.12,0.832161
9,75,33,0.08,0.831156
14,100,27,0.12,0.831156
12,100,27,0.08,0.830151
31,150,31,0.1,0.830151
5,75,29,0.12,0.830151


In [13]:
# XGBoost classifier with library-default hyperparameters; only n_jobs is set
# (4 parallel workers).
model=XGBClassifier(n_jobs=4)

In [14]:
# Echo the estimator so its hyperparameters are shown in the cell output.
model

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=4, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)

In [20]:
# Empty score table for the XGBoost sweep in the next cell:
# n_estimators, max_depth and validation accuracy.
score_columns = ['n', 'd', 'acc']
results = pd.DataFrame(columns=score_columns)

In [24]:
# Manual grid search for XGBClassifier over the number of trees and the tree
# depth (None leaves max_depth at the library default). Each model is fitted
# on the resampled training split and scored on the validation split.
param_grid = {'n_estimators': [75, 100, 150, 200, 300, 400, 500], 'max_depth': [3, 8, 14, 26, None]}
result_columns = ['n', 'd', 'acc']

# Rows are gathered in a list and turned into a frame once: DataFrame.append
# was removed in pandas 2.0 and was quadratic when called inside a loop.
rows = []
for n in param_grid['n_estimators']:
    for d in param_grid['max_depth']:
        print(n, d)
        model = XGBClassifier(n_estimators=n, max_depth=d, n_jobs=4, eval_metric='mlogloss')
        model.fit(train2, train2_label)
        rows.append([n, d, accuracy_score(val_label, model.predict(val))])

# Keep accumulating into the existing `results` table, as the original did.
new_rows = pd.DataFrame(rows, columns=result_columns)
results = new_rows if results.empty else pd.concat([results, new_rows], ignore_index=True)

75 3




75 8
75 14
75 26
75 None
100 3
100 8
100 14
100 26
100 None
150 3
150 8
150 14
150 26
150 None
200 3
200 8
200 14
200 26
200 None
300 3
300 8
300 14
300 26
300 None
400 3
400 8
400 14
400 26
400 None
500 3
500 8
500 14
500 26
500 None


In [25]:
# Render the whole `results` table, sorted by descending validation accuracy,
# with pandas' row/column display limits lifted for this cell only.
ranked_view = results.sort_values(by='acc', ascending=False)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(ranked_view)

Unnamed: 0,n,d,acc
15,200,3.0,0.835176
50,200,3.0,0.835176
60,400,3.0,0.834171
25,400,3.0,0.834171
10,150,3.0,0.831156
45,150,3.0,0.831156
65,500,3.0,0.831156
30,500,3.0,0.831156
5,100,3.0,0.830151
40,100,3.0,0.830151


In [19]:
# Empty score table for the SVC sweep in the next cell:
# regularisation parameter C and validation accuracy.
score_columns = ['C', 'acc']
results = pd.DataFrame(columns=score_columns)

In [22]:
# Sweep the SVC regularisation parameter C; each model is fitted on the
# resampled training split and scored by accuracy on the validation split.
param_grid = {'C': [0.25, 0.5, 1, 2.5, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 60, 70]}
result_columns = ['C', 'acc']

# Gather the rows first and build the frame once: DataFrame.append no longer
# exists in pandas 2.0+ and re-copied the table on every call.
rows = []
for c in param_grid['C']:
    print(c)
    model = SVC(C=c)
    model.fit(train2, train2_label)
    rows.append([c, accuracy_score(val_label, model.predict(val))])

# Keep accumulating into the existing `results` table, as the original did.
new_rows = pd.DataFrame(rows, columns=result_columns)
results = new_rows if results.empty else pd.concat([results, new_rows], ignore_index=True)

0.25
0.5
1
2.5
5
10
15
20
25
30
35
40
45
50
60
70


In [23]:
# Full, untruncated view of `results`, highest validation accuracy first.
ranked_view = results.sort_values(by='acc', ascending=False)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(ranked_view)

Unnamed: 0,C,acc
10,40.0,0.79196
25,40.0,0.79196
23,30.0,0.790955
24,35.0,0.790955
9,30.0,0.790955
22,25.0,0.788945
21,20.0,0.788945
27,50.0,0.788945
13,70.0,0.788945
11,50.0,0.788945


In [8]:
# Empty score table for the MLP sweep in the next cell:
# hidden layer sizes, early-stopping flag, alpha and validation accuracy.
score_columns = ['shape', 'early', 'alpha', 'acc']
results = pd.DataFrame(columns=score_columns)

In [9]:
# Manual grid search for MLPClassifier over the hidden-layer layout, early
# stopping and the alpha penalty. Each model is fitted on the resampled
# training split and scored by accuracy on the validation split.
# NOTE: the original wrote the single-layer sizes as `(50)`, `(100)`, ... —
# that is just the int 50, not a 1-tuple — so they are spelled as plain ints
# here; the values passed to the estimator are unchanged.
param_grid = {
    'hidden_layer_sizes': [50, 100, 150, 200, 250, (50, 50), (100, 100), (150, 150), (200, 200), (250, 250)],
    'early_stopping': [True, False],
    'alpha': [0.00005, 0.0001, 0.00015],
}
result_columns = ['shape', 'early', 'alpha', 'acc']

# Rows are gathered in a list and turned into a frame once: DataFrame.append
# was removed in pandas 2.0 and was quadratic when called inside a loop.
rows = []
for s in param_grid['hidden_layer_sizes']:
    for early in param_grid['early_stopping']:
        for a in param_grid['alpha']:
            print(s, early, a)
            # Weight initialisation (and the early-stopping hold-out) are
            # random; a fixed seed, matching the one used for the train/val
            # split, makes the comparison repeatable across runs.
            model = MLPClassifier(hidden_layer_sizes=s, alpha=a, early_stopping=early, max_iter=1000, random_state=42)
            model.fit(train2, train2_label)
            rows.append([s, early, a, accuracy_score(val_label, model.predict(val))])

# Keep accumulating into the existing `results` table, as the original did.
new_rows = pd.DataFrame(rows, columns=result_columns)
results = new_rows if results.empty else pd.concat([results, new_rows], ignore_index=True)

50 True 5e-05
50 True 0.0001
50 True 0.00015
50 False 5e-05
50 False 0.0001
50 False 0.00015
100 True 5e-05
100 True 0.0001
100 True 0.00015
100 False 5e-05
100 False 0.0001
100 False 0.00015
150 True 5e-05
150 True 0.0001
150 True 0.00015
150 False 5e-05
150 False 0.0001
150 False 0.00015
200 True 5e-05
200 True 0.0001
200 True 0.00015
200 False 5e-05
200 False 0.0001
200 False 0.00015
250 True 5e-05
250 True 0.0001
250 True 0.00015
250 False 5e-05
250 False 0.0001
250 False 0.00015
(50, 50) True 5e-05
(50, 50) True 0.0001
(50, 50) True 0.00015
(50, 50) False 5e-05
(50, 50) False 0.0001
(50, 50) False 0.00015
(100, 100) True 5e-05
(100, 100) True 0.0001
(100, 100) True 0.00015
(100, 100) False 5e-05
(100, 100) False 0.0001
(100, 100) False 0.00015
(150, 150) True 5e-05
(150, 150) True 0.0001
(150, 150) True 0.00015
(150, 150) False 5e-05
(150, 150) False 0.0001
(150, 150) False 0.00015
(200, 200) True 5e-05
(200, 200) True 0.0001
(200, 200) True 0.00015
(200, 200) False 5e-05
(200, 20

In [10]:
# Display every row of `results`, best validation accuracy at the top; the
# display limits are lifted only inside this context manager.
ranked_view = results.sort_values(by='acc', ascending=False)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(ranked_view)

Unnamed: 0,shape,early,alpha,acc
7,100,True,0.0001,0.815075
2,50,True,0.00015,0.81005
25,250,True,0.0001,0.805025
58,"(250, 250)",False,0.0001,0.80402
24,250,True,5e-05,0.803015
55,"(250, 250)",True,0.0001,0.80201
30,"(50, 50)",True,5e-05,0.801005
22,200,False,0.0001,0.798995
28,250,False,0.0001,0.798995
23,200,False,0.00015,0.796985


In [10]:
# Empty score table for the GradientBoosting sweep below:
# n_estimators, learning_rate, subsample, max_depth and validation accuracy.
score_columns = ['n', 'lr', 's', 'd', 'acc']
results = pd.DataFrame(columns=score_columns)

In [17]:
# Manual grid search for GradientBoostingClassifier. Each model is fitted on
# the resampled training split and scored by accuracy on the validation split.
param_grid = {'n_estimators': [450, 500, 550, 600], 'learning_rate': [0.1, 0.125, 0.15, 0.175, 0.2], 'subsample': [0.8, 0.9], 'max_depth': [2, 3, 4]}
result_columns = ['n', 'lr', 's', 'd', 'acc']

# Rows are gathered in a list and turned into a frame once: DataFrame.append
# was removed in pandas 2.0 and was quadratic when called inside a loop.
rows = []
# The [3:4] slice restricts this run to the last n_estimators value (600);
# change the slice to sweep another part of the grid — results accumulate.
for n in param_grid['n_estimators'][3:4]:
    for lr in param_grid['learning_rate']:
        for s in param_grid['subsample']:
            for d in param_grid['max_depth']:
                print(n, lr, s, d)
                # subsample < 1 draws a random subset of rows for every tree;
                # a fixed seed, matching the one used for the train/val split,
                # makes the comparison repeatable across runs.
                model = GradientBoostingClassifier(n_estimators=n, learning_rate=lr, subsample=s, max_depth=d, random_state=42)
                model.fit(train2, train2_label)
                rows.append([n, lr, s, d, accuracy_score(val_label, model.predict(val))])

# Keep accumulating into the existing `results` table, as the original did.
new_rows = pd.DataFrame(rows, columns=result_columns)
results = new_rows if results.empty else pd.concat([results, new_rows], ignore_index=True)

600 0.1 0.8 2
600 0.1 0.8 3
600 0.1 0.8 4
600 0.1 0.9 2
600 0.1 0.9 3
600 0.1 0.9 4
600 0.125 0.8 2
600 0.125 0.8 3
600 0.125 0.8 4
600 0.125 0.9 2
600 0.125 0.9 3
600 0.125 0.9 4
600 0.15 0.8 2
600 0.15 0.8 3
600 0.15 0.8 4
600 0.15 0.9 2
600 0.15 0.9 3
600 0.15 0.9 4
600 0.175 0.8 2
600 0.175 0.8 3
600 0.175 0.8 4
600 0.175 0.9 2
600 0.175 0.9 3
600 0.175 0.9 4
600 0.2 0.8 2
600 0.2 0.8 3
600 0.2 0.8 4
600 0.2 0.9 2
600 0.2 0.9 3
600 0.2 0.9 4


In [11]:
# Untruncated view of `results`, sorted from highest to lowest accuracy.
ranked_view = results.sort_values(by='acc', ascending=False)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(ranked_view)

Unnamed: 0,n,lr,s,d,acc


In [12]:
# Empty score table for the LightGBM sweep below:
# n_estimators, learning_rate, num_leaves, max_depth and validation accuracy.
score_columns = ['n', 'lr', 'leaves', 'd', 'acc']
results = pd.DataFrame(columns=score_columns)

In [18]:
# Manual grid search for LGBMClassifier over learning rate, number of trees,
# leaves per tree and depth (-1 is the value LightGBM uses for "no limit").
# Each model is fitted on the resampled training split and scored by accuracy
# on the validation split.
param_grid = {'learning_rate': [0.07, 0.08, 0.1, 0.12], 'n_estimators': [125, 150, 200], 'num_leaves': [25, 27, 29, 31], 'max_depth': [3, 8, -1]}
result_columns = ['n', 'lr', 'leaves', 'd', 'acc']

# Rows are gathered in a list and turned into a frame once: DataFrame.append
# was removed in pandas 2.0 and was quadratic when called inside a loop.
rows = []
for n in param_grid['n_estimators']:
    for lr in param_grid['learning_rate']:
        for leaves in param_grid['num_leaves']:
            for d in param_grid['max_depth']:
                print(n, lr, leaves, d)
                model = LGBMClassifier(n_estimators=n, learning_rate=lr, num_leaves=leaves, max_depth=d)
                model.fit(train2, train2_label)
                rows.append([n, lr, leaves, d, accuracy_score(val_label, model.predict(val))])

# Keep accumulating into the existing `results` table, as the original did.
new_rows = pd.DataFrame(rows, columns=result_columns)
results = new_rows if results.empty else pd.concat([results, new_rows], ignore_index=True)

125 0.07 25 3
125 0.07 25 8
125 0.07 25 -1
125 0.07 27 3
125 0.07 27 8
125 0.07 27 -1
125 0.07 29 3
125 0.07 29 8
125 0.07 29 -1
125 0.07 31 3
125 0.07 31 8
125 0.07 31 -1
125 0.08 25 3
125 0.08 25 8
125 0.08 25 -1
125 0.08 27 3
125 0.08 27 8
125 0.08 27 -1
125 0.08 29 3
125 0.08 29 8
125 0.08 29 -1
125 0.08 31 3
125 0.08 31 8
125 0.08 31 -1
125 0.1 25 3
125 0.1 25 8
125 0.1 25 -1
125 0.1 27 3
125 0.1 27 8
125 0.1 27 -1
125 0.1 29 3
125 0.1 29 8
125 0.1 29 -1
125 0.1 31 3
125 0.1 31 8
125 0.1 31 -1
125 0.12 25 3
125 0.12 25 8
125 0.12 25 -1
125 0.12 27 3
125 0.12 27 8
125 0.12 27 -1
125 0.12 29 3
125 0.12 29 8
125 0.12 29 -1
125 0.12 31 3
125 0.12 31 8
125 0.12 31 -1
150 0.07 25 3
150 0.07 25 8
150 0.07 25 -1
150 0.07 27 3
150 0.07 27 8
150 0.07 27 -1
150 0.07 29 3
150 0.07 29 8
150 0.07 29 -1
150 0.07 31 3
150 0.07 31 8
150 0.07 31 -1
150 0.08 25 3
150 0.08 25 8
150 0.08 25 -1
150 0.08 27 3
150 0.08 27 8
150 0.08 27 -1
150 0.08 29 3
150 0.08 29 8
150 0.08 29 -1
150 0.08 31 3
150 0.08 

In [19]:
# Show all rows and columns of `results`, ranked by validation accuracy
# (best first).
ranked_view = results.sort_values(by='acc', ascending=False)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(ranked_view)

Unnamed: 0,n,lr,leaves,d,acc
97,125,0.08,27,8,0.846231
55,125,0.08,27,8,0.846231
94,125,0.08,25,8,0.842211
145,150,0.08,27,8,0.842211
28,100,0.08,27,8,0.839196
82,125,0.07,25,8,0.838191
219,200,0.12,29,3,0.838191
130,150,0.07,25,8,0.838191
222,200,0.12,31,3,0.838191
213,200,0.12,25,3,0.838191


In [42]:
# Refit the current estimator on the resampled training split and report its
# accuracy on the validation split.
# NOTE(review): `model` is whatever estimator was assigned last in the kernel;
# this cell does not construct it, so the result depends on execution order.
model.fit(train2, train2_label)
pred = model.predict(val)
accuracy_score(y_true=val_label, y_pred=pred)

0.8329938900203666

In [9]:
# 5-fold cross-validation of the current `model`; prints the per-fold scores
# followed by their mean.
# NOTE(review): `label` is not assigned in any cell visible here (the target
# is stored as `train_label`), and `train` is never passed through the
# preprocessing steps in the visible cells. This cell presumably ran against
# a different kernel state — confirm which data it should use.
scores = cross_val_score(model, X=train, y=label, cv=5)
print(scores, np.mean(scores), sep='\n')

[0.90575342 0.91506849 0.91123288 0.92767123 0.91178082]
0.9143013698630137
