In [85]:
from collections import Counter
from itertools import product

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

from preproc3 import na, encode, split, binarize, shuffle_in_unison, scale

In [72]:
# Folder holding the competition data, and the two CSV files read from it.
DATA = 'ugrin2020-vehiculo-usado-multiclase/'
TRAIN = f'{DATA}train.csv'
TEST = f'{DATA}test.csv'

# Output folder prefixes: preprocessed arrays and generated result files.
PREPROCESSED_DATA = 'preprocessed_data/'
RESULTS = 'results/'

In [73]:
# Load the training and test sets.
train = pd.read_csv(TRAIN)
test = pd.read_csv(TEST)

# The 'id' field must not be used for prediction: remove it from both frames,
# keeping the test ids aside (they are needed to build the submission file).
test_ids = test.pop('id')
train = train.drop(columns='id')

# Rename the 'Año' column to 'Anio' so it can be handled correctly.
train = train.rename(columns={'Año': 'Anio'})
test = test.rename(columns={'Año': 'Anio'})

In [75]:
# Take the target column out of the feature frame; pop() returns the column
# and removes it from `train` in a single step.
train_label = train.pop('Precio_cat')

In [76]:
# Hold out 25% of the rows as a validation set, stratified on the label,
# with a fixed seed so the split is repeatable.
train2, val, train2_label, val_label = train_test_split(
    train,
    train_label,
    test_size=0.25,
    stratify=train_label,
    random_state=42,
)

In [77]:
# Put the label back as a column of the training split.
# assign() builds a new DataFrame instead of writing into `train2`, which was
# derived from `train` by train_test_split; the direct column assignment is
# what raised pandas' SettingWithCopyWarning here.
train2 = train2.assign(Precio_cat=train2_label)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [79]:
# Preprocess the train/validation split with the helpers imported from preproc3.
# NOTE(review): the helpers' behaviour is not visible in this notebook; the
# comments below describe only what this cell does with their results.
train2, val = na(train2, val)

# Attach the labels to the validation frame so that dropping rows keeps
# features and labels aligned. assign() returns a new frame, which avoids the
# SettingWithCopyWarning raised by assigning a column directly on `val`.
val = val.assign(label=val_label)

# Discard validation rows whose Combustible is 'Electric', then any row that
# still has missing values.
# NOTE(review): presumably 'Electric' cannot be handled by the later steps - confirm.
val = val[val.Combustible != 'Electric'].dropna()

# Separate the (now filtered) labels from the validation features again.
val_label = val.pop('label')

train2, val = encode(train2, val)
train2, train2_label, val = split(train2, val)
train2, val = binarize(train2, val)

# Oversample the training split only; the validation split is left as is.
train2, train2_label = SMOTE(random_state=25).fit_resample(train2, train2_label)

# NOTE(review): the return value is discarded, so this only has an effect if
# shuffle_in_unison shuffles its arguments in place - confirm in preproc3.
shuffle_in_unison(train2, train2_label)
train2, val = scale(train2, val)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [80]:
#np.savez_compressed(PREPROCESSED_DATA+'binScale-val', train2, train2_label, val, val_label)

In [101]:
# Manual grid search: fit one GradientBoostingClassifier per hyper-parameter
# combination on the training split and record its accuracy on the
# validation split.
param_grid = {
    'n_estimators': [450, 500, 550, 600],
    'learning_rate': [0.1, 0.125, 0.15, 0.175, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'max_depth': [2, 3, 4, 5, 6],
}
result_columns = ['n', 'lr', 's', 'd', 'acc']

# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration.
records = []
try:
    # product() visits the combinations in the same order as four nested loops
    # (n_estimators outermost, max_depth innermost).
    for n, lr, s, d in product(
        param_grid['n_estimators'],
        param_grid['learning_rate'],
        param_grid['subsample'],
        param_grid['max_depth'],
    ):
        print(n, lr, s, d)
        # Fixed seed: with subsample < 1 the fit is stochastic, so without it
        # the accuracies would not be comparable between runs.
        model = GradientBoostingClassifier(
            n_estimators=n,
            learning_rate=lr,
            subsample=s,
            max_depth=d,
            random_state=42,
        )
        model.fit(train2, train2_label)
        acc = accuracy_score(val_label, model.predict(val))
        records.append({'n': n, 'lr': lr, 's': s, 'd': d, 'acc': acc})
finally:
    # Build the table even if the (long) search is interrupted, so the
    # combinations evaluated so far are not lost.
    results = pd.DataFrame(records, columns=result_columns)

450 0.1 0.8 2


Unnamed: 0,n,lr,s,d,acc
0,450,0.1,0.8,2,0.60402


In [102]:
# Show the evaluated combinations, best validation accuracy first.
results.sort_values('acc', ascending=False)

Unnamed: 0,n,lr,s,d,acc
0,450,0.1,0.8,2,0.60402


In [42]:
# Refit `model` on the training split and display its validation accuracy.
# NOTE(review): `model` is whatever estimator earlier cells left bound (after
# the grid search, the last combination tried) - confirm this is the intended one.
pred = model.fit(train2, train2_label).predict(val)
accuracy_score(val_label, pred)

0.8329938900203666

In [9]:
# 5-fold cross-validation of `model`; prints the per-fold scores and their mean.
# NOTE(review): `label` is not assigned in any visible cell, and `train` as left
# by the visible cells is the raw, un-preprocessed frame. Presumably this cell
# ran against variables from an earlier kernel state - confirm before re-running.
scores = cross_val_score(model, train, label, cv=5)
print(scores)
print(scores.mean())

[0.90575342 0.91506849 0.91123288 0.92767123 0.91178082]
0.9143013698630137


## Generar fichero de Kaggle

In [11]:
# Fit on the full training data.
# NOTE(review): `label` is not assigned in any visible cell, and `train`/`test`
# as left by the visible cells are the raw, un-preprocessed frames. Presumably
# this cell ran against variables from an earlier kernel state - confirm.
model.fit(train, label)

# Predict the test set and turn every predicted class into a plain int.
predict = [int(p) for p in model.predict(test)]

# Write the submission file: one row per test id with its predicted Precio_cat.
df_result = pd.DataFrame({'id': test_ids, 'Precio_cat': predict})
df_result.to_csv(RESULTS + "try15.csv", index=False)