## Imports

In [1]:
import math
import numpy as np
import pandas
from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.metrics import confusion_matrix, accuracy_score

# Preparando el dataset de entrenamiento

In [2]:
data_train = pandas.read_csv('data/perrosTrainAllComplete.csv')

data_train

Unnamed: 0,Mascota,Edad,Tamaño,Sexo,Patron de pelaje,Color de pelaje 1,Color de pelaje 2,Color de pelaje 3,Largo de pelaje,Color de ojos,Largo de hocico,Largo de cola,Largo de orejas,Tipo de orejas
0,1,Adulto,Mediano,Macho,Bicolor,Blanco,Negro,,Corto,Marron oscuro,Corto,Corto,Cortas,Caidas
1,2,Cachorro,Mediano,Hembra,Bicolor,Blanco,Marron,,Corto,Marron oscuro,Largo,Largo,Cortas,Caidas
2,3,Adulto,Mediano,Macho,Liso,Negro,,,Corto,Marron oscuro,Largo,Largo,Largas,Paradas
3,4,Adulto,Mediano,Hembra,Liso,Dorado,,,Corto,Marron oscuro,Largo,Largo,Cortas,Paradas
4,5,Cachorro,Chico,Hembra,Liso,Negro,,,Corto,Marron oscuro,Corto,Corto,Cortas,Caidas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,1910,Adulto,Chico,Macho,Bicolor,Dorado,Blanco,,Corto,Marron oscuro,Mediano,Mediano,Mediano,Caidas
316,1919,Cachorro,Mediano,Macho,Tricolor,Negro,Blanco,Marron,Largo,Marron oscuro,Corto,Corto,Mediano,Caidas
317,1940,Cachorro,Mediano,Macho,Liso,Marron,,,Corto,Marron claro,Largo,Mediano,Cortas,Caidas
318,1950,Adulto,Chico,Hembra,Bicolor,Negro,Marron,,Largo,Marron oscuro,Mediano,Corto,Largas,Caidas


### Obtenemos la cantidad de valores vacios

In [3]:
null_value_stats = data_train.isnull().sum(axis=0)
null_value_stats[null_value_stats != 0]

Color de pelaje 2    129
Color de pelaje 3    291
dtype: int64

### Lleno los nulos con strings vacios y quito la columna "Mascota"

In [4]:
data_train.fillna("NaN", inplace=True)
prepared_data_train = data_train.drop('Mascota', axis=1)
prepared_data_train

Unnamed: 0,Edad,Tamaño,Sexo,Patron de pelaje,Color de pelaje 1,Color de pelaje 2,Color de pelaje 3,Largo de pelaje,Color de ojos,Largo de hocico,Largo de cola,Largo de orejas,Tipo de orejas
0,Adulto,Mediano,Macho,Bicolor,Blanco,Negro,,Corto,Marron oscuro,Corto,Corto,Cortas,Caidas
1,Cachorro,Mediano,Hembra,Bicolor,Blanco,Marron,,Corto,Marron oscuro,Largo,Largo,Cortas,Caidas
2,Adulto,Mediano,Macho,Liso,Negro,,,Corto,Marron oscuro,Largo,Largo,Largas,Paradas
3,Adulto,Mediano,Hembra,Liso,Dorado,,,Corto,Marron oscuro,Largo,Largo,Cortas,Paradas
4,Cachorro,Chico,Hembra,Liso,Negro,,,Corto,Marron oscuro,Corto,Corto,Cortas,Caidas
...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,Adulto,Chico,Macho,Bicolor,Dorado,Blanco,,Corto,Marron oscuro,Mediano,Mediano,Mediano,Caidas
316,Cachorro,Mediano,Macho,Tricolor,Negro,Blanco,Marron,Largo,Marron oscuro,Corto,Corto,Mediano,Caidas
317,Cachorro,Mediano,Macho,Liso,Marron,,,Corto,Marron claro,Largo,Mediano,Cortas,Caidas
318,Adulto,Chico,Hembra,Bicolor,Negro,Marron,,Largo,Marron oscuro,Mediano,Corto,Largas,Caidas


### Imprimo los tipos de cada columna

In [5]:
print(prepared_data_train.dtypes)


Edad                 object
Tamaño               object
Sexo                 object
Patron de pelaje     object
Color de pelaje 1    object
Color de pelaje 2    object
Color de pelaje 3    object
Largo de pelaje      object
Color de ojos        object
Largo de hocico      object
Largo de cola        object
Largo de orejas      object
Tipo de orejas       object
dtype: object


### Obtengo la lista de etiquetas de las categorias

In [6]:
dataset_labels = prepared_data_train.columns.to_list()
dataset_labels

['Edad',
 'Tamaño',
 'Sexo',
 'Patron de pelaje',
 'Color de pelaje 1',
 'Color de pelaje 2',
 'Color de pelaje 3',
 'Largo de pelaje',
 'Color de ojos',
 'Largo de hocico',
 'Largo de cola',
 'Largo de orejas',
 'Tipo de orejas']

In [7]:
mascotas_train_ids = data_train.Mascota
print(mascotas_train_ids)

0         1
1         2
2         3
3         4
4         5
       ... 
315    1910
316    1919
317    1940
318    1950
319    1954
Name: Mascota, Length: 320, dtype: int64


Obtengo las categorias en base a las que no son numericas, en este caso todas son categorias

In [8]:
categorical_features_indices = np.where(prepared_data_train.dtypes != np.float)[0]
categorical_features_indices

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

# Preparando el dataset de testeo

In [10]:
dataset_test = pandas.read_csv('./data/perrosTestComplete.csv')
dataset_test.head()

null_value_stats = dataset_test.isnull().sum(axis=0)
null_value_stats[null_value_stats != 0]

dataset_test.fillna("NaN", inplace=True)
prepared_dataset_test = dataset_test.drop('Mascota', axis=1)
prepared_dataset_test.head()

print(prepared_dataset_test.dtypes)

mascotas_test_ids = dataset_test.Mascota
mascotas_test_ids.head()

Edad                 object
Tamaño               object
Sexo                 object
Patron de pelaje     object
Color de pelaje 1    object
Color de pelaje 2    object
Color de pelaje 3    object
Largo de pelaje      object
Color de ojos        object
Largo de hocico      object
Largo de cola        object
Largo de orejas      object
Tipo de orejas       object
dtype: object


0    4642
Name: Mascota, dtype: int64

In [11]:
dataset_test = pandas.read_csv('./data/perrosTestComplete.csv')
dataset_test.head()

Unnamed: 0,Mascota,Edad,Tamaño,Sexo,Patron de pelaje,Color de pelaje 1,Color de pelaje 2,Color de pelaje 3,Largo de pelaje,Color de ojos,Largo de hocico,Largo de cola,Largo de orejas,Tipo de orejas
0,4642,Cachorro,Chico,Hembra,Bicolor,Blanco,Marron,,Mediano,Celeste,Largo,Largo,Largas,Paradas


In [12]:
null_value_stats = dataset_test.isnull().sum(axis=0)
null_value_stats[null_value_stats != 0]

Color de pelaje 3    1
dtype: int64

In [13]:
dataset_test.fillna("NaN", inplace=True)
prepared_dataset_test = dataset_test.drop('Mascota', axis=1)
prepared_dataset_test.head()

Unnamed: 0,Edad,Tamaño,Sexo,Patron de pelaje,Color de pelaje 1,Color de pelaje 2,Color de pelaje 3,Largo de pelaje,Color de ojos,Largo de hocico,Largo de cola,Largo de orejas,Tipo de orejas
0,Cachorro,Chico,Hembra,Bicolor,Blanco,Marron,,Mediano,Celeste,Largo,Largo,Largas,Paradas


In [14]:
print(prepared_dataset_test.dtypes)

Edad                 object
Tamaño               object
Sexo                 object
Patron de pelaje     object
Color de pelaje 1    object
Color de pelaje 2    object
Color de pelaje 3    object
Largo de pelaje      object
Color de ojos        object
Largo de hocico      object
Largo de cola        object
Largo de orejas      object
Tipo de orejas       object
dtype: object


In [15]:
mascotas_test_ids = dataset_test.Mascota
mascotas_test_ids.head()

0    4642
Name: Mascota, dtype: int64

# Creando y entrenando el modelo

### Creacion del modelo

In [16]:
train_dataset = Pool(data=prepared_data_train,
                     label=mascotas_train_ids,
                        cat_features=categorical_features_indices)

# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=2000,
                           learning_rate=0.15,
                           depth=3,
                           loss_function='MultiClass',
                           nan_mode='Min')

### Cargando un model

In [17]:
model2 = CatBoostClassifier().load_model("models/model1.cbm", format='cbm')
model3 = CatBoostClassifier().load_model(
    "models/modelDepth3.cbm", format='cbm')
model4 = CatBoostClassifier().load_model(
    "models/modelDepth3Ite100.cbm", format='cbm')
model_100it_10d = CatBoostClassifier().load_model(
    "models/modelDepth10Ite100.cbm", format='cbm')
model_2000it_3d = CatBoostClassifier().load_model(
    "models/modelIt2000Lr015D3.cbm", format='cbm')


# **Entrenamiento**

In [32]:
model.fit(train_dataset)

model.save_model("models/modelIt2000Lr015D3.cbm",
                    format="cbm",
                    export_parameters=None,
                    pool=None)

model.save_model("models/modelIt2000Lr015D3.json",
                    format="json",
                    export_parameters=None,
                    pool=None)


Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 5.7547793	total: 11.5s	remaining: 6h 24m 8s
1:	learn: 5.7253826	total: 25.3s	remaining: 7h 47s
2:	learn: 5.7080736	total: 39s	remaining: 7h 12m 50s
3:	learn: 5.6988660	total: 52.6s	remaining: 7h 17m 37s
4:	learn: 5.6933139	total: 1m 6s	remaining: 7h 22m 52s
5:	learn: 5.6725819	total: 1m 20s	remaining: 7h 23m 39s
6:	learn: 5.6429954	total: 1m 34s	remaining: 7h 27m 7s
7:	learn: 5.6366468	total: 1m 47s	remaining: 7h 27m 30s
8:	learn: 5.6164734	total: 2m 1s	remaining: 7h 27m 49s
9:	learn: 5.5852525	total: 2m 15s	remaining: 7h 28m 56s
10:	learn: 5.5639525	total: 2m 28s	remaining: 7h 28m 49s
11:	learn: 5.5345584	total: 2m 42s	remaining: 7h 28m 32s
12:	learn: 5.5154190	total: 2m 55s	remaining: 7h 28m 7s
13:	learn: 5.4962034	total: 3m 7s	remaining: 7h 23m 13s
14:	learn: 5.4703916	total: 3m 21s	remaining: 7h 23m 58s
15:	learn: 5.4398001	total: 3m 34s	remaining: 7h 22m 53s
16:	learn: 5.4244643	total: 3m 46s	remaining: 7h 20m 24s
17:	learn: 5.3928621	total: 3m 58s	remaining: 7h 18m 8s
1

# Guardando el model

In [183]:
model.save_model("models/modelDepth3Ite100.cbm",
                    format="cbm",
                    export_parameters=None,
                    pool=None)

model.save_model("models/modelDepth3Ite100.json",
                    format="json",
                    export_parameters=None,
                    pool=None)


# Generar predicciones

In [18]:
eval_dataset = Pool(data=prepared_dataset_test,
                    label=mascotas_test_ids,
                    cat_features=categorical_features_indices)


In [19]:
# Get predicted classes
# preds_class = model.predict(eval_dataset)
preds_class2 = model2.predict(eval_dataset)
preds_class3 = model3.predict(eval_dataset)
preds_class4 = model4.predict(eval_dataset)
preds_class_100i_10d = model_100it_10d.predict(eval_dataset)
preds_class_2000i_3d = model_2000it_3d.predict(eval_dataset)

# Get predicted probabilities for each class
# preds_proba = model.predict_proba(eval_dataset)
preds_proba2 = model2.predict_proba(eval_dataset)
preds_proba3 = model3.predict_proba(eval_dataset)
preds_proba4 = model4.predict_proba(eval_dataset)
preds_proba_100i_10d = model_100it_10d.predict_proba(eval_dataset)
preds_proba_2000i_3d = model_2000it_3d.predict_proba(eval_dataset)


CatBoostError: There is no trained model to use predict(). Use fit() to train model. Then use this method.

In [101]:
''' print("1: ",preds_class)
print("2: ",preds_class2)
print("3: ",preds_class3)
print("4: ",preds_class4)
print("5: ",preds_class_100i_10d) '''

' print("1: ",preds_class)\nprint("2: ",preds_class2)\nprint("3: ",preds_class3)\nprint("4: ",preds_class4)\nprint("5: ",preds_class_100i_10d) '

## Probabilidades
Un array por cada mascota de testeo

In [102]:
# print(preds_proba[0])
# print(preds_proba2[0])
# print(preds_proba3[0])
# print(preds_proba4[0])
print(preds_proba_100i_10d[0])

[3.98752534e-03 4.99077565e-04 5.08235877e-04 3.30350429e-04
 6.99152417e-04 3.33559852e-03 4.11147909e-04 6.38998539e-04
 4.70830506e-04 5.95055616e-04 5.72701484e-04 8.68996434e-04
 1.63045783e-03 3.58943921e-04 3.86729540e-04 5.63253054e-04
 2.03818197e-03 9.32459096e-04 8.35162975e-04 6.64554134e-04
 2.78118246e-04 3.15105751e-04 6.33582681e-04 4.80834114e-03
 5.24991961e-04 7.96576959e-01 1.60138856e-03 3.25676177e-04
 1.42041172e-03 1.04554379e-03 4.27338684e-03 4.59550925e-04
 7.14720999e-04 8.31423108e-04 2.09722268e-03 2.03211838e-04
 5.25164967e-04 4.21466223e-04 3.32070532e-04 4.24059549e-03
 9.02419588e-04 2.18505493e-03 5.38003856e-04 4.47730283e-04
 1.38126641e-03 3.42529680e-04 7.46793915e-04 7.87072657e-04
 6.34176625e-04 9.74927123e-04 5.41847117e-04 7.32271319e-04
 6.31440043e-04 4.62195092e-04 3.98125461e-04 1.46305787e-03
 3.15477498e-04 5.48914658e-04 2.72524817e-04 4.12019337e-04
 5.37930194e-04 3.22958046e-04 6.84419844e-04 9.79061698e-04
 3.95745587e-04 4.986931

In [90]:
''' contador = 0
while contador < 320:
    if(preds_proba_100i_10d[0][contador] == 0.031220734475029422):
        print(contador, " --> ", preds_proba_100i_10d[0][contador])
    contador = contador +1

print(preds_proba_100i_10d[0][116]) '''

id_mascota = np.where(data_train.Mascota == 3789)[0]

print(preds_proba_100i_10d[0][id_mascota])


[0.00375544]


### Supuestas predicciones

In [23]:
mascota_numero = 0

for prediccion in preds_class:
    probabilidad = math.trunc((preds_proba[mascota_numero][prediccion.index()])*100)
    print("-- Mascota "+str(mascotas_test_ids[mascota_numero])+" es similar a la mascota "+str(prediccion.item())
        +" => "+str(probabilidad)+"% de probabilidad")
    # print(str(preds_proba[mascota_numero]))
    print(dataset_test.loc[mascota_numero:mascota_numero, "Edad":"Tipo de orejas"].values)
    print(prepared_data_train.loc[prediccion.item():prediccion.item(), "Edad":"Tipo de orejas"].values)
    mascota_numero+=1


AttributeError: 'numpy.ndarray' object has no attribute 'index'

In [117]:
class Mascota:
    def __init__(self,index, id, prob):
        self.id = id
        self.index = index
        self.prob = prob

    def to_string(self):
        return "{}\t|\t{}\t=>\t{}%".format(self.index,self.id,self.prob)

    def get_id(self):
        return self.id
    
    def get_prob(self):
        return self.prob

    def get_index(self):
        return self.index

In [151]:
def keyValue(list_proba):
    mascota_test_index = 0
    # mascota_probs = list_proba[0]

    mascota_test_id = str(mascotas_test_ids[mascota_test_index].item())
    print(mascota_test_id)
    
    mascotas_probs = []

    for prob_index in range(0, len(list_proba[mascota_test_index])):
        probabilidad = list_proba[mascota_test_index][prob_index]*100
        # probabilidad = list_proba[mascota_test_index][prob_index]

        # print(probabilidad)
        # print(mascotas_train_ids[prob_index], " --> ",
            #   list_proba[0][prob_index], " >> ", probabilidad)
        # print(list_proba[mascota_test_index][41])
        # print(mascotas_train_ids[prob_index],"=>","{0:.2f}".format(probabilidad),"%")

        mascotas_probs.append(
            Mascota(prob_index,mascotas_train_ids[prob_index], probabilidad))
        
    # print(len(mascotas_probs))

    def orderProb(n):
        return n.get_prob()

    def orderId(n):
        return n.get_id()

    def orderIndex(n):
        return n.get_index()

    mascotas_probs.sort(key=orderProb, reverse=True)

    for mascota in mascotas_probs:
        print(mascota.to_string())

    # print(mascotas_probs[320].to_string())

# keyValue(preds_proba4)
keyValue(preds_proba_100i_10d)


4642
215	|	1498	=>	3.9255787608286887%
24	|	25	=>	2.51968347060337%
201	|	4352	=>	1.729381738494203%
131	|	3917	=>	1.6341333621825709%
218	|	4397	=>	1.6214734722144606%
48	|	49	=>	1.6015437879610932%
208	|	1486	=>	1.3260435411644071%
284	|	6079	=>	1.193563561596375%
53	|	1002	=>	1.1069461086917456%
274	|	4535	=>	0.9576348706420962%
220	|	1508	=>	0.8911243203584456%
1	|	2	=>	0.8721667713418652%
46	|	47	=>	0.8576233776809529%
233	|	4413	=>	0.8523271820279781%
7	|	8	=>	0.7380656675288473%
52	|	61	=>	0.7347977299819412%
192	|	4340	=>	0.7159522367344856%
204	|	4353	=>	0.6929885916700016%
302	|	1821	=>	0.6828430697360152%
10	|	11	=>	0.6443488423634459%
32	|	33	=>	0.6364626349349606%
239	|	1531	=>	0.5901023933655573%
62	|	66	=>	0.5884668378131063%
202	|	1476	=>	0.5695533291071161%
60	|	65	=>	0.5648094450830224%
296	|	1776	=>	0.5605334662085063%
205	|	1477	=>	0.5337416267633425%
232	|	1526	=>	0.52039358051713%
18	|	19	=>	0.5138864784781985%
197	|	4642	=>	0.5030344757041586%
314	|	1908	=>	0.489

In [None]:
# 315 --> 0.0027274134200535954
