In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # for heatmap
import matplotlib.pyplot as plt # for common plotting
import graphviz as gv # for decision tree plotting
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# GaussianNB, BernoulliNB, MultinomialNB
from sklearn.naive_bayes import GaussianNB as bayes_model
from sklearn.metrics import r2_score as model_score, classification_report, confusion_matrix
from sklearn import tree

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# List the contents of the input directory so the available files are visible in the output.
print(os.listdir("../input"))
# Read the train and test CSV files into DataFrames; both are reused by later cells.
animals = pd.read_csv("../input/train/train.csv")
test = pd.read_csv("../input/test/test.csv")
# Any results you write to the current directory are saved as output.
# Preview the first three rows of the training data.
animals.head(3)

* Type - Type of animal (1 = Dog, 2 = Cat)
* Name - Name of pet (Empty if not named)
* Age - Age of pet when listed, in months
* Breed1 - Primary breed of pet (Refer to BreedLabels dictionary)
* Breed2 - Secondary breed of pet, if pet is of mixed breed (Refer to BreedLabels dictionary)
* Gender - Gender of pet (1 = Male, 2 = Female, 3 = Mixed, if profile represents group of pets)
* Color1 - Color 1 of pet (Refer to ColorLabels dictionary)
* Color2 - Color 2 of pet (Refer to ColorLabels dictionary)
* Color3 - Color 3 of pet (Refer to ColorLabels dictionary)
* MaturitySize - Size at maturity (1 = Small, 2 = Medium, 3 = Large, 4 = Extra Large, 0 = Not Specified)
* FurLength - Fur length (1 = Short, 2 = Medium, 3 = Long, 0 = Not Specified)
* Vaccinated - Pet has been vaccinated (1 = Yes, 2 = No, 3 = Not Sure)
* Dewormed - Pet has been dewormed (1 = Yes, 2 = No, 3 = Not Sure)
* Sterilized - Pet has been spayed / neutered (1 = Yes, 2 = No, 3 = Not Sure)
* Health - Health Condition (1 = Healthy, 2 = Minor Injury, 3 = Serious Injury, 0 = Not Specified)
* Quantity - Number of pets represented in profile
* Fee - Adoption fee (0 = Free)
* State - State location in Malaysia (Refer to StateLabels dictionary)
* RescuerID - Unique hash ID of rescuer
* VideoAmt - Total uploaded videos for this pet
* Description - Profile write-up for this pet. The primary language used is English, with some in Malay or Chinese.
* PetID - Unique hash ID of pet profile
* PhotoAmt - Total uploaded photos for this pet
* AdoptionSpeed - Categorical speed of adoption. Lower is faster. This is the value to predict. See below section for more info.

In [None]:
# Summary statistics of the training data (pandas' default describe() output).
animals.describe()

Tipos de Variables
Cualitativas:
* Type
* Name
* Breed1
* Breed2
* Gender
* Color1
* Color2
* Color3
* Vaccinated
* Dewormed
* Sterilized
* State
* RescuerID
* Description
* PetID

Cuantitativas:
* Age
* MaturitySize
* FurLength
* Health
* Quantity
* Fee
* VideoAmt
* PhotoAmt
* AdoptionSpeed

In [None]:
# Display labels for the adoption-speed groups, applied in the order the
# groups appear in animalsBySpeed.
speed_labels = ['Same Day', '1st week', '1st Month', '2nd & 3rd Month', 'Other']

# Number of listings in each AdoptionSpeed group.
animalsBySpeed = (
    animals
    .groupby(['AdoptionSpeed'])
    .size()
    .reset_index(name='Count')
)

# Share of all listings that falls in each group (a fraction between 0 and 1,
# stored under the column name 'Percentage').
speed_share = animalsBySpeed['Count'] / len(animals)
dataToPlot = pd.DataFrame({'Percentage': speed_share.to_numpy()}, index=speed_labels)
dataToPlot

In [None]:
# Pie chart of the share of listings per adoption-speed group.
dataToPlot.plot(kind='pie', y='Percentage', figsize=(10, 10))

Se consideró si difería la cantidad de adopciones según el tiempo que la mascota tiene sin ser adoptada. Con los datos anteriores solo se puede afirmar que las mascotas con más de 3 meses poseen menos probabilidad de ser adoptadas.

In [None]:
# Number of listings per animal type (data dictionary: 1 = Dog, 2 = Cat).
animalsByType = (
    animals
    .groupby('Type')
    .size()
    .reset_index(name='Count')
)
# Horizontal bar chart: one bar per type, bar length = number of listings.
animalsByType.plot(kind='barh', x='Type', y='Count')

En la base de datos que posee PetFinder hay más caninos que felinos.

In [None]:
# Correlation coefficient between the pet's age and its adoption speed
# (Series.corr uses the Pearson method by default).
animals['Age'].corr(animals['AdoptionSpeed'])

Suponiendo que las personas prefieren animales más jóvenes para adoptar, esto parece que no sucede en PetFinder, donde existen mascotas con menos de un mes hasta mascotas con más de 20 años.

In [None]:
# Pairwise correlation matrix of the numeric columns only. The frame also holds
# text columns (Name, Description, ...); recent pandas versions raise when
# corr() meets them instead of silently dropping them, so select the numeric
# columns explicitly.
corr = animals.select_dtypes(include='number').corr()

# Plot the heatmap on the axes created here rather than on the implicit
# "current" axes.
f, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    linewidths=.5,
    square=True,
    ax=ax,
)

Luego de obtener el resultado se buscó correlación significativa para las demás variables, pero únicamente existe esta para las variables Vaccinated, Dewormed y Sterilized.

**Gráfica de Codo:**

In [None]:
# Elbow method: fit K-Means for k = 1..9 and record, for each k, the mean
# Euclidean distance from every row to its nearest cluster centre.
# NOTE(review): the columns are used unscaled, so columns with large ranges
# weigh more in the distances - confirm this is intended.
animalsBow = animals.select_dtypes(include='number')  # public replacement for _get_numeric_data()
distortions = []
K = range(1, 10)
for k in K:
    # Fit once per k (a second fit on the same data only repeats the work) and
    # seed the initialisation so the curve is reproducible between runs.
    kmeanModel = KMeans(n_clusters=k, random_state=42).fit(animalsBow)
    nearest_centre_dist = cdist(animalsBow, kmeanModel.cluster_centers_, 'euclidean').min(axis=1)
    distortions.append(nearest_centre_dist.mean())

# Plot the elbow
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

Al realizar la gráfica de codo se considera utilizar 6 como cantidad de grupos (clusters).

**Agrupación con K-Medias:**

In [None]:
# Cluster the listings into 6 groups and draw every pair of columns against
# each other, coloured by cluster.
features = animals[['Age', 'MaturitySize', 'FurLength', 'Quantity', 'Fee', 'VideoAmt', 'PhotoAmt', 'AdoptionSpeed']]

# fit_predict returns one cluster label per row, so `kmeans` holds the labels
# (not the fitted model). The seed keeps the grouping, and therefore the
# colours, stable between runs.
kmeans = KMeans(n_clusters=6, random_state=42).fit_predict(features)

rows, cols = features.shape
fig, axs = plt.subplots(cols, cols, figsize=(50, 50))
for i in range(cols):
    for j in range(cols):
        if i == j:
            # A column against itself carries no information; hide the
            # otherwise empty panel.
            axs[i, j].set_visible(False)
            continue
        axs[i, j].scatter(features.iloc[:, i], features.iloc[:, j], c=kmeans)
        axs[i, j].set(xlabel=features.columns[i], ylabel=features.columns[j])
plt.show()

 **Predicción usando un árbol de decisiones:**

In [None]:
# Columns used as predictors. Defined once so the training frame and the test
# frame are always built from exactly the same columns, in the same order.
tree_columns = ['Age', 'MaturitySize', 'FurLength', 'Quantity', 'Fee', 'VideoAmt', 'PhotoAmt']

features = animals[tree_columns]
speed = animals[['AdoptionSpeed']]

# Seed the tree so the fitted model and the resulting predictions are
# reproducible between runs.
classifier = tree.DecisionTreeClassifier(random_state=42).fit(features, speed)

# Predict the adoption speed for every row of the test set.
cleaned_test = test[tree_columns]
prediction = classifier.predict(cleaned_test)
prediction

In [None]:
# Render the top levels of the fitted tree with graphviz.
# Use the column names recorded by the classifier at fit time (available in
# scikit-learn versions that expose `feature_names_in_`) so nodes read e.g.
# "Age <= ..." instead of "X[0] <= ...". When the attribute is missing the
# default labels are kept.
fitted_names = getattr(classifier, 'feature_names_in_', None)
node_feature_names = list(fitted_names) if fitted_names is not None else None

# max_depth=5 limits the drawing to the first levels to keep it readable.
graph_data = tree.export_graphviz(
    classifier,
    out_file=None,
    max_depth=5,
    feature_names=node_feature_names,
)
graph = gv.Source(graph_data)
graph

In [None]:
# Build the submission: one row per test PetID with its predicted AdoptionSpeed.
# The prediction array is assigned directly, i.e. by position. Wrapping it in a
# pd.Series would align on index labels instead and silently produce NaN for
# any row of `test` whose index label is not in 0..n-1.
result = test[['PetID']].assign(AdoptionSpeed=prediction)
result.to_csv('submission.csv', index=False)

La precisión del modelo fue de 0.169, por lo que se concluye que el modelo basado en un árbol de decisiones con las variables utilizadas no es nada eficaz.

**Predicción de la variable *Fee* utilizando un modelo de Regresión Lineal:**

In [None]:
# Build the working frame for the Fee regression.
care_columns = ['Dewormed', 'Vaccinated', 'Sterilized']
myvar = animals[care_columns + ['Fee']].copy()

# Total is the sum of the three care columns only. Summing every column of
# `myvar` would also add Fee itself, so the predictor would contain the value
# it is later used to predict (target leakage).
myvar['Total'] = myvar[care_columns].sum(axis=1)

corr = myvar.corr()
# plot the heatmap
f, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    linewidths=.5,
    square=True,
    ax=ax,
)

In [None]:
# Correlation coefficient between the Total column and Fee
# (Series.corr uses the Pearson method by default).
myvar['Total'].corr(myvar['Fee'])

In [None]:
# Predict Fee from a single feature, Total = Dewormed + Vaccinated + Sterilized,
# and score the model twice: on a hold-out split of "train" and on "test".
care_columns = ['Dewormed', 'Vaccinated', 'Sterilized']

fig, axs = plt.subplots(2, 1)

# --- Evaluation on a hold-out split of the train dataset ---
# Total is computed here from the three care columns only, exactly as it is
# for the test dataset below. The feature must not include Fee: otherwise the
# model is handed the target and the hold-out score is meaningless.
data_x = myvar[care_columns].sum(axis=1).to_frame('Total')
data_y = myvar[['Fee']].copy()
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size=0.3, random_state=42)
lm = LinearRegression()
lm.fit(train_x, train_y)
pred = lm.predict(test_x)
axs[0].scatter(test_y, pred)
axs[0].set(title='Hold-out split of train', xlabel='Actual Fee', ylabel='Predicted Fee')
print('Puntuación usando train:', model_score(test_y, pred))

# --- Evaluation on the test dataset, training on all of train ---
lm2 = LinearRegression()
lm2.fit(data_x, data_y)
real_test = test[care_columns].copy()
real_test['Total'] = real_test.sum(axis=1)
pred2 = lm2.predict(real_test[['Total']])
axs[1].scatter(test[['Fee']], pred2, color='red')
axs[1].set(title='Test dataset', xlabel='Actual Fee', ylabel='Predicted Fee')
print('Puntuación usando test:', model_score(test[['Fee']], pred2))

# Keep the titles and axis labels of the two stacked panels from overlapping.
fig.tight_layout()


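Utilizando la suma de las variables **Dewormed**, **Vaccinated** y **Sterilized** para predecir los valores de la variable **Fee** en un modelo de regresión lineal, se observó un caso de *Overfitting*. Como se puede observar en la sección anterior, entrenando el modelo con una porción del dataset "train" y utilizando el resto para realizar la predicción se obtuvo un puntaje de 0.99 (valor obtenido al utilizar la función R^2 sobre la predicción y los valores reales). En contraste, al usar todo el dataset "train" para entrenar el modelo y el dataset "test" para realizar las predicciones, se obtuvo un puntaje de -0.103, lo cual muestra que el modelo no funciona al utilizar datos totalmente ajenos a los datos de entrenamiento.

**Nota:** esta conclusión debe tomarse con cautela. Si la columna *Total* del dataset "train" se calcula sumando todas las columnas de `myvar` (incluida **Fee**), la variable predictora contiene el propio valor a predecir; en ese caso un puntaje cercano a 0.99 se explica por fuga de información del objetivo (*data leakage*) y no por *Overfitting*. En el dataset "test" la suma solo incluye **Dewormed**, **Vaccinated** y **Sterilized**, lo que explicaría la caída del puntaje.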

**Naive Bayes:**

In [None]:
# Gaussian Naive Bayes on the numeric columns, predicting AdoptionSpeed.
animals2 = animals.select_dtypes(include='number')  # public replacement for _get_numeric_data()
x_train, x_test = train_test_split(animals2, test_size=0.3, random_state=1)
gnb = bayes_model()

# Every numeric column except the target is a feature. The target is excluded
# by name: slicing a fixed number of columns off the end depends on column
# order and also discards whichever feature happens to sit next to the target.
used_features = [column for column in animals2.columns if column != 'AdoptionSpeed']
print(used_features)

# Train classifier. Fit and predict both receive DataFrames so the model sees
# the same kind of input (with column names) in both calls.
gnb.fit(x_train[used_features], x_train["AdoptionSpeed"])
pred = gnb.predict(x_test[used_features])

# NOTE(review): model_score is r2_score, a regression metric; for this
# classification task the report and confusion matrix printed below are the
# more meaningful numbers.
print('Puntuación usando test:', model_score(x_test['AdoptionSpeed'], pred))
print(classification_report(x_test['AdoptionSpeed'], pred))
print(confusion_matrix(x_test['AdoptionSpeed'], pred))

Como se puede ver en la puntuación del modelo usando el test de R^2 y en el reporte de clasificación, el modelo de bayes tampoco fue eficaz para la predicción de la velocidad de adopción.