In [1]:
# Importamos las librerias necesarias
from sklearn.model_selection import train_test_split
from sklearn import linear_model

import pandas as pd

In [2]:
# Importamos el dataset para manejarlo como un Dataframe
df = pd.read_csv('sloth_data.csv')

In [3]:
df.sample(frac=0.65)

Unnamed: 0,id,claw_length_cm,endangered,size_cm,specie,sub_specie,tail_length_cm,weight_kg
19,19,8.271,critically_endangered,51.060,three_toed,Pygmy three-toed sloth,6.174,2.575
559,559,9.428,vulnerable,61.877,three_toed,Maned three-toed sloth,5.315,7.341
2739,2739,9.575,least_concern,66.399,two_toed,Linnaeus’s two-toed sloth,-0.564,5.485
2638,2638,7.578,least_concern,59.674,three_toed,Brown-throated sloth,5.742,4.251
1126,1126,6.975,least_concern,49.589,three_toed,Pale-throated sloth,5.650,5.019
...,...,...,...,...,...,...,...,...
1411,1411,9.131,least_concern,48.962,three_toed,Pale-throated sloth,3.964,3.315
4272,4272,5.562,least_concern,63.963,two_toed,Hoffman’s two-toed sloth,3.072,3.758
567,567,7.826,vulnerable,63.109,three_toed,Maned three-toed sloth,4.463,4.660
1191,1191,7.056,least_concern,48.578,three_toed,Pale-throated sloth,4.815,4.441


In [4]:
# Famaliarizarse con el dataframe luego hacer la limpieza de datos
df.specie.unique()
df.specie = df.specie.map({'three_toed': 0, 'two_toed': 1})

In [5]:
df.sub_specie.unique()
df.sub_specie = df.sub_specie.map({'Pygmy three-toed sloth': 0, 'Maned three-toed sloth': 1,
       'Pale-throated sloth': 2, 'Brown-throated sloth': 3,
       'Linnaeus’s two-toed sloth': 4, 'Hoffman’s two-toed sloth': 5})

In [6]:
df.endangered.unique()
df.endangered = df.endangered.map({'critically_endangered': 0, 'vulnerable': 1, 'least_concern': 2})

In [7]:
df.describe()

Unnamed: 0,id,claw_length_cm,endangered,size_cm,specie,sub_specie,tail_length_cm,weight_kg
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,2499.5,7.423503,1.8634,60.399852,0.4676,3.2386,3.410544,5.253253
std,1443.520003,1.520533,0.383627,5.929968,0.498999,1.378713,2.333288,1.268203
min,0.0,1.748,0.0,46.928,0.0,0.0,-2.942,0.946
25%,1249.75,6.38375,2.0,59.90475,0.0,2.0,1.44025,4.3825
50%,2499.5,7.445,2.0,62.4785,0.0,3.0,3.812,5.274
75%,3749.25,8.4915,2.0,64.39825,1.0,4.0,5.35125,6.12525
max,4999.0,12.171,2.0,68.76,1.0,5.0,8.538,9.997


In [8]:
# Analisis de correlacion
corr = df.corr()
corr.style.background_gradient()

Unnamed: 0,id,claw_length_cm,endangered,size_cm,specie,sub_specie,tail_length_cm,weight_kg
id,1.0,-0.259023,0.548632,0.551364,0.864205,0.972987,-0.613823,0.290017
claw_length_cm,-0.259023,1.0,-0.019271,0.079532,-0.060126,-0.236798,-0.153307,0.129315
endangered,0.548632,-0.019271,1.0,-0.013777,0.333736,0.633489,-0.242893,0.04621
size_cm,0.551364,0.079532,-0.013777,1.0,0.638633,0.54958,-0.551956,0.440112
specie,0.864205,-0.060126,0.333736,0.638633,1.0,0.865945,-0.848246,0.491309
sub_specie,0.972987,-0.236798,0.633489,0.54958,0.865945,1.0,-0.621538,0.299078
tail_length_cm,-0.613823,-0.153307,-0.242893,-0.551956,-0.848246,-0.621538,1.0,-0.505609
weight_kg,0.290017,0.129315,0.04621,0.440112,0.491309,0.299078,-0.505609,1.0


In [18]:
huber = linear_model.HuberRegressor()

In [19]:
# Escogemos sub_specie como variable dependiente y el resto como independientes porque son las que tienen mayor correlacion
data_x = df.drop(['claw_length_cm', 'tail_length_cm', 'sub_specie', 'weight_kg', 'size_cm'], axis=1)
data_y = df.sub_specie

In [20]:
data_x

Unnamed: 0,id,endangered,specie
0,0,0,0
1,1,0,0
2,2,0,0
3,3,0,0
4,4,0,0
...,...,...,...
4995,4995,2,1
4996,4996,2,1
4997,4997,2,1
4998,4998,2,1


In [21]:
data_y

0       0
1       0
2       0
3       0
4       0
       ..
4995    5
4996    5
4997    5
4998    5
4999    5
Name: sub_specie, Length: 5000, dtype: int64

In [22]:
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y)

In [23]:
# Entrenamos el modelo
huber.fit(train_x, train_y)

In [24]:
# Ecuacion de la recta
print(f'y = {round( huber.coef_[0], 2)}x + {round(huber.intercept_, 2)}')

y = 0.0x + 0.2


In [25]:
# Veamos la calificacion
print(f'Score: {round(huber.score(test_x, test_y), 4) * 100}%')

Score: 97.00999999999999%


In [44]:
# Vamos a predecir y dar formato al resultado
id = 2739
specie = 1
endangered = 2
labels1 = {0: 'Pygmy three-toed sloth', 1: 'Maned three-toed sloth',
       2: 'Pale-throated sloth', 3: 'Brown-throated sloth',
       4: 'Linnaeus’s two-toed sloth', 5: 'Hoffman’s two-toed sloth'}
labels2 = {0: 'critically_endangered', 1: 'vulnerable', 2: 'least_concern'}
predict = huber.predict([[id, specie, endangered]])
print(f'La sub especie con id: {id}, es: {labels1[int(round(predict[0], 0))]} y su estado de conservacion es: {labels2[endangered]}')



La sub especie con id: 2739, es: Linnaeus’s two-toed sloth y su estado de conservacion es: least_concern
4.0


