In [1]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
## cargando el dataset
df = pd.read_csv('dataset_fish.csv')

In [3]:
## vamos el tipo de datos del dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Species  159 non-null    object 
 1   Weight   159 non-null    float64
 2   Length1  159 non-null    float64
 3   Length2  159 non-null    float64
 4   Length3  159 non-null    float64
 5   Height   159 non-null    float64
 6   Width    159 non-null    float64
dtypes: float64(6), object(1)
memory usage: 8.8+ KB


In [4]:
df.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134


In [5]:
## distribucion de las especies
df.Species.unique()

array(['Bream', 'Roach', 'Whitefish', 'Parkki', 'Perch', 'Pike', 'Smelt'],
      dtype=object)

In [6]:
## convertimos elos nombres de las especies a numeros
df.Species = df.Species.map({'Bream':1, 'Roach':2, 'Whitefish':3, 'Parkki':4, 'Perch':5, 'Pike':6, 'Smelt':7})

In [7]:
df.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,1,242.0,23.2,25.4,30.0,11.52,4.02
1,1,290.0,24.0,26.3,31.2,12.48,4.3056
2,1,340.0,23.9,26.5,31.1,12.3778,4.6961
3,1,363.0,26.3,29.0,33.5,12.73,4.4555
4,1,430.0,26.5,29.0,34.0,12.444,5.134


In [8]:
df.Species.unique()

array([1, 2, 3, 4, 5, 6, 7], dtype=int64)

In [9]:
df.describe()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
count,159.0,159.0,159.0,159.0,159.0,159.0,159.0
mean,3.880503,398.326415,26.24717,28.415723,31.227044,8.970994,4.417486
std,2.026298,357.978317,9.996441,10.716328,11.610246,4.286208,1.685804
min,1.0,0.0,7.5,8.4,8.8,1.7284,1.0476
25%,2.0,120.0,19.05,21.0,23.15,5.9448,3.38565
50%,5.0,273.0,25.2,27.3,29.4,7.786,4.2485
75%,5.0,650.0,32.7,35.5,39.65,12.3659,5.5845
max,7.0,1650.0,59.0,63.4,68.0,18.957,8.142


In [10]:
## rredondemos los valores
corr = round(df.corr(), 4)
corr.style.background_gradient()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
Species,1.0,-0.1768,-0.1012,-0.1184,-0.2095,-0.697,-0.3152
Weight,-0.1768,1.0,0.9157,0.9186,0.923,0.7243,0.8865
Length1,-0.1012,0.9157,1.0,0.9995,0.992,0.6254,0.867
Length2,-0.1184,0.9186,0.9995,1.0,0.9941,0.6404,0.8735
Length3,-0.2095,0.923,0.992,0.9941,1.0,0.7034,0.8785
Height,-0.697,0.7243,0.6254,0.6404,0.7034,1.0,0.7929
Width,-0.3152,0.8865,0.867,0.8735,0.8785,0.7929,1.0


In [12]:
data_x = df.drop(['Weight'], axis=1)
data_y = df.Weight

In [13]:
data_x

Unnamed: 0,Species,Length1,Length2,Length3,Height,Width
0,1,23.2,25.4,30.0,11.5200,4.0200
1,1,24.0,26.3,31.2,12.4800,4.3056
2,1,23.9,26.5,31.1,12.3778,4.6961
3,1,26.3,29.0,33.5,12.7300,4.4555
4,1,26.5,29.0,34.0,12.4440,5.1340
...,...,...,...,...,...,...
154,7,11.5,12.2,13.4,2.0904,1.3936
155,7,11.7,12.4,13.5,2.4300,1.2690
156,7,12.1,13.0,13.8,2.2770,1.2558
157,7,13.2,14.3,15.2,2.8728,2.0672


In [14]:
data_y

0      242.0
1      290.0
2      340.0
3      363.0
4      430.0
       ...  
154     12.2
155     13.4
156     12.2
157     19.7
158     19.9
Name: Weight, Length: 159, dtype: float64

In [15]:
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size=0.2)

In [16]:
## creamos el modelo
model = Ridge()

In [17]:
## entrenamos el modelo
model.fit(train_x, train_y)

In [18]:
## evaluamos el modelo
y_predicted = model.predict(test_x)

In [19]:
train_x

Unnamed: 0,Species,Length1,Length2,Length3,Height,Width
49,2,22.0,23.4,26.7,6.9153,3.6312
58,3,28.5,31.0,34.0,10.7440,6.5620
6,1,26.8,29.7,34.5,14.1795,5.2785
26,1,32.0,35.0,40.6,16.3618,6.0900
150,7,10.8,11.3,12.6,1.9782,1.2852
...,...,...,...,...,...,...
47,2,21.1,22.5,25.0,6.4000,3.8000
1,1,24.0,26.3,31.2,12.4800,4.3056
30,1,35.0,38.5,44.1,18.0369,6.3063
121,5,37.0,40.0,42.4,12.3808,7.4624


In [20]:
print(f'Calificacion de prediccion:{round(model.score(test_x, test_y),4) * 100 }%')

Calificacion de prediccion:89.49000000000001%
