# Predicción de diabetes 

### 1. Exploración de datos

In [1]:
# Importar librerías

import pickle

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [2]:
# Load the training and test splits from CSV files (paths are relative to the working directory)

train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train_dataset.csv', 'test_dataset.csv')
)

In [3]:
# Explore the training data: preview the first rows

train_df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.019913,0.05068,0.104809,0.070072,-0.035968,-0.026679,-0.024993,-0.002592,0.003709,0.040343,321.0
1,-0.01278,-0.044642,0.060618,0.052858,0.047965,0.029375,-0.017629,0.034309,0.070207,0.007207,215.0
2,0.038076,0.05068,0.008883,0.042529,-0.042848,-0.021042,-0.039719,-0.002592,-0.018114,0.007207,127.0
3,-0.01278,-0.044642,-0.023451,-0.040099,-0.016704,0.004636,-0.017629,-0.002592,-0.03846,-0.038357,64.0
4,-0.023677,-0.044642,0.045529,0.090729,-0.01808,-0.035447,0.07073,-0.039493,-0.034522,-0.009362,175.0


In [4]:
# Show the dtype of every column in the training data
train_df.dtypes

age       float64
sex       float64
bmi       float64
bp        float64
s1        float64
s2        float64
s3        float64
s4        float64
s5        float64
s6        float64
target    float64
dtype: object

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) per column of the training data
train_df.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
count,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0
mean,-0.00033,-0.003922,0.002957,-0.000764,0.000377,0.001124,0.000368,-0.001172,-0.000469,0.002247,153.262136
std,0.047257,0.047229,0.04752,0.047536,0.047779,0.047496,0.046248,0.044466,0.044972,0.045749,75.124696
min,-0.107226,-0.044642,-0.090275,-0.112399,-0.126781,-0.106845,-0.098625,-0.076395,-0.126097,-0.129483,25.0
25%,-0.034575,-0.044642,-0.032073,-0.036656,-0.034592,-0.030124,-0.032356,-0.039493,-0.031988,-0.02593,89.0
50%,0.005383,-0.044642,-0.002973,-0.00567,-0.004321,-0.003193,-0.006584,-0.002592,-0.002398,0.003064,142.0
75%,0.038076,0.05068,0.036907,0.035644,0.030078,0.03188,0.02655,0.034309,0.032432,0.027917,210.0
max,0.110727,0.05068,0.170555,0.132044,0.153914,0.155887,0.177497,0.155345,0.133397,0.135612,336.0


In [6]:
# Count missing values per column in the training data
train_df.isna().sum()

age       0
sex       0
bmi       0
bp        0
s1        0
s2        0
s3        0
s4        0
s5        0
s6        0
target    0
dtype: int64

In [7]:
# Explore the test data: preview the first rows

test_df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.0709,-0.044642,0.039062,-0.033213,-0.012577,-0.034508,-0.024993,-0.002592,0.067737,-0.013504,310.0
2,0.027178,0.05068,0.017506,-0.033213,-0.007073,0.045972,-0.065491,0.07121,-0.096435,-0.059067,69.0
3,0.045341,0.05068,0.060618,0.031065,0.028702,-0.047347,-0.054446,0.07121,0.133597,0.135612,245.0
4,-0.067268,0.05068,-0.012673,-0.040099,-0.015328,0.004636,-0.058127,0.034309,0.019196,-0.034215,202.0


In [8]:
# Show the dtype of every column in the test data
test_df.dtypes

age       float64
sex       float64
bmi       float64
bp        float64
s1        float64
s2        float64
s3        float64
s4        float64
s5        float64
s6        float64
target    float64
dtype: object

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) per column of the test data
test_df.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
count,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0
mean,0.000767,0.009111,-0.00687,0.001776,-0.000876,-0.002611,-0.000854,0.002724,0.001091,-0.00522,149.511278
std,0.048621,0.047449,0.047315,0.047944,0.047414,0.047981,0.050834,0.054329,0.053431,0.051506,81.716214
min,-0.103593,-0.044642,-0.084886,-0.108956,-0.108893,-0.115613,-0.102307,-0.076395,-0.096435,-0.137767,37.0
25%,-0.038207,-0.044642,-0.039618,-0.032077,-0.033216,-0.034194,-0.039719,-0.039493,-0.041176,-0.042499,81.0
50%,0.005383,0.05068,-0.016984,-0.002228,-0.000193,-0.004759,-0.006584,-0.002592,-0.000612,-0.00522,129.0
75%,0.038076,0.05068,0.017506,0.035644,0.027326,0.021546,0.033914,0.034309,0.030564,0.027917,214.0
max,0.110727,0.05068,0.160855,0.107944,0.152538,0.198788,0.181179,0.185234,0.133597,0.135612,346.0


In [10]:
# Count missing values per column in the test data
test_df.isna().sum()

age       0
sex       0
bmi       0
bp        0
s1        0
s2        0
s3        0
s4        0
s5        0
s6        0
target    0
dtype: int64

In [11]:
# Split the training data into the target vector and the feature matrix
y_train = train_df['target']
X_train = train_df.drop(columns='target')

In [12]:
# Split the test data into the target vector and the feature matrix
y_test = test_df['target']
X_test = test_df.drop(columns='target')

### 2. Entrenamiento del modelo

In [13]:
# Fit an ordinary linear regression model to the training data

model = LinearRegression()
# `fit` is left as the cell's last expression, so its return value is what the cell displays
model.fit(X_train, y_train)

### 3. Evaluación del modelo para entrenamiento y prueba

In [14]:
# Predict on the training data (used below to measure the in-sample fit)

y_train_pred = model.predict(X_train)

In [15]:
# Score the model on the training data: MSE, MAE and R², computed in that order

mse, mae, r2 = (
    metric(y_train, y_train_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

In [16]:
# Display the metrics computed in the previous cell as a tuple: (MSE, MAE, R²).
# NOTE: `mse`, `mae` and `r2` are reassigned to the test-set metrics in a later cell,
# so re-running this cell after that point shows the test values instead.
mse, mae, r2

(np.float64(2902.0483821109683),
 np.float64(43.72789618474123),
 0.484122067464441)

In [17]:
# Predict on the test data

y_pred = model.predict(X_test)

In [18]:
# Score the model on the test data: MSE, MAE and R², computed in that order
# (this reassigns the `mse`, `mae` and `r2` names used earlier for the training metrics)

mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

In [19]:
# Display the test-set metrics as a tuple: (MSE, MAE, R²)
mse, mae, r2

(np.float64(2855.8184535056253),
 np.float64(43.143476218056364),
 0.5690847834753974)

In [20]:
# Persist the fitted model to 'model.pkl' (path is relative to the working directory).
# NOTE: unpickling can execute arbitrary code, so only load this file from a trusted source.
# NOTE(review): scikit-learn pickles are presumably only safe to load with the same
# library version used for training — confirm and record the version alongside the file.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)