In [41]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
import random

In [42]:
# Creación de objeto pandas dataframe
patients_df=pd.read_csv('https://github.com/stedy/Machine-Learning-with-R-datasets/blob/master/insurance.csv?raw=true')

In [43]:
# Creación variables binarias
patients_df.replace({'sex':{'male':0,'female':1}}, inplace=True)
patients_df.replace({'smoker':{'yes':1,'no':0}}, inplace=True)

In [44]:
# La función get dummies convierte un DataFrame de columnas categoricas a uno con variables dummy variables
region_dummies_df=pd.get_dummies(patients_df[['region']])
region_dummies_df

Unnamed: 0,region_northeast,region_northwest,region_southeast,region_southwest
0,False,False,False,True
1,False,False,True,False
2,False,False,True,False
3,False,True,False,False
4,False,True,False,False
...,...,...,...,...
1333,False,True,False,False
1334,True,False,False,False
1335,False,False,True,False
1336,False,False,False,True


In [45]:
# Hacemos join entre los 2 dataframes para reconstruir el dataset
patients_df = patients_df.join(region_dummies_df)

In [46]:
patients_df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,1,27.900,0,1,southwest,16884.92400,False,False,False,True
1,18,0,33.770,1,0,southeast,1725.55230,False,False,True,False
2,28,0,33.000,3,0,southeast,4449.46200,False,False,True,False
3,33,0,22.705,0,0,northwest,21984.47061,False,True,False,False
4,32,0,28.880,0,0,northwest,3866.85520,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...
1333,50,0,30.970,3,0,northwest,10600.54830,False,True,False,False
1334,18,1,31.920,0,0,northeast,2205.98080,True,False,False,False
1335,18,1,36.850,0,0,southeast,1629.83350,False,False,True,False
1336,21,1,25.800,0,0,southwest,2007.94500,False,False,False,True


In [47]:
#$patients_df = patients_df.drop(['region'],axis = 1)

In [48]:
#matrix = np.array(patients_df, 'float')

In [49]:
# Uso 70% para entrenamiento (random split)
train_df= patients_df.sample(frac=0.7,random_state=200)
rest_df = patients_df.drop(train_df.index)
# Uso 15% para validacion y 15% para test
val_df=rest_df.sample(frac=0.5,random_state=200)
test_df=rest_df.drop(val_df.index)

In [50]:
bool_cols = ['region_northeast', 'region_northwest', 'region_southeast', 'region_southwest']
train_df[bool_cols] = train_df[bool_cols].astype(int)
X_train = train_df.drop(['region', 'charges'], axis=1).values
Y_train = train_df['charges'].values

In [51]:

def fit_model(X, Y, lr=0.00001, epochs=100, patience=10): 
    """Implementación de la función de entrenamiento por descendo de gracdiente

    Args:
        X (np array): vector de características nxm
        Y (np array): vector de variable objetivo
    """

    #Generacion de los thetas aleatorios
    n, m = X.shape
    theta = np.random.rand(m+1,1)
    #Se agrega la dimension
    X_c = np.hstack((np.ones((n,1)),X))
    loss_v = []
    best_loss = np.inf

    for epoch in range(epochs):
        Y_est = X_c.dot(theta)
        #Calcular la perdidad
        loss = np.sum(np.power(Y_est-Y,2))/(2.*n)
        loss_v.append(loss)
        #calculo gradientes
        gradientes = (-1/n)*(X_c.T.dot((Y - X_c.dot(theta))))
        #actualizar
        theta = theta - lr*gradientes

        # Esto se agrega para parar el entrenamiento en caso de que la perdida no disminuya por mas del valor del parametro patience
        if loss < best_loss:
            best_loss = loss
            best_theta = np.copy(theta)
            epochs_stall = 0
        else:
            epochs_stall+=1
        if epochs_stall>=patience:
            print('La funcion de perdida no ha disminuido, parando despues de {} epocas. el error es: {}'.format(epoch, loss))
            break
        print('Epoch: {} Loss: {:.4e}'.format(epoch, loss))

    print('El error fue: {:.4e}'.format(loss))
    return best_theta, loss_v


In [52]:
thetas, loss_v = fit_model(X_train, Y_train, 0.00001, 1000, 20)

Epoch: 0 Loss: 1.5000e+11
Epoch: 1 Loss: 1.4265e+11
Epoch: 2 Loss: 1.3568e+11
Epoch: 3 Loss: 1.2908e+11
Epoch: 4 Loss: 1.2282e+11
Epoch: 5 Loss: 1.1688e+11
Epoch: 6 Loss: 1.1125e+11
Epoch: 7 Loss: 1.0591e+11
Epoch: 8 Loss: 1.0085e+11
Epoch: 9 Loss: 9.6051e+10
Epoch: 10 Loss: 9.1501e+10
Epoch: 11 Loss: 8.7187e+10
Epoch: 12 Loss: 8.3096e+10
Epoch: 13 Loss: 7.9217e+10
Epoch: 14 Loss: 7.5539e+10
Epoch: 15 Loss: 7.2052e+10
Epoch: 16 Loss: 6.8745e+10
Epoch: 17 Loss: 6.5610e+10
Epoch: 18 Loss: 6.2637e+10
Epoch: 19 Loss: 5.9818e+10
Epoch: 20 Loss: 5.7145e+10
Epoch: 21 Loss: 5.4610e+10
Epoch: 22 Loss: 5.2206e+10
Epoch: 23 Loss: 4.9927e+10
Epoch: 24 Loss: 4.7766e+10
Epoch: 25 Loss: 4.5717e+10
Epoch: 26 Loss: 4.3774e+10
Epoch: 27 Loss: 4.1931e+10
Epoch: 28 Loss: 4.0183e+10
Epoch: 29 Loss: 3.8526e+10
Epoch: 30 Loss: 3.6955e+10
Epoch: 31 Loss: 3.5465e+10
Epoch: 32 Loss: 3.4051e+10
Epoch: 33 Loss: 3.2711e+10
Epoch: 34 Loss: 3.1440e+10
Epoch: 35 Loss: 3.0235e+10
Epoch: 36 Loss: 2.9092e+10
Epoch: 37 L