In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

In [53]:
# Creación de objeto pandas datafr-ame
patients_df=pd.read_csv('https://github.com/stedy/Machine-Learning-with-R-datasets/blob/master/insurance.csv?raw=true')
patients_df.to_csv("./data.csv")

In [35]:
# Creación variables binarias
patients_df.replace({'sex':{'male':0,'female':1}}, inplace=True)
patients_df.replace({'smoker':{'yes':1,'no':0}}, inplace=True)

In [36]:
# La función get dummies convierte un DataFrame de columnas categoricas a uno con variables dummy variables
region_dummies_df=pd.get_dummies(patients_df[['region']])
region_dummies_df = region_dummies_df.replace().astype(int)
region_dummies_df

Unnamed: 0,region_northeast,region_northwest,region_southeast,region_southwest
0,0,0,0,1
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0
4,0,1,0,0
...,...,...,...,...
1333,0,1,0,0
1334,1,0,0,0
1335,0,0,1,0
1336,0,0,0,1


In [37]:
# Hacemos join entre los 2 dataframes para reconstruir el dataset
patients_df = patients_df.join(region_dummies_df)

In [38]:
patients_df = patients_df.drop(['region'], axis=1)
patients_df

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,1,27.900,0,1,16884.92400,0,0,0,1
1,18,0,33.770,1,0,1725.55230,0,0,1,0
2,28,0,33.000,3,0,4449.46200,0,0,1,0
3,33,0,22.705,0,0,21984.47061,0,1,0,0
4,32,0,28.880,0,0,3866.85520,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,0,30.970,3,0,10600.54830,0,1,0,0
1334,18,1,31.920,0,0,2205.98080,1,0,0,0
1335,18,1,36.850,0,0,1629.83350,0,0,1,0
1336,21,1,25.800,0,0,2007.94500,0,0,0,1


In [42]:
# Uso 70% para entrenamiento (random split)
train_df= patients_df.sample(frac=0.7,random_state=200)
rest_df = patients_df.drop(train_df.index)
# Uso 15% para validacion y 15% para test
val_df=rest_df.sample(frac=0.5,random_state=200)
test_df=rest_df.drop(val_df.index)

In [56]:
class linear_regressor():
    def __init__(self):
        self.theta = None
    def fit_model(self, X, Y, lr=0.00001, epochs=100, patience=10): 
        """Implementación de la función de entrenamiento por descendo de gracdiente

        Args:
            X (np array): vector de características nxm
            Y (np array): vector de variable objetivo
        """

        #Generacion de los thetas aleatorios
        n, m = X.shape
        self.theta = np.random.rand(m+1,1)
        #Se agrega la dimension
        X_c = np.hstack((np.ones((n,1)),X))
        loss_v = []
        best_loss = np.inf

        for epoch in range(epochs):
            Y_est = X_c.dot(self.theta)
            #Calcular la perdidad
            loss = np.sum(np.power(Y_est-Y,2))/(2.*n)
            loss_v.append(loss)
            #calculo gradientes
            gradientes = (-1/n)*(X_c.T.dot((Y - X_c.dot(self.theta))))
            #actualizar
            self.theta = self.theta - lr*gradientes

            # Esto se agrega para parar el entrenamiento en caso de que la perdida no disminuya por mas del valor del parametro patience
            if loss < best_loss:
                best_loss = loss
                best_theta = np.copy(self.theta)
                epochs_stall = 0
            else:
                epochs_stall+=1
            if epochs_stall>=patience:
                print('La funcion de perdida no ha disminuido, parando despues de {} epocas. el error es: {}'.format(epoch, loss))
                break
            print('Epoch: {} Loss: {:.4e}'.format(epoch, loss))

        print('El error fue: {:.4e}'.format(loss))
        self.theta = best_theta
        self.loss_vector = loss_v
    def predict(self, X):
        Y_hat = X.dot(self.theta)
        return Y_hat


In [57]:
LR = linear_regressor()
X = train_df.drop(['charges'], axis=1).values
Y = train_df['charges'].values
LR.fit_model(X, Y)

Epoch: 0 Loss: 1.4972e+11
Epoch: 1 Loss: 1.4239e+11
Epoch: 2 Loss: 1.3544e+11
Epoch: 3 Loss: 1.2885e+11
Epoch: 4 Loss: 1.2260e+11
Epoch: 5 Loss: 1.1667e+11
Epoch: 6 Loss: 1.1105e+11
Epoch: 7 Loss: 1.0573e+11
Epoch: 8 Loss: 1.0068e+11
Epoch: 9 Loss: 9.5886e+10
Epoch: 10 Loss: 9.1345e+10
Epoch: 11 Loss: 8.7039e+10
Epoch: 12 Loss: 8.2956e+10
Epoch: 13 Loss: 7.9085e+10
Epoch: 14 Loss: 7.5415e+10
Epoch: 15 Loss: 7.1934e+10
Epoch: 16 Loss: 6.8634e+10
Epoch: 17 Loss: 6.5505e+10
Epoch: 18 Loss: 6.2538e+10
Epoch: 19 Loss: 5.9724e+10
Epoch: 20 Loss: 5.7056e+10
Epoch: 21 Loss: 5.4527e+10
Epoch: 22 Loss: 5.2128e+10
Epoch: 23 Loss: 4.9853e+10
Epoch: 24 Loss: 4.7696e+10
Epoch: 25 Loss: 4.5651e+10
Epoch: 26 Loss: 4.3712e+10
Epoch: 27 Loss: 4.1872e+10
Epoch: 28 Loss: 4.0128e+10
Epoch: 29 Loss: 3.8474e+10
Epoch: 30 Loss: 3.6906e+10
Epoch: 31 Loss: 3.5419e+10
Epoch: 32 Loss: 3.4008e+10
Epoch: 33 Loss: 3.2671e+10
Epoch: 34 Loss: 3.1402e+10
Epoch: 35 Loss: 3.0199e+10
Epoch: 36 Loss: 2.9059e+10
Epoch: 37 L

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
992,50,1,31.600,2,0,0,0,0,1
937,39,1,24.225,5,0,0,1,0,0
688,47,1,24.100,1,0,0,0,0,1
1185,45,0,23.560,2,0,1,0,0,0
1137,26,1,22.230,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
799,33,0,24.795,0,1,1,0,0,0
806,40,1,41.420,1,0,0,1,0,0
241,33,1,22.135,1,0,1,0,0,0
829,39,0,21.850,1,0,0,1,0,0
