In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

In [2]:
# Creación de objeto pandas datafr-ame
patients_df=pd.read_csv('https://github.com/stedy/Machine-Learning-with-R-datasets/blob/master/insurance.csv?raw=true')
patients_df.to_csv("./data.csv")

In [3]:
# Creación variables binarias
patients_df.replace({'sex':{'male':0,'female':1}}, inplace=True)
patients_df.replace({'smoker':{'yes':1,'no':0}}, inplace=True)

In [4]:
# La función get dummies convierte un DataFrame de columnas categoricas a uno con variables dummy variables
region_dummies_df=pd.get_dummies(patients_df[['region']])
region_dummies_df = region_dummies_df.replace().astype(int)
region_dummies_df

Unnamed: 0,region_northeast,region_northwest,region_southeast,region_southwest
0,0,0,0,1
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0
4,0,1,0,0
...,...,...,...,...
1333,0,1,0,0
1334,1,0,0,0
1335,0,0,1,0
1336,0,0,0,1


In [5]:
# Hacemos join entre los 2 dataframes para reconstruir el dataset
patients_df = patients_df.join(region_dummies_df)

In [6]:
patients_df = patients_df.drop(['region'], axis=1)
patients_df

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,1,27.900,0,1,16884.92400,0,0,0,1
1,18,0,33.770,1,0,1725.55230,0,0,1,0
2,28,0,33.000,3,0,4449.46200,0,0,1,0
3,33,0,22.705,0,0,21984.47061,0,1,0,0
4,32,0,28.880,0,0,3866.85520,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,0,30.970,3,0,10600.54830,0,1,0,0
1334,18,1,31.920,0,0,2205.98080,1,0,0,0
1335,18,1,36.850,0,0,1629.83350,0,0,1,0
1336,21,1,25.800,0,0,2007.94500,0,0,0,1


In [7]:
# Uso 70% para entrenamiento (random split)
train_df= patients_df.sample(frac=0.7,random_state=200)
rest_df = patients_df.drop(train_df.index)
# Uso 15% para validacion y 15% para test
val_df=rest_df.sample(frac=0.5,random_state=200)
test_df=rest_df.drop(val_df.index)

In [36]:
class linear_regressor():
#    def __init__(self):
#        self.theta = None
    def fit_model(self, X, Y, lr=0.0001, epochs=100, patience=10): 
        """Implementación de la función de entrenamiento por descendo de gracdiente

        Args:
            X (np array): vector de características nxm
            Y (np array): vector de variable objetivo
        """

        #Generacion de los thetas aleatorios
        n, m = X.shape
        theta = np.random.rand(m+1,1)

        #Se agrega la dimension
        X_c = np.hstack((np.ones((n,1)),X))
        loss_v = []
        best_loss = np.inf

        for epoch in range(epochs):
            Y_est = X_c.dot(theta)
            #Calcular la perdidad
            loss = np.sum(np.power(Y_est-Y,2))/(2.*n)
            loss_v.append(loss)
            #calculo gradientes
            gradientes = (-1/n)*(X_c.T.dot((Y - X_c.dot(theta))))
            #actualizar
            print(theta.shape, (Y-X_c.dot(theta)).shape)

            theta = theta - lr*gradientes

            # Esto se agrega para parar el entrenamiento en caso de que la perdida no disminuya por mas del valor del parametro patience
            if loss < best_loss:
                best_loss = loss
                best_theta = np.copy(theta)
                epochs_stall = 0
            else:
                epochs_stall+=1
            if epochs_stall>=patience:
                print('La funcion de perdida no ha disminuido, parando despues de {} epocas. el error es: {}'.format(epoch, loss))
                break
            print('Epoch: {} Loss: {:.4e}'.format(epoch, loss))

        print('El error fue: {:.4e}'.format(loss))
        self.theta = np.copy(best_theta)
        self.loss_vector = loss_v
    def predict(self, X):
        X_c = np.hstack((np.ones((X.shape[0],1)),X))
        print(X_c.shape, self.theta.shape)
        Y_hat = X_c.dot(self.theta)
        return Y_hat


In [37]:
LR = linear_regressor()
X = train_df.drop(['charges'], axis=1).values
Y = train_df['charges'].values
x_test = test_df.drop(['charges'], axis=1).values
y_test = test_df['charges'].values
LR.fit_model(X, Y)

(10, 1) (937, 937)
Epoch: 0 Loss: 1.4961e+11
(10, 937) (937, 937)
Epoch: 1 Loss: 8.5120e+10
(10, 937) (937, 937)
Epoch: 2 Loss: 5.0055e+10
(10, 937) (937, 937)
Epoch: 3 Loss: 3.0976e+10
(10, 937) (937, 937)
Epoch: 4 Loss: 2.0583e+10
(10, 937) (937, 937)
Epoch: 5 Loss: 1.4908e+10
(10, 937) (937, 937)
Epoch: 6 Loss: 1.1797e+10
(10, 937) (937, 937)
Epoch: 7 Loss: 1.0079e+10
(10, 937) (937, 937)
Epoch: 8 Loss: 9.1186e+09
(10, 937) (937, 937)
Epoch: 9 Loss: 8.5700e+09
(10, 937) (937, 937)
Epoch: 10 Loss: 8.2457e+09
(10, 937) (937, 937)
Epoch: 11 Loss: 8.0436e+09
(10, 937) (937, 937)
Epoch: 12 Loss: 7.9085e+09
(10, 937) (937, 937)
Epoch: 13 Loss: 7.8101e+09
(10, 937) (937, 937)
Epoch: 14 Loss: 7.7322e+09
(10, 937) (937, 937)
Epoch: 15 Loss: 7.6658e+09
(10, 937) (937, 937)
Epoch: 16 Loss: 7.6061e+09
(10, 937) (937, 937)
Epoch: 17 Loss: 7.5504e+09
(10, 937) (937, 937)
Epoch: 18 Loss: 7.4974e+09
(10, 937) (937, 937)
Epoch: 19 Loss: 7.4462e+09
(10, 937) (937, 937)
Epoch: 20 Loss: 7.3964e+09
(10,

In [108]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

LR1 = LinearRegression()
LR1.fit(X,Y)
# Predicting on the test set
y_pred = LR.predict(x_test)

# Computing the MSE
mse = np.sum(np.power(y_pred-y_test,2))/(2.*n)
print("El error en test es de {:.4}".format(mse))

(201, 10) (10, 937)


ValueError: operands could not be broadcast together with shapes (201,937) (201,) 

In [85]:
y_pred.shape


(201, 937)

In [89]:
y_test.shape

(201,)