In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

In [2]:
# Creación de objeto pandas datafr-ame
patients_df=pd.read_csv('https://github.com/stedy/Machine-Learning-with-R-datasets/blob/master/insurance.csv?raw=true')
patients_df.to_csv("./data.csv")

In [3]:
# Creación variables binarias
patients_df.replace({'sex':{'male':0,'female':1}}, inplace=True)
patients_df.replace({'smoker':{'yes':1,'no':0}}, inplace=True)

In [4]:
# La función get dummies convierte un DataFrame de columnas categoricas a uno con variables dummy variables
region_dummies_df=pd.get_dummies(patients_df[['region']])
region_dummies_df = region_dummies_df.replace().astype(int)
region_dummies_df

Unnamed: 0,region_northeast,region_northwest,region_southeast,region_southwest
0,0,0,0,1
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0
4,0,1,0,0
...,...,...,...,...
1333,0,1,0,0
1334,1,0,0,0
1335,0,0,1,0
1336,0,0,0,1


In [5]:
# Hacemos join entre los 2 dataframes para reconstruir el dataset
patients_df = patients_df.join(region_dummies_df)

In [6]:
patients_df = patients_df.drop(['region'], axis=1)
patients_df

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,1,27.900,0,1,16884.92400,0,0,0,1
1,18,0,33.770,1,0,1725.55230,0,0,1,0
2,28,0,33.000,3,0,4449.46200,0,0,1,0
3,33,0,22.705,0,0,21984.47061,0,1,0,0
4,32,0,28.880,0,0,3866.85520,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,0,30.970,3,0,10600.54830,0,1,0,0
1334,18,1,31.920,0,0,2205.98080,1,0,0,0
1335,18,1,36.850,0,0,1629.83350,0,0,1,0
1336,21,1,25.800,0,0,2007.94500,0,0,0,1


In [7]:
# Uso 70% para entrenamiento (random split)
train_df= patients_df.sample(frac=0.7,random_state=200)
rest_df = patients_df.drop(train_df.index)
# Uso 15% para validacion y 15% para test
val_df=rest_df.sample(frac=0.5,random_state=200)
test_df=rest_df.drop(val_df.index)

In [34]:
class linear_regressor():
    def __init__(self):
        self.theta = None

    def fit_model(self, X, Y, lr=0.00001, epochs=100, patience=10, lambda_=0.01):
        n, m = X.shape
        self.theta = np.random.rand(m+1,1)
        X_c = np.hstack((np.ones((n,1)),X))
        loss_v = []
        best_loss = np.inf
        
        for epoch in range(epochs):
            Y_est = X_c.dot(self.theta)
            loss = np.sum(np.power(Y_est-Y,2))/(2.*n) + lambda_ * np.sum(np.square(self.theta[1:])) / 2.  # Added L2 regularization
            loss_v.append(loss)
            gradientes = (-1/n)*(X_c.T.dot((Y - X_c.dot(self.theta)))) + lambda_ * np.vstack([np.zeros((1,1)), self.theta[1:]])
            self.theta = self.theta - lr*gradientes
            
            if loss < best_loss:
                best_loss = loss
                best_theta = np.copy(self.theta)
                epochs_stall = 0
            else:
                epochs_stall+=1
                
            if epochs_stall>=patience:
                print('La funcion de perdida no ha disminuido, parando despues de {} epocas. el error es: {}'.format(epoch, loss))
                break
            print('Epoch: {} Loss: {:.4e}'.format(epoch, loss))
        
        print('El error fue: {:.4e}'.format(loss))
        self.theta = best_theta
        self.loss_vector = loss_v

    def predict(self, X):
        X_c = np.hstack((np.ones((X.shape[0],1)),X))
        Y_hat = X_c.dot(self.theta)
        return Y_hat

In [39]:
LR = linear_regressor()
X = train_df.drop(['charges'], axis=1).values
Y = train_df[['charges']].values
x_test = test_df.drop(['charges'], axis=1).values
y_test = test_df['charges'].values
LR.fit_model(X, Y)

Epoch: 0 Loss: 1.5957e+08
Epoch: 1 Loss: 1.5463e+08
Epoch: 2 Loss: 1.4995e+08
Epoch: 3 Loss: 1.4552e+08
Epoch: 4 Loss: 1.4131e+08
Epoch: 5 Loss: 1.3732e+08
Epoch: 6 Loss: 1.3354e+08
Epoch: 7 Loss: 1.2996e+08
Epoch: 8 Loss: 1.2656e+08
Epoch: 9 Loss: 1.2334e+08
Epoch: 10 Loss: 1.2028e+08
Epoch: 11 Loss: 1.1739e+08
Epoch: 12 Loss: 1.1464e+08
Epoch: 13 Loss: 1.1204e+08
Epoch: 14 Loss: 1.0957e+08
Epoch: 15 Loss: 1.0723e+08
Epoch: 16 Loss: 1.0501e+08
Epoch: 17 Loss: 1.0290e+08
Epoch: 18 Loss: 1.0091e+08
Epoch: 19 Loss: 9.9018e+07
Epoch: 20 Loss: 9.7225e+07
Epoch: 21 Loss: 9.5525e+07
Epoch: 22 Loss: 9.3914e+07
Epoch: 23 Loss: 9.2386e+07
Epoch: 24 Loss: 9.0937e+07
Epoch: 25 Loss: 8.9563e+07
Epoch: 26 Loss: 8.8261e+07
Epoch: 27 Loss: 8.7026e+07
Epoch: 28 Loss: 8.5855e+07
Epoch: 29 Loss: 8.4745e+07
Epoch: 30 Loss: 8.3693e+07
Epoch: 31 Loss: 8.2695e+07
Epoch: 32 Loss: 8.1749e+07
Epoch: 33 Loss: 8.0853e+07
Epoch: 34 Loss: 8.0002e+07
Epoch: 35 Loss: 7.9196e+07
Epoch: 36 Loss: 7.8432e+07
Epoch: 37 L

In [40]:
LR.theta.shape

(10, 1)

In [42]:

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

LR1 = LinearRegression()
LR1.fit(X,Y)
# Predicting on the test set
y_pred = LR.predict(x_test)

# Computing the MSE
mse = np.sum(np.power(y_pred-y_test,2))/(2.*X.shape[0])
print("El error en test es de {:.4}".format(mse))

El error en test es de 2.998e+09


In [85]:
y_pred.shape


(201, 937)

In [89]:
y_test.shape

(201,)