In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

In [4]:
# Creación de objeto pandas datafr-ame
patients_df=pd.read_csv('https://github.com/stedy/Machine-Learning-with-R-datasets/blob/master/insurance.csv?raw=true')
patients_df.to_csv("./data.csv")

In [5]:
# Creación variables binarias
patients_df.replace({'sex':{'male':0,'female':1}}, inplace=True)
patients_df.replace({'smoker':{'yes':1,'no':0}}, inplace=True)

In [6]:
# La función get dummies convierte un DataFrame de columnas categoricas a uno con variables dummy variables
region_dummies_df=pd.get_dummies(patients_df[['region']])
region_dummies_df = region_dummies_df.replace().astype(int)
region_dummies_df

Unnamed: 0,region_northeast,region_northwest,region_southeast,region_southwest
0,0,0,0,1
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0
4,0,1,0,0
...,...,...,...,...
1333,0,1,0,0
1334,1,0,0,0
1335,0,0,1,0
1336,0,0,0,1


In [7]:
# Hacemos join entre los 2 dataframes para reconstruir el dataset
patients_df = patients_df.join(region_dummies_df)

In [8]:
patients_df = patients_df.drop(['region'], axis=1)
patients_df

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,1,27.900,0,1,16884.92400,0,0,0,1
1,18,0,33.770,1,0,1725.55230,0,0,1,0
2,28,0,33.000,3,0,4449.46200,0,0,1,0
3,33,0,22.705,0,0,21984.47061,0,1,0,0
4,32,0,28.880,0,0,3866.85520,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,0,30.970,3,0,10600.54830,0,1,0,0
1334,18,1,31.920,0,0,2205.98080,1,0,0,0
1335,18,1,36.850,0,0,1629.83350,0,0,1,0
1336,21,1,25.800,0,0,2007.94500,0,0,0,1


In [9]:
# Uso 70% para entrenamiento (random split)
train_df= patients_df.sample(frac=0.7,random_state=200)
rest_df = patients_df.drop(train_df.index)
# Uso 15% para validacion y 15% para test
val_df=rest_df.sample(frac=0.5,random_state=200)
test_df=rest_df.drop(val_df.index)

In [2]:
class linear_regressor():
    def __init__(self):
        self.theta = None
    def fit_model(self, X, Y, lr=0.00001, epochs=100, patience=10): 
        """Implementación de la función de entrenamiento por descendo de gracdiente

        Args:
            X (np array): vector de características nxm
            Y (np array): vector de variable objetivo
        """

        #Generacion de los thetas aleatorios
        n, m = X.shape
        self.theta = np.random.rand(m+1,1)
        #Se agrega la dimension
        X_c = np.hstack((np.ones((n,1)),X))
        loss_v = []
        best_loss = np.inf

        for epoch in range(epochs):
            Y_est = X_c.dot(self.theta)
            #Calcular la perdidad
            loss = np.sum(np.power(Y_est-Y,2))/(2.*n)
            loss_v.append(loss)
            #calculo gradientes
            gradientes = (-1/n)*(X_c.T.dot((Y - X_c.dot(self.theta))))
            #actualizar
            self.theta = self.theta - lr*gradientes

            # Esto se agrega para parar el entrenamiento en caso de que la perdida no disminuya por mas del valor del parametro patience
            if loss < best_loss:
                best_loss = loss
                best_theta = np.copy(self.theta)
                epochs_stall = 0
            else:
                epochs_stall+=1
            if epochs_stall>=patience:
                print('La funcion de perdida no ha disminuido, parando despues de {} epocas. el error es: {}'.format(epoch, loss))
                break
            print('Epoch: {} Loss: {:.4e}'.format(epoch, loss))

        print('El error fue: {:.4e}'.format(loss))
        self.theta = best_theta
        self.loss_vector = loss_v
    def predict(self, X):
        Y_hat = X.dot(self.theta)
        return Y_hat


In [14]:

# Supongamos que todas las columnas excepto "cost" son características
X = patients_df.drop("charges", axis=1).values
y = patients_df['charges'].values

def generate_polynomial_features(X, degree=2):
    m, n = X.shape
    if degree == 1:
        return X
    else:
        features = [X]
        for deg in range(2, degree+1):
            for i in range(n):
                features.append(np.power(X[:, i], deg)[:, np.newaxis])
                for j in range(i+1, n):
                    features.append((X[:, i] * X[:, j])[:, np.newaxis])
        return np.hstack(features)

# Generar características polinómicas y de interacción
degree = 2
X_poly = generate_polynomial_features(X, degree)

# Transformaciones: aplicar logaritmo
# Evitamos logaritmos de cero o números negativos con np.clip
X_log = np.log(np.clip(X, a_min=1e-10, a_max=None))

# Concatenar características polinómicas y logarítmicas
X_transformed = np.hstack([X_poly, X_log])

class linear_regressor():
    def __init__(self):
        self.theta = None

    def fit_model(self, X, Y, lr=0.00001, epochs=100, patience=10, lambda_=0.01):
        n, m = X.shape
        self.theta = np.random.rand(m+1,1)
        X_c = np.hstack((np.ones((n,1)),X))
        loss_v = []
        best_loss = np.inf
        
        for epoch in range(epochs):
            Y_est = X_c.dot(self.theta)
            loss = np.sum(np.power(Y_est-Y,2))/(2.*n) + lambda_ * np.sum(np.square(self.theta[1:])) / 2.  # Added L2 regularization
            loss_v.append(loss)
            gradientes = (-1/n)*(X_c.T.dot((Y - X_c.dot(self.theta)))) + lambda_ * np.vstack([np.zeros((1,1)), self.theta[1:]])
            self.theta = self.theta - lr*gradientes
            
            if loss < best_loss:
                best_loss = loss
                best_theta = np.copy(self.theta)
                epochs_stall = 0
            else:
                epochs_stall+=1
                
            if epochs_stall>=patience:
                print('La funcion de perdida no ha disminuido, parando despues de {} epocas. el error es: {}'.format(epoch, loss))
                break
            print('Epoch: {} Loss: {:.4e}'.format(epoch, loss))
        
        print('El error fue: {:.4e}'.format(loss))
        self.theta = best_theta
        self.loss_vector = loss_v

    def predict(self, X):
        X_c = np.hstack((np.ones((X.shape[0],1)),X))
        Y_hat = X_c.dot(self.theta)
        return Y_hat

# Crear y entrenar el modelo
model = linear_regressor()
model.fit_model(X_transformed, y, lr=0.01, epochs=1000, patience=10, lambda_=0.01)

# Realizar predicciones con datos nuevos
# Tomar un solo ejemplo
new_data = patients_df.iloc[0, :-1].values.reshape(1, -1)

# Generar características polinómicas para ese ejemplo
new_data_poly = generate_polynomial_features(new_data, degree)

# Generar log transform para ese ejemplo
new_data_log = np.log(np.clip(new_data, a_min=1e-10, a_max=None))

# Concatenar las características transformadas
new_data_transformed = np.hstack([new_data_poly, new_data_log])

# Hacer la predicción
prediction = model.predict(new_data_transformed)
print(prediction)

Epoch: 0 Loss: 1.7334e+11


ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 1 and the array at index 1 has size 1338

In [3]:
LR = linear_regressor()
X = train_df.drop(['charges'], axis=1).values
Y = train_df['charges'].values
LR.fit_model(X, Y)

NameError: name 'train_df' is not defined

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
992,50,1,31.600,2,0,0,0,0,1
937,39,1,24.225,5,0,0,1,0,0
688,47,1,24.100,1,0,0,0,0,1
1185,45,0,23.560,2,0,1,0,0,0
1137,26,1,22.230,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
799,33,0,24.795,0,1,1,0,0,0
806,40,1,41.420,1,0,0,1,0,0
241,33,1,22.135,1,0,1,0,0,0
829,39,0,21.850,1,0,0,1,0,0
