In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

## Data loading

In [3]:
# Read with explicit column names. Because `names` is given, the first row of
# the file (presumably its own header line) arrives as data and is dropped.
data = pd.read_csv("krankenkasse-mit-pbf.csv", sep=";", names=['age', 'sex', 'bmi', 'children', 'smoker', 'canton', "pbf", 'charges'])
# .copy() so the column assignments below write to an independent frame
# instead of a slice of the freshly read one.
data = data.iloc[1:].copy()

# The charges contain ’ characters (apparently a thousands separator); strip
# them before converting. pd.to_numeric is called on whole columns rather than
# element by element.
data['charges'] = pd.to_numeric(data['charges'].str.replace("’", "", regex=False))
for numeric_column in ('pbf', 'bmi', 'age', 'children'):
    data[numeric_column] = pd.to_numeric(data[numeric_column])

# Encode categorical variables as integer codes (numbered in order of first
# appearance).
data["sex"] = pd.factorize(data['sex'])[0]
data["smoker"] = pd.factorize(data['smoker'])[0]
data['canton'] = pd.factorize(data['canton'])[0]

# Keep only records with a positive percentage of body fat
# (1 record has a negative value). Copy again so that adding the
# charges_level column below does not write into a filtered view.
data = data[data["pbf"] > 0].copy()

# Divide the charges into categories. Is there a more objective way to determine the categories? K-clustering?
# Charges above the last bin edge are left without a category (NaN).
bins = [0, 5000, 10000, 20000, 100000]
bin_labels = [0, 1, 2, 3]
data["charges_level"] = pd.cut(data["charges"], bins=bins, labels=bin_labels, include_lowest=True)

data.head()



data.sort_values(by='charges', ascending=False, na_position='first')

Unnamed: 0,age,sex,bmi,children,smoker,canton,pbf,charges,charges_level
544,54,0,40.328,0,0,1,65.78,59703,3
902,60,1,35.136,0,0,1,37.99,43749,3
282,54,1,34.852,3,0,3,37.43,42668,3
531,57,1,36.104,1,0,1,39.30,40939,3
40,60,1,34.320,0,0,0,36.26,40748,3
1063,59,1,35.312,1,0,1,37.86,39746,3
570,48,1,34.852,2,0,2,37.15,39266,3
861,37,0,40.480,2,0,0,66.15,39136,3
1123,53,0,31.888,3,0,2,50.79,38475,3
952,51,1,36.720,2,0,1,40.61,38105,3


## Implementation of the ridge regression functions

In [4]:
from sklearn.base import BaseEstimator

class RidgeRegression(BaseEstimator):
    '''Ridge regression estimator with an unpenalized intercept.

    The fitted model is ``beta_[0] + X @ beta_[1:]``: ``beta_[0]`` is the
    intercept and the remaining entries are one coefficient per feature
    column. Only the feature coefficients are regularized.
    '''

    def __init__(self, opt_method='gd', alpha=1., eta=0.01, maxsteps=100, eps=0.00000001):
        '''Implements a Ridge Regression estimator.

        Arguments
        ---------
        alpha:      Regularization proportionality factor. Larger values
                    correspond with stronger regularization.
        opt_method: Optimization method to choose for the cost function.
                    'neq' solves the normal equation. 'gd' (Gradient
                    Descent) is the default value but is not implemented
                    yet, so fit() raises NotImplementedError for it.
        maxsteps:   Maximum number of Gradient Descent steps to take.
        eps:        Epsilon, length of gradient to be reached with Gradient
                    Descent.
        eta:        Fixed step length to take at each gradient descent
                    iteration.
        '''
        # parameters
        self.alpha = alpha
        self.opt_method = opt_method
        self.maxsteps = maxsteps
        self.eps = eps
        self.eta = eta
        # attributes
        # model coefficients (None until a fit has been performed)
        self.beta_ = None
        # values of cost function along gradient descent iterations
        self.costs_ = []

    def fit(self, X, y):
        '''Fits the model with the configured optimization method.

        Arguments
        ---------
        X: 2-D array of features, one row per sample.
        y: Target values, one per sample.

        Returns the coefficient vector (also stored in ``self.beta_``).

        Raises NotImplementedError for opt_method='gd' and ValueError for
        any other unknown method. Both are Exception subclasses, so callers
        catching the generic Exception keep working.
        '''
        if self.opt_method == "neq":
            return self.normalequation(X, y)
        if self.opt_method == "gd":
            raise NotImplementedError(
                "Gradient descent ('gd') is not implemented yet; use opt_method='neq'."
            )
        raise ValueError("No available optimization method was chosen.")

    def normalequation(self, X, y):
        '''Computes the coefficients of the ridge regression cost function
        using the normal equation.

        Solves ``(A.T A + alpha * P) beta = A.T y`` where ``A`` is ``X`` with
        a leading column of ones and ``P`` is the identity matrix whose first
        diagonal entry is zero, so the intercept is not regularized.

        Returns the coefficient vector and stores it in ``self.beta_``.
        '''
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        # Design matrix: a column of ones for the intercept, then the features.
        design = np.column_stack((np.ones(X.shape[0]), X))

        # Penalty matrix: identity, except that the intercept is left free.
        penalty = np.identity(design.shape[1])
        penalty[0, 0] = 0.0

        # Solving the linear system is more accurate and cheaper than forming
        # the explicit inverse and multiplying.
        self.beta_ = np.linalg.solve(
            design.T @ design + self.alpha * penalty,
            design.T @ y,
        )

        return self.beta_

    @staticmethod
    def costfunction(beta, X, y, alpha):
        '''Computes and returns the value of the ridge regression cost function.

        ``sum((y - (beta[0] + X @ beta[1:]))**2) + alpha * sum(beta[1:]**2)``

        The intercept ``beta[0]`` is excluded from the penalty so that this
        is the function minimized by ``normalequation``.
        '''
        beta = np.asarray(beta, dtype=float)
        residuals = np.asarray(y, dtype=float) - (beta[0] + np.dot(X, beta[1:]))
        cost = np.sum(residuals ** 2) + alpha * np.sum(beta[1:] ** 2)

        return cost

    def predict(self, X):
        '''Computes the predictions of the current model.

        X must have one column per feature used when fitting. Returns one
        prediction per row of X. Raises an Exception if no coefficients have
        been computed yet.
        '''
        if self.beta_ is None:
            raise Exception("Model coefficient haven't been calculated yet. Please call the specific functions to calculate the coefficients.")

        return self.beta_[0] + np.dot(X, self.beta_[1:])

    def score(self, X, y):
        '''Returns R^2 for given input/output data given the model
        coefficients.

        R^2 = 1 - (sum of squares of residuals / total sum of squares), with
        total sum of squares = sum of (yi - mean(y))^2.
        '''
        y = np.asarray(y, dtype=float)
        fi = self.predict(X)

        ss_res = np.sum((y - fi) ** 2)
        ss_tot = np.sum((y - np.mean(y)) ** 2)

        score = 1 - (ss_res / ss_tot)

        return score

In [5]:
from sklearn.model_selection import train_test_split

# Regression target: the charges column as a plain array.
labels = np.array(data['charges'])

# Feature table: every column except the target and its binned version.
# NOTE(review): `input` shadows the Python builtin of the same name; the name
# is kept because other cells may refer to it.
input = data.drop(columns=['charges', 'charges_level'])

# Remember the feature names, then convert to an array for the split function.
input_list = input.columns.tolist()
input = np.array(input)

# 80/20 split with a fixed seed so the partition is reproducible.
train_input, test_input, train_labels, test_labels = train_test_split(
    input, labels, test_size=0.2, random_state=42
)

In [6]:
# Instantiate the regressor with the normal-equation solver.
RidgeRegressor = RidgeRegression(alpha=10, opt_method='neq')

# fit() returns the coefficient vector: intercept first, then one
# coefficient per feature column.
ridge1 = RidgeRegressor.fit(train_input, train_labels)

ridge1

array([-8640.98653598,   155.4508523 ,  2551.42848186,   244.00368619,
         151.13324312, -6968.88739717,   489.55537718,   187.66108318])

In [7]:
# Spot-check the predict function on six rows of the training data.
test = train_input[10:16]
print("Input Data: ", test)
print("Real Charges: ", train_labels[10:16])
predics = RidgeRegressor.predict(test)
print("Predicted Charges: ", predics)

Input Data:  [[28.     0.    16.232  0.     1.     3.    27.37 ]
 [19.     0.    28.088  0.     1.     2.    44.53 ]
 [26.     1.    24.212  3.     1.     3.    18.54 ]
 [31.     0.    28.62   2.     1.     2.    42.88 ]
 [38.     0.    32.584  0.     1.     1.    51.18 ]
 [25.     1.    23.376  0.     1.     3.    15.62 ]]
Real Charges:  [4045 3904 2293 6505 7241 1655]
(8,)
Predicted Charges:  [-691.63225637 3532.92658658 2292.39630162 5520.77247423 8341.92417936
  931.58827544]


## TODO?

- Backwards/Forwards Selection
- Metrics
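- Check factorization? (`canton` is a nominal variable, so its integer codes impose an artificial order on the linear model; consider one-hot encoding)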
- Check outliers
- Logarithms?
- Orthogonalization using PCA?
- Polynomials?
- Se