In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, MinMaxScaler, StandardScaler, PolynomialFeatures, Binarizer, KBinsDiscretizer, OrdinalEncoder
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector

In [56]:
data = pd.read_csv('dataset.csv')

In [57]:
bins = [0,18,25,30,35,40,50,60]
labels=['sous poids','poids normal','surpoids','obésité modérée','obésité sévère','obésité morbide','obésité massive']

data['BMI_cat']=pd.cut(data['bmi'], bins=bins, labels=labels, right=False)

data[data['bmi']>50]

data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,BMI_cat
0,19,female,27.9,0,yes,southwest,16884.924,surpoids
1,18,male,33.77,1,no,southeast,1725.5523,obésité modérée
2,28,male,33.0,3,no,southeast,4449.462,obésité modérée
3,33,male,22.705,0,no,northwest,21984.47061,poids normal
4,32,male,28.88,0,no,northwest,3866.8552,surpoids


In [58]:


X= data.drop(['charges','bmi'], axis=1)
y=data['charges']

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42, stratify=X['smoker'])

In [74]:
preprocessor = make_pipeline(make_column_transformer((StandardScaler(), ['children','age']),
                                                     (OrdinalEncoder(), ['smoker', 'sex']), (OneHotEncoder(),['region',"BMI_cat"])), PolynomialFeatures(2))

In [99]:
#Linear Regression

model = make_pipeline(preprocessor, LinearRegression())

param_grid = {
    'pipeline__polynomialfeatures__degree': [1,2,3]
}

grid_search = GridSearchCV(
    model,
    param_grid,
    cv=5,
    scoring='r2'
)


grid_search.fit(X_train, y_train)

print(grid_search.score(X_test,y_test))
print(grid_search.best_params_)


# print("Train score : ", model.score(X_test,y_test))


0.8713098897853842
{'pipeline__polynomialfeatures__degree': 2}


In [109]:
Lasso().get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': 1000,
 'positive': False,
 'precompute': False,
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'warm_start': False}

In [102]:
#Lasso

lasso_model = make_pipeline(preprocessor, Lasso())

param_grid = {
    'lasso__alpha': np.linspace(1,100,50)
}

grid_search = GridSearchCV(
    lasso_model,
    param_grid,
    cv=5,
    scoring='r2'
)


grid_search.fit(X_train,y_train)
grid_search.best_params_
best_lasso_model=lasso_model.set_params(**grid_search.best_params_)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [103]:


best_lasso_model.fit(X_train, y_train)

print("Train score : ", best_lasso_model.score(X_test,y_test))

Train score :  0.8742126856055694


In [64]:
#Ridge

ridge_model = make_pipeline(preprocessor, Ridge(random_state=42))

param_grid = {
    'ridge__alpha': np.linspace(1,100,100)
}

grid_search = GridSearchCV(
    ridge_model,
    param_grid,
    cv=5,
    scoring='r2'
)


grid_search.fit(X_train,y_train)
grid_search.best_params_

{'ridge__alpha': np.float64(43.0)}

In [70]:
ridge_model = make_pipeline(preprocessor, Ridge(43,random_state=42))

ridge_model.fit(X_train, y_train)

print("Train score : ", ridge_model.score(X_test,y_test))

Train score :  0.8424200603918353
