## Import

In [1]:
import os
import psycopg2
from dotenv import load_dotenv

import pandas as pd
import numpy as np

# preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline


# machine learning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import learning_curve
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge


# metriques 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import median_absolute_error

# Monitoring 
import mlflow
import mlflow.sklearn

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline
plt.style.use('seaborn')

# Affichage cellule
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_row', 1000)
from pprint import pprint

# Chargement du model pour déploiement
import pickle

In [2]:


load_dotenv()  # Nécessaire pour charger les variables d'environnement précédemment définies

# Créer une connexion à postgres
connection = psycopg2.connect(host=os.environ.get('PG_HOST'),
                        user=os.environ.get('PG_USER'),
                        password=os.environ.get('PG_PASSWORD'),
                        dbname=os.environ.get('PG_DATABASE'))

connection.autocommit = True  # Assurez-vous que les données sont ajoutées à la base de données immédiatement après les commandes d'écriture.
cursor = connection.cursor()
cursor.execute('SELECT %s as connected;', ('Connection à postgres Réussie!',))
print(cursor.fetchone())

('Connection à postgres Réussie!',)


In [3]:
def postgresql_to_dataframe(conn, select_query, column_names):
    """
    Transformer une requête SELECT en un dataframe pandas
    """
    cursor = conn.cursor()
    try:
        cursor.execute(select_query)
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        cursor.close()
        return 1
    
    # Naturellement, nous obtenons une liste de "tupples".
    tupples = cursor.fetchall()
    cursor.close()
    
    # Nous devons juste le transformer en dataframe pandas.
    df = pd.DataFrame(tupples, columns=column_names)
    return df

### Récupération de postgresql au dataframe

In [4]:
conn = connection
column_names = ["user_id","gender", "age", "height", "weight", "duration", "heart_rate", "body_temp", "calorie"]
# Execute the "SELECT *" query
df_db = postgresql_to_dataframe(conn, 
"SELECT persons.user_id as id, gender, age, height, weight, duration, heart_rate, body_temp,calorie FROM calories INNER JOIN persons ON calories.user_id = persons.user_id"
                                , column_names)
df_db.head()

Unnamed: 0,user_id,gender,age,height,weight,duration,heart_rate,body_temp,calorie
0,14733363,male,68,190.0,94.0,29.0,105.0,40.8,231.0
1,14861698,female,20,166.0,60.0,14.0,94.0,40.3,66.0
2,11179863,male,69,179.0,79.0,5.0,88.0,38.7,26.0
3,16180408,female,34,179.0,71.0,13.0,100.0,40.5,71.0
4,17771927,female,27,154.0,58.0,10.0,81.0,39.8,35.0


## Monitoring par modèles

In [5]:
# # utilisation des doubles crochets pour renvoyer un DataFrame pour la variable  X
X = df_db[['duration']]
# y unidimensionnel
y = df_db['calorie']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=10)

In [6]:
mlflow.set_tracking_uri('/Users/marinelafargue/Desktop/projet calorie/mlflow/mlruns')

In [7]:
experiment_id = mlflow.create_experiment('Projet_Diet_Simplon_Monitoring')

### 1) model_lineaire

In [8]:
with mlflow.start_run(experiment_id = experiment_id):
    
    fit_intercept=True
    normalize=True
    n_jobs=2

    model_lineaire = LinearRegression(fit_intercept=fit_intercept, normalize=normalize, n_jobs=n_jobs)
    model_lineaire.fit(X_train,y_train)
    model_lineaire.score(X_test,y_test)
    
    yPrediction = model_lineaire.predict(X_test)
    
    # Log model
    mlflow.sklearn.log_model(model_lineaire, "model_lineaire")
    
    MAE = mean_absolute_error(y_test, yPrediction)
    print("MAE: {}". format(MAE))
    RMSE = mean_squared_error(y_test, yPrediction)
    print("Root Mean Squared Error: {}". format(RMSE))
    R2 = r2_score(y_test, yPrediction)
    print("R2: {}". format(R2))
    NMAE = median_absolute_error(y_test, yPrediction)
    print("Median Mean Squared Error: {}". format(NMAE))
    
    mlflow.log_param('fit_intercept', fit_intercept)
    mlflow.log_param('normalize', normalize)
    mlflow.log_param('n_jobs', n_jobs)
    
    mlflow.log_metric('MAE',MAE)
    mlflow.log_metric('RMSE',RMSE)
    mlflow.log_metric('R2',R2)
    mlflow.log_metric('NMAE',NMAE)

MAE: 13.522473647813024
Root Mean Squared Error: 338.17003713643567
R2: 0.9132237752267383
Median Mean Squared Error: 10.384579260526348


### 2) random forest

In [9]:
ohe=OneHotEncoder(sparse=False)
scaler = MinMaxScaler()
ct=make_column_transformer((ohe,['gender']),
                      (scaler,  ['duration','heart_rate','body_temp']),remainder='passthrough')
data_ct=ct.fit_transform(df_db)

# Encodage sur le genre et retransformation en dataframe pandas nommé dum
ohe_True=OneHotEncoder(sparse=True)
ohe_with_sparse=ohe_True.fit_transform(df_db['gender'].values.reshape(-1,1)).toarray()
ohe_with_sparse

dum=pd.get_dummies(df_db,columns=['gender'],prefix='',prefix_sep='')

In [10]:
X = dum[['age', 'weight', 'duration', 'heart_rate','body_temp','female', 'male']]
y = dum.calorie

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [11]:
with mlflow.start_run(experiment_id = experiment_id):

    # Set the model parameters. 
    n_estimators = 100
    max_depth = 6
  
    # Create and train model.
    rf = RandomForestRegressor(n_estimators = n_estimators, max_depth = max_depth)
    rf.fit(X_train, y_train)
  
    # Use the model to make predictions on the test dataset.
    yPrediction = rf.predict(X_test)
    
    # Log model
    mlflow.sklearn.log_model(rf, "RandomForestRegressor")
    
    MAE = mean_absolute_error(y_test, yPrediction)
    print("MAE: {}". format(MAE))
    RMSE = mean_squared_error(y_test, yPrediction)
    print("Root Mean Squared Error: {}". format(RMSE))
    R2 = r2_score(y_test, yPrediction)
    print("R2: {}". format(R2))
    NMAE = median_absolute_error(y_test, yPrediction)
    print("Median Mean Squared Error: {}". format(NMAE))
    
    mlflow.log_param('n_estimators', n_estimators)
    mlflow.log_param('max_depth', max_depth)
    
    mlflow.log_metric('MAE',MAE)
    mlflow.log_metric('RMSE',RMSE)
    mlflow.log_metric('R2',R2)
    mlflow.log_metric('NMAE',NMAE)

MAE: 5.459854078144971
Root Mean Squared Error: 63.84568119378923
R2: 0.9838259314902525
Median Mean Squared Error: 3.6715093947254545


### 3) ridge

In [12]:
X = dum[['age', 'weight', 'duration', 'heart_rate','body_temp','female', 'male']]
y = dum.calorie

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [13]:
with mlflow.start_run(experiment_id = experiment_id):
    
    # Set the model parameters. 
    alpha = 0.4
    normalize = True

    ridge = Ridge(alpha = alpha, normalize = normalize)
    # Naive model
    ridge_model = ridge.fit(X_train, y_train)
    ridge_pred = ridge.predict(X_test)
  
    # Log model
    mlflow.sklearn.log_model(ridge_model, "ridge-linear-regression-model")
      
    # Metrics
    MAE = mean_absolute_error(y_test, yPrediction)
    print("MAE: {}". format(MAE))
    RMSE = mean_squared_error(y_test, yPrediction)
    print("Root Mean Squared Error: {}". format(RMSE))
    R2 = r2_score(y_test, yPrediction)
    print("R2: {}". format(R2))
    NMAE = median_absolute_error(y_test, yPrediction)
    print("Median Mean Squared Error: {}". format(NMAE))

    mlflow.log_param('alpha', alpha)
    mlflow.log_param('normalize', normalize)
    
    mlflow.log_metric('MAE',MAE)
    mlflow.log_metric('RMSE',RMSE)
    mlflow.log_metric('R2',R2)
    mlflow.log_metric('NMAE',NMAE)

MAE: 5.459854078144971
Root Mean Squared Error: 63.84568119378923
R2: 0.9838259314902525
Median Mean Squared Error: 3.6715093947254545


### 4) LASSO

In [14]:
X = dum[['age', 'weight', 'duration', 'heart_rate','body_temp','female', 'male']]
y = dum.calorie

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [15]:
with mlflow.start_run(experiment_id = experiment_id):

    # Set the model parameters. 
    alpha=0
  
    # Create and train model.
    ls = linear_model.Lasso(alpha=alpha)
    ls.fit(X_train,y_train)
  
    # Use the model to make predictions on the test dataset.
    yPrediction = rf.predict(X_test)
    
      # Log model
    mlflow.sklearn.log_model(ls, "Lasso-model")
  
    # Metrics
    MAE = mean_absolute_error(y_test, yPrediction)
    print("MAE: {}". format(MAE))
    RMSE = mean_squared_error(y_test, yPrediction)
    print("Root Mean Squared Error: {}". format(RMSE))
    R2 = r2_score(y_test, yPrediction)
    print("R2: {}". format(R2))
    NMAE = median_absolute_error(y_test, yPrediction)
    print("Median Mean Squared Error: {}". format(NMAE))
 
    #log parmas
    mlflow.log_param('alpha', alpha)
  
    # log metrics
    mlflow.log_metric('MAE',MAE)
    mlflow.log_metric('RMSE',RMSE)
    mlflow.log_metric('R2',R2)
    mlflow.log_metric('NMAE',NMAE)

MAE: 5.459854078144971
Root Mean Squared Error: 63.84568119378923
R2: 0.9838259314902525
Median Mean Squared Error: 3.6715093947254545


## Regression linéaire multiple

In [16]:
# utilisation des doubles crochets pour renvoyer un DataFrame pour la variable  X
X = df_db[['age', 'height', 'weight', 'duration', 'heart_rate', 'body_temp']]
# y unidimensionnel
y = df_db['calorie']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=10)

In [17]:
with mlflow.start_run(experiment_id = experiment_id):
    
    # Set the model parameters. 
    fit_intercept = True
    normalize = True
    
    #regression linéaire avec plusieurs features : 'age', 'height', 'weight', 'duration', 'heart_rate', 'body_temp'
    model_lineaire_multiple = LinearRegression(fit_intercept=fit_intercept,normalize=normalize)
    model_lineaire_multiple.fit(X_train,y_train)

    yPrediction = model_lineaire_multiple.predict(X_test)
    
    # Log model
    mlflow.sklearn.log_model(model_lineaire_multiple, "model_lineaire_multiple")
    
    # Metrics
    MAE = mean_absolute_error(y_test, yPrediction)
    print("MAE: {}". format(MAE))
    RMSE = mean_squared_error(y_test, yPrediction)
    print("Root Mean Squared Error: {}". format(RMSE))
    R2 = r2_score(y_test, yPrediction)
    print("R2: {}". format(R2))
    NMAE = median_absolute_error(y_test, yPrediction)
    print("Median Mean Squared Error: {}". format(NMAE))
    
    #log params
    mlflow.log_param('fit_intercept', fit_intercept)
    mlflow.log_param('normalize', normalize)
    
    #log metrics
    mlflow.log_metric('MAE',MAE)
    mlflow.log_metric('RMSE',RMSE)
    mlflow.log_metric('R2',R2)
    mlflow.log_metric('NMAE',NMAE)
    

MAE: 8.429490621841946
Root Mean Squared Error: 129.02388515509887
R2: 0.9668917868828775
Median Mean Squared Error: 6.515738424173975
