# Random forest

In [1]:
# Standard library
import os
import sys

# DB
import psycopg
import psycopg2

# Data
import pandas as pd
import numpy as np

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline
from yellowbrick.model_selection import learning_curve

import mlflow
import mlflow.sklearn

# Machine learning
import sklearn
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error
from sklearn.model_selection import validation_curve

# Affichage cellule
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_row', 1000)
from pprint import pprint

# Chargement du model pour déploiement
import pickle
import joblib



In [2]:
#pip install mlflow

In [3]:
!mlflow --version

mlflow, version 1.28.0


In [4]:
# Location of the MLflow tracking store. When the MLFLOW_TRACKING_URI
# environment variable is set it takes precedence, so the notebook is not tied
# to one machine's home directory; otherwise the original local file store is
# used unchanged.
mlflow.set_tracking_uri(
    os.environ.get(
        "MLFLOW_TRACKING_URI",
        "file:///Users/marinelafargue/Desktop/projet calorie/mlruns",
    )
)

In [15]:
# Reuse the experiment when it already exists: create_experiment() raises if
# the name is taken, which would break every re-run of this notebook.
_existing_experiment = mlflow.get_experiment_by_name("training experiment")
if _existing_experiment is not None:
    experiment_id = _existing_experiment.experiment_id
else:
    experiment_id = mlflow.create_experiment("training experiment")

In [16]:
# Turn on MLflow autologging for the supported libraries.
# NOTE(review): per the MLflow documentation, log_models=False should skip
# saving fitted models as artifacts and exclusive=True should keep autologged
# content out of user-created runs — confirm against the installed version.
mlflow.autolog(log_models=False, exclusive=True)

2022/08/26 17:00:59 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/08/26 17:00:59 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.


## Data Engineering

### Exploration et validation

Profilage des données afin d'obtenir des informations sur leur contenu 
et leur structure

- **Target** : calorie
- **ligne et colonne** : 15000 lignes et 9 colonnes
- **type de variable** : Numérique et catégorique (1)
- **valeur manquante** : Aucune

### Création d'une fonction pour se connecter à la DB via psycopg2

In [8]:
# Connection settings for the PostgreSQL database.
# Every value can be overridden through an environment variable so that real
# credentials do not have to be stored in the notebook; the fallbacks are the
# original local-development values.
param_dic = {
    "host"      : os.environ.get("DIET_DB_HOST", "localhost"),
    "database"  : os.environ.get("DIET_DB_NAME", "diet"),
    "user"      : os.environ.get("DIET_DB_USER", "clement"),
    "password"  : os.environ.get("DIET_DB_PASSWORD", "password"),
}
def connect(params_dic):
    """Open a connection to the PostgreSQL database server.

    Parameters
    ----------
    params_dic : dict
        Keyword arguments forwarded unchanged to ``psycopg2.connect``.

    Returns
    -------
    The connection object returned by ``psycopg2.connect``.

    Raises
    ------
    SystemExit
        With exit code 1 when the connection attempt fails; the underlying
        error is printed first.
    """
    print('Connecting to the PostgreSQL database...')
    try:
        conn = psycopg2.connect(**params_dic)
    except Exception as error:
        # psycopg2 errors derive from Exception, so one clause covers them all.
        print(error)
        sys.exit(1)
    print("Connection successful")
    return conn

### Fonction pour transformer la DB en Dataframe


In [9]:
def postgresql_to_dataframe(conn, select_query, column_names):
    """Run a SELECT query and return its rows as a pandas DataFrame.

    Parameters
    ----------
    conn
        Open database connection; only its ``cursor()`` method is used.
    select_query : str
        SQL text passed to ``cursor.execute``.
    column_names : list of str
        Column labels given to the resulting DataFrame.

    Returns
    -------
    pandas.DataFrame
        One row per fetched tuple. If executing the query fails, the error is
        printed and the integer ``1`` is returned instead (sentinel kept for
        backward compatibility).
    """
    cursor = conn.cursor()
    try:
        try:
            cursor.execute(select_query)
        except Exception as error:
            # Database errors derive from Exception, so this covers them too.
            print("Error: %s" % error)
            return 1

        # fetchall() yields the result set as a list of row tuples.
        rows = cursor.fetchall()
    finally:
        # Release the cursor on every path, including a failing fetch.
        cursor.close()

    return pd.DataFrame(rows, columns=column_names)

In [10]:
# Open the database connection using the settings in param_dic.
conn = connect(param_dic)

# Labels applied, in order, to the nine columns returned by the query below.
column_names = [
    "user_id", "gender", "age", "height", "weight",
    "duration", "heart_rate", "body_temp", "calorie",
]

# Join every calorie record to its person and load the rows into a DataFrame.
df_db = postgresql_to_dataframe(
    conn,
    "SELECT persons.user_id as id, gender, age, height, weight, "
    "duration, heart_rate, body_temp,calorie "
    "FROM calories INNER JOIN persons ON calories.user_id = persons.user_id",
    column_names,
)
df_db.head()

Connecting to the PostgreSQL database...
Connection successful


Unnamed: 0,user_id,gender,age,height,weight,duration,heart_rate,body_temp,calorie
0,14733363,male,68,190.0,94.0,29.0,105.0,40.8,231.0
1,14861698,female,20,166.0,60.0,14.0,94.0,40.3,66.0
2,11179863,male,69,179.0,79.0,5.0,88.0,38.7,26.0
3,16180408,female,34,179.0,71.0,13.0,100.0,40.5,71.0
4,17771927,female,27,154.0,58.0,10.0,81.0,39.8,35.0


### Exploration et validation

In [None]:
# A REMPLIR

In [None]:
#sns.pairplot(df_db)



### Entraînement du modèle et visualisation de la prédiction



In [11]:
# Features: every column except the identifier and the target. Target: calorie.
x = df_db.drop(columns=['user_id','calorie'])
y = df_db['calorie']

# Hold out 30% of the rows for testing; the seed keeps the split reproducible.
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=10)

# Learn the gender -> integer mapping on the training split only, then apply
# that same mapping to the test split. Re-fitting on the test split could
# assign different codes if its set of categories differed from training.
le = LabelEncoder()
X_train['gender'] = le.fit_transform(X_train['gender'])
X_test['gender'] = le.transform(X_test['gender'])

# Random forest

In [12]:
# Candidate values for each random-forest hyper-parameter.

# Number of trees: 10 evenly spaced values between 100 and 2000, truncated to int.
n_estimators = np.linspace(start=100, stop=2000, num=10).astype(int).tolist()
# Number of features considered at every split.
max_features = ['auto', 'sqrt']
# Maximum tree depth: 10, 20, ..., 110, plus None as a final candidate.
max_depth = np.linspace(10, 110, num=11).astype(int).tolist() + [None]
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample.
bootstrap = [True]
# Split-quality criteria to try.
criterion = ['squared_error', 'absolute_error', 'poisson']

In [17]:

with mlflow.start_run(experiment_id=experiment_id) as run:
    # Baseline forest trained with the estimator's default hyper-parameters.
    rf = RandomForestRegressor()
    rf.fit(X_train,y_train)
    ypred = rf.predict(X_test)

    # Hyper-parameters the fitted model actually used.
    mlflow.log_params(rf.get_params())

    # Candidate values prepared for the later hyper-parameter search. They are
    # logged under a "search_space_" prefix so they are not mistaken for, and
    # do not collide with, the fitted model's own parameters logged above.
    _search_space = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap,
        'criterion': criterion,
    }
    for _name, _candidates in _search_space.items():
        mlflow.log_param('search_space_' + _name, _candidates)

    # Regression metrics on the held-out split, computed from ypred.
    mlflow.log_metric('test_mae', mean_absolute_error(y_test, ypred))
    mlflow.log_metric('test_r2', r2_score(y_test, ypred))
    

In [18]:
# Re-open the training run to attach the fitted model to it. Calling
# log_model() with no active run makes MLflow start a new run implicitly and
# leave it active for the cells that follow.
with mlflow.start_run(run_id=run.info.run_id):
    mlflow.sklearn.log_model(rf, "rf_BBBB_models")

In [29]:
def evaluate(model, test_features, test_labels):
    """Print and return a MAPE-based accuracy for a fitted regressor.

    Parameters
    ----------
    model
        Object exposing ``predict(test_features)``.
    test_features
        Feature matrix passed unchanged to ``model.predict``.
    test_labels
        True target values, one per prediction.

    Returns
    -------
    float
        ``100 - MAPE`` in percent. Rows whose true label is 0 are left out of
        the MAPE because the relative error is undefined there; if every label
        is 0 the result is NaN.
    """
    # Work on plain arrays so a pandas index on the labels cannot misalign them
    # with the predictions.
    predictions = np.asarray(model.predict(test_features), dtype=float)
    labels = np.asarray(test_labels, dtype=float)

    errors = np.abs(predictions - labels)

    # Relative error is only defined where the true value is non-zero.
    nonzero = labels != 0
    if nonzero.any():
        mape = 100 * np.mean(errors[nonzero] / np.abs(labels[nonzero]))
    else:
        mape = float('nan')
    accuracy = 100 - mape

    print('Model Performance')
    print('Average Error: {:0.4f} calories.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy

In [30]:
# Fit the forest on the training split, then score it on the test split with
# evaluate().
# NOTE(review): rf is created without a random_state, so fitting again here
# presumably yields a different forest than any earlier fit — confirm intended.
rf.fit(X_train, y_train)
base_accuracy = evaluate(rf, X_test, y_test)

2022/08/26 15:09:45 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '1d3c96252bd2453eb7244b19c4818572', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Model Performance
Average Error: 1.8478 degrees.
Accuracy = 97.31%.


In [31]:
# Show the hyper-parameters used by the random forest
print('Parametre utilisés:\n')
pprint(rf.get_params())

Parametre utilisés:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


# Random forest avec random search

In [13]:
# Search space for the randomized search (same candidate values as the lists
# defined earlier in the notebook).

# Tree counts: 10 evenly spaced values from 100 to 2000, truncated to int.
n_estimators = list(map(int, np.linspace(start=100, stop=2000, num=10)))
# Features considered at each split.
max_features = ['auto', 'sqrt']
# Depth limits: 11 evenly spaced values from 10 to 110, then None.
max_depth = [*map(int, np.linspace(10, 110, num=11)), None]
# Minimum samples needed to split an internal node.
min_samples_split = [2, 5, 10]
# Minimum samples needed at a leaf.
min_samples_leaf = [1, 2, 4]
# Sampling method used to train each tree.
bootstrap = [True]
# Split-quality criteria.
criterion = ['squared_error', 'absolute_error', 'poisson']

In [14]:
# Assemble the search space: one entry per tunable hyper-parameter, each
# mapped to its list of candidate values.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
    criterion=criterion,
)
pprint(random_grid)

{'bootstrap': [True],
 'criterion': ['squared_error', 'absolute_error', 'poisson'],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100, 311, 522, 733, 944, 1155, 1366, 1577, 1788, 2000]}


In [None]:
# Randomized search for the best hyper-parameters.
# Start from a fresh forest with default settings.
rf = RandomForestRegressor()
# 3-fold cross-validation over 5 randomly sampled combinations (n_iter=5),
# using every available core (n_jobs=-1).
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=5,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the search on the training split.
rf_random.fit(X_train,y_train)

In [None]:
# Display the hyper-parameter combination selected by the randomized search.
rf_random.best_params_

In [None]:
#model_lineaire.fit(X_train,y_train)
#model_lineaire.score(X_test,y_test)

In [12]:
# Baseline: a forest with default settings, fitted on the training split and
# scored on the test split with evaluate().
# NOTE(review): this reassigns base_accuracy, which an earlier cell also sets
# from `rf` — confirm which baseline the later comparison should use.
base_model = RandomForestRegressor()
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

Model Performance
Average Error: 1.8595 degrees.
Accuracy = 97.27%.


In [None]:
#best_random = rf_random.best_estimator_
#random_accuracy = evaluate(best_random, X_test, y_test)

In [None]:
#print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

# Random forest avec grid search

In [32]:
# Grid handed to the grid search. Every hyper-parameter lists a single value,
# so the grid describes exactly one candidate configuration.
param_grid = dict(
    n_estimators=[100],
    min_samples_split=[2],
    min_samples_leaf=[1],
    max_features=['auto'],
    max_depth=[100],
    bootstrap=[True],
)

In [33]:
# Fresh forest with default settings, used as the estimator to tune.
rf = RandomForestRegressor()
# Grid search over param_grid with 3-fold cross-validation, run on every
# available core (n_jobs=-1) with progress messages (verbose=2).
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [34]:
# Fit the grid search on the training split: each candidate in param_grid is
# cross-validated with the cv setting given when grid_search was built.
grid_search.fit(X_train, y_train)

2022/08/26 15:09:57 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '551d12c87945421aaf38ca07a49eecdb', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Fitting 3 folds for each of 1 candidates, totalling 3 fits


2022/08/26 15:10:19 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [100],
                         'max_features': ['auto'], 'min_samples_leaf': [1],
                         'min_samples_split': [2], 'n_estimators': [100]},
             verbose=2)

In [35]:
# Score the fitted grid search on the held-out test split.
# NOTE(review): no `scoring` was passed to GridSearchCV, so this is presumably
# the regressor's default score (R²) — confirm.
grid_search.score(X_test,y_test)

0.9977762370850723

In [36]:
# Display the hyper-parameter combination selected by the grid search.
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 100,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100}

In [37]:
# Take the best estimator found by the grid search and score it on the test
# split with the same evaluate() helper used for the baseline.
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test,y_test)

Model Performance
Average Error: 1.8441 degrees.
Accuracy = 97.31%.


In [21]:
# Relative change, in percent, of the grid-search accuracy over the baseline.
print(f'Improvement of {100 * (grid_accuracy - base_accuracy) / base_accuracy:0.2f}%.')

Improvement of -0.01%.


## Chargement du modèle

In [38]:
#with open('model_pkl', 'wb') as files:
#    pickle.dump(grid_search, files)