In [5]:
# General
import pandas as pd
import numpy as np
from math import sqrt
from sqlalchemy import create_engine

# Machine learning
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.dummy import DummyRegressor
from sklearn.metrics import precision_recall_fscore_support, f1_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import SGDRegressor, LinearRegression, Lasso, ElasticNet, Ridge
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn import svm, tree
from sklearn.pipeline import Pipeline, make_pipeline


from interpret.glassbox import ExplainableBoostingRegressor, LogisticRegression, ClassificationTree, DecisionListClassifier

# Connection to db
# The URL can be supplied through the CIBNAV_DATABASE_URL environment variable
# so that real credentials do not have to be written into the notebook; the
# literal below is only the fallback used when the variable is not set.
import os

engine = create_engine(
    os.environ.get('CIBNAV_DATABASE_URL',
                   'postgresql://postgres:password@localhost/cibnav'))

ModuleNotFoundError: No module named 'psycopg2'

## Creating the dataset

In [None]:
def load_history(engine, history=3):
    """Load the table ``dataset_<history>`` into a DataFrame.

    Parameters
    ----------
    engine : SQLAlchemy connectable or DB-API connection
        Handed unchanged to ``pandas.read_sql``.
    history : int or str, default 3
        Suffix of the table to read (``dataset_3`` by default).

    Returns
    -------
    pandas.DataFrame
        All columns of the selected table (``select *``).

    Raises
    ------
    ValueError
        If ``history`` contains anything other than letters, digits or ``_``.
    """
    suffix = str(history)
    # A table name cannot be sent as a bound SQL parameter, so the suffix is
    # restricted to identifier characters before it is interpolated.
    if not suffix or not all(ch.isalnum() or ch == '_' for ch in suffix):
        raise ValueError("invalid dataset suffix: {!r}".format(history))
    return pd.read_sql("select * from dataset_{}".format(suffix), engine)

In [None]:
def create_target(df):
    """Split the raw history table into a feature matrix and a target series.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``sitrep_cible`` and the fixed feature columns listed
        below; every column whose name starts with ``genre_nav`` is added to
        the features as well.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The feature frame, with missing values replaced by the column mean,
        and the target named ``cible``. The target is returned as stored:
        it is not imputed.

    The input frame is left untouched.
    """
    # Name the target 'cible' without writing a new column into the caller's frame.
    y = df['sitrep_cible'].rename('cible')

    col_genre_nav = [col for col in df if col.startswith('genre_nav')]
    feature_cols = ['annee', 'prescriptions', 'prescriptions_majeurs', 'at', 'sitrep',
                    'effectif_minimum', 'longueur_hors_tout'] + col_genre_nav

    features = df[feature_cols]
    # numeric_only=True keeps mean() from raising on non-numeric columns
    # (pandas >= 2.0 no longer drops them silently).
    features = features.fillna(features.mean(numeric_only=True))

    return features, y

In [None]:
# Read the default history table and show its missing-value counts and summary statistics.
df = load_history(engine)
display(df.isna().sum(), df.describe())

# From here on `df` is the feature matrix returned by create_target and `y` the target.
df, y = create_target(df)
display(df.isna().sum(), df.describe())

## Choosing an ML model

In [None]:
# Candidate regressors to compare. Each entry is a dict holding a short label
# under 'name' and an unfitted estimator under 'function'.
models = [
    {'name': label, 'function': estimator}
    for label, estimator in (
        ('dummy', DummyRegressor(strategy='mean')),
        ('linear_regressor', LinearRegression()),
        ('decision_tree', tree.DecisionTreeRegressor()),
        ('decision_tree_optimum', tree.DecisionTreeRegressor(
            criterion='friedman_mse', max_depth=10, splitter='best')),
        ('lasso', Lasso()),
        ('elastic_net', ElasticNet()),
        ('ridge', Ridge()),
        ('sgd_regressor', SGDRegressor(
            loss='huber', penalty='elasticnet', alpha=0.1)),
        ('gbr', GradientBoostingRegressor()),
        ('ebm', ExplainableBoostingRegressor()),
    )
]

In [None]:
# Score every candidate with 5-fold cross-validation and store the mean on its
# entry. The scorer is the negated mean absolute error, so values closer to
# zero are better.
for model in models:
    scores = cross_val_score(
        model['function'], df, y, cv=5, scoring='neg_mean_absolute_error', n_jobs=7)
    model['cross_val_score'] = scores.mean()

In [None]:
# Fit a tree with default settings on the full data to see how deep it grows,
# and show it next to the 'decision_tree_optimum' entry (models[3]).
model = tree.DecisionTreeRegressor()
m = model.fit(df, y)
display(m.get_depth(), models[3])

We choose the SGD regressor.

## Choose the parameters of the model

In [3]:
# Hyper-parameter grid for the SGD regressor.
# NOTE(review): `param_grid` is rebound to the decision-tree grid in a later
# cell before any search is run, so this grid is currently unused.
# NOTE(review): newer scikit-learn releases appear to spell the "squared_loss"
# loss as "squared_error" - confirm against the installed version.
param_grid = dict(
    alpha=10.0**-np.arange(1, 7),  # 1e-1 down to 1e-6
    loss=["epsilon_insensitive", "huber", "squared_loss"],
    penalty=['elasticnet'],
)

In [4]:
# Standardise the features, then fit an SGD regressor.
# max_iter is ceil(10**6 / number of rows in df); np.ceil returns a float, so
# it is cast to the integer that max_iter expects.
classifier_pipeline = make_pipeline(
    StandardScaler(),
    SGDRegressor(max_iter=int(np.ceil(10**6 / len(df)))),
)

NameError: name 'make_pipeline' is not defined

In [None]:
# Search space for the decision-tree step; the 'decsT__' prefix routes each
# parameter to the pipeline step named 'decsT'.
# NOTE(review): newer scikit-learn releases appear to spell the "mse"
# criterion as "squared_error" - confirm against the installed version.
param_grid = dict(
    decsT__criterion=["mse", "friedman_mse"],
    decsT__max_depth=[5, 10, None],
    decsT__splitter=["best"],
)

In [None]:
# Two-step pipeline: standardise the features, then fit a decision tree.
# The step is named 'decsT' because the grid keys use that prefix.
pipe = Pipeline(steps=[
    ('scaling', StandardScaler()),
    ('decsT', tree.DecisionTreeRegressor()),
])

In [None]:
# 5-fold grid search over `param_grid`. No `scoring` is passed, so candidates
# are ranked with the estimator's default scorer.
clf = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    n_jobs=7,
    return_train_score=True,
)
# fit() returns the fitted search object, so `grid_result` and `clf` are the same instance.
grid_result = clf.fit(df, y)


Voici notre modèle avec les paramètres optimisés.

In [None]:
# Report the best cross-validated score, then the parameters that achieved it.
print(
    "Best parameter (CV score=%0.3f):" % grid_result.best_score_,
    grid_result.best_params_,
    sep="\n",
)

In [None]:
# predict() needs the feature matrix; use the same frame the search was fitted on.
y_pred = clf.predict(df)

## Distribution des prévisions

In [None]:
# NOTE(review): the search was fitted on all of `df`, so X_train is data the
# model has already seen; these predictions describe the fitted distribution
# rather than held-out performance.
X_train, X_test, y_train, y_test = train_test_split(df, y)

# Predict once and reuse the result for both the array and the DataFrame.
list_pred = grid_result.predict(X_train)
# Copy so that later in-place edits of `list_pred` cannot change `y_pred`.
y_pred = pd.DataFrame(list_pred.copy(), columns=['pred'])

In [None]:
# Cap the raw prediction array at 1.5, in place.
np.putmask(list_pred, list_pred > 1.5, 1.5)

import plotly.express as px

# Box plot of the predictions strictly below 1.5; all other values are masked to NaN.
fig = px.box(y_pred.where(y_pred < 1.5), y='pred')
fig.show()


In [None]:
# Standardise the predictions; `scaler` stays fitted on `y_pred`.
scaler = StandardScaler()
scaled = scaler.fit_transform(y_pred)
# Flatten the 2-D result into a flat list of values.
scaled = list(scaled.ravel())


In [None]:
# `y_pred` is the single-column DataFrame built earlier; its column is 'pred',
# not 0, so select it by name before converting to a plain list.
y_pred = y_pred['pred'].tolist()