TODO :

1. longueur hors tout
2. scaling

In [1]:
# General
import pandas as pd
import numpy as np
from math import sqrt
from sqlalchemy import create_engine
import psycopg2

# Machine learning
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.dummy import DummyRegressor
from sklearn.metrics import precision_recall_fscore_support, f1_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import SGDRegressor, LinearRegression, Lasso, ElasticNet, Ridge
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn import svm, tree
from sklearn.pipeline import Pipeline, make_pipeline


from interpret.glassbox import ExplainableBoostingRegressor, LogisticRegression, ClassificationTree, DecisionListClassifier

# Connection to db
engine = create_engine('postgresql://postgres:password@localhost/cibnav')


In [2]:
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2

In [3]:
def load_history(engine, history = 3):
    params = ['id_nav_flotteur', 'annee', 'prescriptions', 'prescriptions_majeurs', 'at', 'genre_navigation', 'longueur_hors_tout', 'sitrep', 'sitrep_cible', ]
    return pd.read_sql("select * from dataset_{}".format(history), engine)

In [4]:
def create_target(df):
    df['cible'] = df['sitrep_cible']
    y = df['cible']
    
    col_genre_nav = [col for col in df if col.startswith('genre_nav')]
    
    df = df.fillna(df.mean())
   
    return df[['annee','prescriptions', 'prescriptions_majeurs', 'at','sitrep', 'effectif_minimum', 'longueur_hors_tout'] + col_genre_nav], y

In [7]:
#onehot = pd.get_dummies(df['genre_navigation'], prefix='genre_nav', prefix_sep='_')
#df = df.join(onehot)
#del df['genre_navigation']

df = load_history(engine, )
display(df.isna().sum())
display(df.describe())
df,y = create_target(df)
display(df.isna().sum())
display(df.describe())


id_nav_flotteur              0
annee                        0
prescriptions                0
prescriptions_majeurs        0
at                           0
genre_navigation          3583
type_essence                 0
at_cible                     0
effectif_minimum         41920
longueur_hors_tout       69771
sitrep                       0
sitrep_cible                 0
type_peche_principal         0
dtype: int64

Unnamed: 0,annee,prescriptions,prescriptions_majeurs,at,genre_navigation,at_cible,effectif_minimum,longueur_hors_tout,sitrep,sitrep_cible
count,71145.0,71145.0,71145.0,71145.0,67562.0,71145.0,29225.0,1374.0,71145.0,71145.0
mean,2016.056645,2.735484,0.463307,0.481243,28.910823,0.079331,1.972489,23.477686,0.240143,0.054607
std,1.999064,5.672171,1.24389,3.795966,6.679995,0.729553,3.390508,37.341954,0.918592,0.3014
min,2013.0,0.0,0.0,0.0,6.0,0.0,0.0,4.35,0.0,0.0
25%,2014.0,0.0,0.0,0.0,27.0,0.0,1.0,9.5,0.0,0.0
50%,2016.0,0.0,0.0,0.0,27.0,0.0,1.0,11.8,0.0,0.0
75%,2018.0,3.0,0.0,0.0,33.0,0.0,2.0,20.33,0.0,0.0
max,2019.0,76.0,18.0,187.0,41.0,36.0,178.0,333.0,30.0,11.0


annee                    0
prescriptions            0
prescriptions_majeurs    0
at                       0
sitrep                   0
effectif_minimum         0
longueur_hors_tout       0
genre_navigation         0
dtype: int64

Unnamed: 0,annee,prescriptions,prescriptions_majeurs,at,sitrep,effectif_minimum,longueur_hors_tout,genre_navigation
count,71145.0,71145.0,71145.0,71145.0,71145.0,71145.0,71145.0,71145.0
mean,2016.056645,2.735484,0.463307,0.481243,0.240143,1.972489,23.477686,28.910823
std,1.999064,5.672171,1.24389,3.795966,0.918592,2.173029,5.18756,6.509611
min,2013.0,0.0,0.0,0.0,0.0,0.0,4.35,6.0
25%,2014.0,0.0,0.0,0.0,0.0,1.0,23.477686,27.0
50%,2016.0,0.0,0.0,0.0,0.0,1.972489,23.477686,27.0
75%,2018.0,3.0,0.0,0.0,0.0,1.972489,23.477686,29.0
max,2019.0,76.0,18.0,187.0,30.0,178.0,333.0,41.0


In [8]:
models = [
    {'name': 'dummy', 'function': DummyRegressor(strategy='mean')},
    {'name': 'linear_regressor', 'function': LinearRegression()},
    {'name': 'decision_tree', 'function': tree.DecisionTreeRegressor()},
    {'name': 'lasso', 'function': Lasso()},
    {'name': 'elastic_net', 'function': ElasticNet()},
    {'name': 'ridge', 'function': Ridge()},
    #{'name': 'svr', 'function': svm.SVR()},  TRES LONG ET MAUVAIS RESULTAT
    {'name': 'sgd_regressor', 'function': SGDRegressor(loss='huber', penalty='elasticnet', alpha=0.001)},
    {'name': 'gbr', 'function': GradientBoostingRegressor()},
    {'name': 'ebm', 'function': ExplainableBoostingRegressor()}
]

## Grid search on different models

In [11]:
for i, model in enumerate(models):
    scores = cross_val_score(model['function'], df, y, cv=5, scoring='neg_mean_absolute_error')
    model['cross_val_score'] = scores.mean()
    
display( pd.DataFrame(models))

Unnamed: 0,name,function,cross_val_score
0,dummy,"DummyRegressor(constant=None, quantile=None, s...",-0.104587
1,linear_regressor,"LinearRegression(copy_X=True, fit_intercept=Tr...",-0.088592
2,decision_tree,"DecisionTreeRegressor(criterion='mse', max_dep...",-0.090932
3,lasso,"Lasso(alpha=1.0, copy_X=True, fit_intercept=Tr...",-0.104587
4,elastic_net,"ElasticNet(alpha=1.0, copy_X=True, fit_interce...",-0.104587
5,ridge,"Ridge(alpha=1.0, copy_X=True, fit_intercept=Tr...",-0.088592
6,sgd_regressor,"SGDRegressor(alpha=0.001, average=False, early...",-23.273655
7,gbr,"GradientBoostingRegressor(alpha=0.9, criterion...",-0.084226
8,ebm,ExplainableBoostingRegressor(binning_strategy=...,-0.09235


In [126]:
pipe = Pipeline(steps=[('scale_data', scaler), ('sgd', model)])
param_grid = {'loss':['huber', 'epsilon_insensitive', 'squared_loss'], 'penalty':['elasticnet'], 'alpha':10.0**-np.arange(1,7)}
search = GridSearchCV(pipe, param_grid, n_jobs=-1)

In [129]:
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

AttributeError: 'GridSearchCV' object has no attribute 'best_score_'

In [139]:
classifier_pipeline = make_pipeline(StandardScaler(), SGDRegressor(loss='huber', penalty='elasticnet', alpha=0.001))
scores = cross_val_score(classifier_pipeline, df, y, cv=10)
scores.mean()


0.06239053880238467

In [62]:
clf = GridSearchCV(SGDRegressor(), parameters_sgd)

In [63]:
grid_result = clf.fit(X_train, y_train)



In [64]:
grid_result.best_params_


{'alpha': 0.0001, 'loss': 'huber', 'penalty': 'elasticnet'}