In [34]:
# General
import pandas as pd
import numpy as np
from math import sqrt

# Machine learning
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score, confusion_matrix, recall_score, roc_curve, precision_score, make_scorer
from sklearn.linear_model import SGDClassifier, LinearRegression, Lasso, ElasticNet, Ridge
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn import svm, tree
from sklearn.model_selection import GridSearchCV

# Vizu

# Tracking
import mlflow
import mlflow.sklearn

In [27]:
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2

In [2]:
def create_target(df):
    df['cible'] = df['Sitrep Cible'].apply(lambda x: bool(x))
    df = df.dropna(subset=['cible'])
    df = df.fillna(0)  # CAREFUL
    y = df['cible']
    del df['cible']
    #scaler = StandardScaler()
    #scaled_df = scaler.fit_transform(df)
    #df = pd.DataFrame(scaled_df, columns=df.columns)
    
    col_genre_nav = [col for col in df if col.startswith('genre_nav')]
    return df[['Annee','Prescriptions', 'Prescriptions Majeurs', 'At', 'Sitrep'] + col_genre_nav], y

In [3]:
df = pd.read_csv('/home/anaconda/workspace/dataset_3.csv')
del df['Type Essence']

onehot = pd.get_dummies(df['Genre Navigation'], prefix='genre_nav', prefix_sep='_')
df = df.join(onehot)
del df['Genre Navigation']


df,y = create_target(df)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.33, random_state=42)

In [64]:
df

Unnamed: 0,Annee,Prescriptions,Prescriptions Majeurs,At,Sitrep,genre_nav_6.0,genre_nav_18.0,genre_nav_19.0,genre_nav_20.0,genre_nav_21.0,...,genre_nav_28.0,genre_nav_29.0,genre_nav_30.0,genre_nav_31.0,genre_nav_33.0,genre_nav_34.0,genre_nav_35.0,genre_nav_36.0,genre_nav_40.0,genre_nav_41.0
0,2018,17,1,0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2018,3,0,0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2018,2,0,0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2018,7,1,0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2018,5,2,0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55880,2012,0,0,0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
55881,2012,0,0,0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
55882,2012,0,0,0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
55883,2012,0,0,0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Dummy

In [19]:
with mlflow.start_run():
    dumb = DummyClassifier(strategy='most_frequent')
    dumb.fit(X_train, y_train)

    predicted_qualities = dumb.predict(X_test)

    #(rmse, mae, r2) = eval_metrics(y_test, predicted_qualities)
    display(confusion_matrix(y_test, predicted_qualities))
    display(f1_score(y_test, predicted_qualities))
    print("Dummy Regressor - Mean")
    #print("  RMSE: %s" % rmse)
    #print("  MAE: %s" % mae)
    #print("  R2: %s" % r2)

    #mlflow.log_metric("rmse", rmse)
    #mlflow.log_metric("r2", r2)
    #mlflow.log_metric("mae", mae)
    
    mlflow.set_tags({'model':'dummy', 'type': 'regression', 'strategy': 'mean'})
    mlflow.sklearn.log_model(dumb, "model")

array([[17441,     0],
       [ 1002,     0]])

  'precision', 'predicted', average, warn_for)


0.0

Dummy Regressor - Mean


## Linear Regressor

In [47]:
with mlflow.start_run():
    linear = LinearRegression()
    linear.fit(X_train, y_train)

    predicted_qualities = linear.predict(X_test)

    (rmse, mae, r2) = eval_metrics(y_test, predicted_qualities)

    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    
    mlflow.set_tags({'model':'linear', 'type': 'regression', })
    mlflow.sklearn.log_model(linear, "model")

  RMSE: 0.286841298066724
  MAE: 0.10660717022610681
  R2: 0.262366628810901


## Decision Tree

In [48]:
with mlflow.start_run():
    model = tree.DecisionTreeRegressor()
    model.fit(X_train, y_train)

    predicted_qualities = model.predict(X_test)
    (rmse, mae, r2) = eval_metrics(y_test, predicted_qualities)

    print("Decision Tree")
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    mlflow.set_tags({'model':'Decision Tree Regressor', 'type': 'regression',})

    mlflow.sklearn.log_model(sgd, "model")

Decision Tree
  RMSE: 0.3794298369646511
  MAE: 0.11617368599358012
  R2: -0.29068468376221035


## Lasso

In [49]:
with mlflow.start_run():
    model = Lasso()
    model.fit(X_train, y_train)

    predicted_qualities = model.predict(X_test)
    (rmse, mae, r2) = eval_metrics(y_test, predicted_qualities)

    print("Lasso")
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    mlflow.set_tags({'model':'Lasso', 'type': 'regression',})

    mlflow.sklearn.log_model(sgd, "model")

Lasso
  RMSE: 0.333982809729749
  MAE: 0.13303487662428853
  R2: -1.2493989399064986e-05


## Elastic Net

In [50]:
with mlflow.start_run():
    model = ElasticNet()
    model.fit(X_train, y_train)

    predicted_qualities = model.predict(X_test)
    (rmse, mae, r2) = eval_metrics(y_test, predicted_qualities)

    print("Elastic Net")
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    mlflow.set_tags({'model':'ElasticNet', 'type': 'regression',})

    mlflow.sklearn.log_model(sgd, "model")

Elastic Net
  RMSE: 0.3233912342775673
  MAE: 0.12994301068863295
  R2: 0.06240844747403396


## Ridge


In [51]:
with mlflow.start_run():
    model = Ridge()
    model.fit(X_train, y_train)

    predicted_qualities = model.predict(X_test)
    display(predicted_qualities)
    (rmse, mae, r2) = eval_metrics(y_test, predicted_qualities)

    print("Ridge")
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    mlflow.set_tags({'model':'Ridge', 'type': 'regression',})

    mlflow.sklearn.log_model(sgd, "model")

array([ 0.021883  ,  0.02706608,  0.02706608, ..., -0.00208043,
       -0.00208043,  0.00569418])

Ridge
  RMSE: 0.2868398709857516
  MAE: 0.10660906256145126
  R2: 0.26237396847877403


## SVR

In [None]:
with mlflow.start_run():
    model = svm.SVR()
    model.fit(X_train, y_train)

    predicted_qualities = model.predict(X_test)
    display(predicted_qualities)
    (rmse, mae, r2) = eval_metrics(y_test, predicted_qualities)

    print("SVR")
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    if False:
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)
        mlflow.set_tags({'model':'SVR', 'type': 'regression',})

        mlflow.sklearn.log_model(sgd, "model")



## SGD Regressor

In [72]:
with mlflow.start_run():
    sgd = SGDClassifier(loss='huber', penalty='elasticnet', alpha=0.0001)
    sgd.fit(X_train, y_train)

    predicted_qualities = sgd.predict(X_test)

    display(confusion_matrix(y_test, predicted_qualities))
    display(f1_score(y_test, predicted_qualities))


array([[ 5634, 11807],
       [   75,   927]])

0.13497379149679675

In [26]:
params = {'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'], 
         'penalty': ['elasticnet'],
         'alpha': [0.0001, 0.001, 0.1]}

In [38]:
sgd = SGDClassifier()
clf = GridSearchCV(sgd, params, cv=5, scoring=make_scorer(recall_score))
clf.fit(df, y)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=SGDClassifier(alpha=0.0001, average=False,
                                     class_weight=None, early_stopping=False,
                                     epsilon=0.1, eta0=0.0, fit_intercept=True,
                                     l1_ratio=0.15, learning_rate='optimal',
                                     loss='hinge', max_iter=1000,
                                     n_iter_no_change=5, n_jobs=None,
                                     penalty='l2', power_t=0.5,
                                     random_state=None, shuffle=True, tol=0.001,
                                     validat...
                                     warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [0.0001, 0.001, 0.1],
                         'loss': ['hinge', 'log', 'modified_huber',
                                  'squared_hinge', 'perceptron', 'squared_loss',
                  

In [39]:
clf.cv_results_

{'mean_fit_time': array([1.09899206, 1.12486942, 1.64839323, 1.25442522, 0.74540765,
        0.6748631 , 0.24980791, 0.85038438, 0.73880119, 0.40261447,
        1.70019927, 0.42824354, 1.30449862, 0.38897421, 0.94801848,
        0.48980448, 0.26120801, 0.83558865, 0.23909888, 1.44521108,
        0.12855959, 4.48636351, 0.24113464, 0.77705395, 0.29009993,
        0.20320332, 0.99975119]),
 'std_fit_time': array([0.10382769, 0.14664613, 0.26901732, 0.34448274, 0.08966697,
        0.16262888, 0.02744758, 0.16806333, 0.27399535, 0.13621827,
        0.83017051, 0.11936832, 0.20689996, 0.14296039, 0.34644969,
        0.14728378, 0.04769905, 0.24231619, 0.11455216, 0.25364499,
        0.01513481, 0.69241716, 0.06915215, 0.28879896, 0.06793728,
        0.06564658, 0.3419644 ]),
 'mean_score_time': array([0.00297968, 0.00271378, 0.00296574, 0.00283122, 0.0029166 ,
        0.00286984, 0.00280097, 0.0027457 , 0.0028023 , 0.00278468,
        0.00293007, 0.00301592, 0.00282211, 0.00292969, 0.002833

In [40]:
display(clf.best_params_)
clf.best_score_

{'alpha': 0.1, 'loss': 'squared_loss', 'penalty': 'elasticnet'}

0.8000178938892368

SGD très intable . C1,1 de 0 à 992

In [73]:
display(recall_score(y_test, predicted_qualities))



0.9251497005988024

In [None]:
    print("SGDR")
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    mlflow.log_param("loss", 'huber')
    mlflow.log_param("penalty", 'elasticnet')
    mlflow.log_param("alpha", 0.001)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    mlflow.set_tags({'model':'SGD', 'type': 'regression',})

    mlflow.sklearn.log_model(sgd, "model")


Les deux fonctions de loss : 'squared epsilon insensitive' et 'squared_loss' divergent quelque soit la valeur d'alpha.

Les deux fonctions de loss ' 'epsilon_insensitive' et 'huber' convergent.

## GBR

In [20]:
with mlflow.start_run():
    gbr = GradientBoostingClassifier()
    gbr.fit(X_train, y_train)

    predicted_qualities = gbr.predict(X_test)

    display(confusion_matrix(y_test, predicted_qualities))
    display(f1_score(y_test, predicted_qualities))


array([[17393,    48],
       [  927,    75]])

0.1333333333333333