In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import OneClassSVM
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

%matplotlib inline
pd.options.mode.chained_assignment = None

In [53]:
def train_test(df):
    ''' on utilise year == 2022 pour constituer le dataset de test'''
    
    train = df.iloc[:106952]
    test =  df.iloc[106953:]
    
    return train, test

In [55]:
def rf_modelisation(x_train, y_train):
    ''' determination des hyperparametre de RF'''
    weights = np.linspace(0.1,0.9,100)
    params = [{
        "n_estimators": [10, 100,150],
        "max_features": [2, 4, 8,10,12],
        "class_weight":[{0:x, 1:1.0-x} for x in weights]
        }]

    rfCV = GridSearchCV(
        RandomForestClassifier(),
        params,
        scoring="f1_micro",
        cv=5,
        n_jobs=-1,
        return_train_score=True)
    
    rfCV = rfCV.fit(x_train, y_train)
    
    return rfCV.best_estimator_

#
def logist_modelisation(x_train, y_train):
    # grille de valeurs
    weights = np.linspace(0.1,0.9,100)

    params = [{"C": [0.01, 0.2, 0.5, 1, 5, 10, 20],
           "penalty": ["l1", "l2","none"],
           "class_weight":[{0:x, 1:1.0-x} for x in weights]
          }]

    logitCV = GridSearchCV(
        LogisticRegression(solver='liblinear'),
        params,
        scoring="f1_micro",
        cv=5,
        n_jobs=-1,
        return_train_score=True)
    
    logitCV = logitCV.fit(x_train, y_train)
    
    return logitCV.best_estimator_

#
def OneSVM_modelisation(x_train, y_train):
    # grille de valeurs

    params = [{"nu": [0.1, 0.2, 0.3, 0.5, 0.6, 0.7, 0.9], 
              'max_iter': [100,250, 500, 700, 900],
           #"class_weight":[{0:x, 1:1.0-x} for x in weights]
          }]

    outlierCV = GridSearchCV(
        OneClassSVM(),
        params,
        scoring="f1_micro",
        cv=5,
        n_jobs=-1,
        return_train_score=True)
    
    outlierCV = outlierCV.fit(x_train, y_train)
    
    
    return outlierCV.best_estimator_

def DT_modelisation(x_train, y_train):
    # grille de valeurs
    weights = np.linspace(0.1,0.9,100)

    params = [{
        "max_depth": [3, 5, 10, 15,None],
        "min_samples_split": [2, 5, 10,15,20,30],
        "min_samples_leaf": [1, 2, 5,10,15,20,30],
        "class_weight":[{0:x, 1:1.0-x} for x in weights]
        }]

    dtCV = GridSearchCV(
        DecisionTreeClassifier(),
        params,
        scoring="f1_micro",
        cv=5,
        n_jobs=-1,
        return_train_score=True)
    
    dtCV = dtCV.fit(x_train, y_train)
    
    return dtCV.best_estimator_
  

In [56]:
# Chargement des données
data = pd.read_csv("data/all_features.csv", sep=';',low_memory=False)
label = pd.read_csv("data/descriptif_hiver_ete.csv", sep=';',low_memory=False)
print(len(data))


# definition de train et test 
# definition de x et y 
x_train, x_test = train_test(data.iloc[:,1:])
train_target,test_target = train_test(label)
y_test = test_target['baignade']
y_train = train_target['baignade']


111482


In [57]:
## Logistic Regression
model_logist = logist_modelisation(x_train, y_train)
model_logist

In [45]:
## decision tree
model_dt = DT_modelisation(x_train, y_train)
model_dt


In [30]:


## Random forest
model_rf = rf_modelisation(x_train, y_train)
model_rf



In [None]:
import joblib
joblib.dump(model_dt, "saved_models/decision_tree.joblib")
joblib.dump(model_rf, "saved_models/random_forest.joblib")
joblib.dump(model_logist, "saved_models/reg_logist.joblib")

# Evaluation

In [33]:

## RF
model_rf.fit(x_train, y_train)

# prediction
y_train_predict = model_rf.predict(x_train)
y_test_predict = model_rf.predict(x_test)

# Evaluation
print("-----------------------Training data-----------------------")
print(classification_report(y_train, y_train_predict))
print("-------------------------Test data-------------------------")
print(classification_report(y_test, y_test_predict))

-----------------------Training data-----------------------
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     29638
           1       0.43      0.98      0.60       393

    accuracy                           0.98     30031
   macro avg       0.72      0.98      0.80     30031
weighted avg       0.99      0.98      0.99     30031

-------------------------Test data-------------------------
              precision    recall  f1-score   support

           0       0.88      1.00      0.93      3976
           1       0.00      0.00      0.00       553

    accuracy                           0.88      4529
   macro avg       0.44      0.50      0.47      4529
weighted avg       0.77      0.88      0.82      4529



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
model_logist.fit(x_train, y_train)

# prediction
y_train_predict = model_logist.predict(x_train)
y_test_predict = model_logist.predict(x_test)

# Evaluation
print("-----------------------Training data-----------------------")
print(classification_report(y_train, y_train_predict))
print("-------------------------Test data-------------------------")
print(classification_report(y_test, y_test_predict))

-----------------------Training data-----------------------
              precision    recall  f1-score   support

           0       0.99      0.98      0.99     29638
           1       0.08      0.11      0.10       393

    accuracy                           0.97     30031
   macro avg       0.54      0.55      0.54     30031
weighted avg       0.98      0.97      0.97     30031

-------------------------Test data-------------------------
              precision    recall  f1-score   support

           0       0.90      0.95      0.92      3976
           1       0.35      0.21      0.26       553

    accuracy                           0.86      4529
   macro avg       0.62      0.58      0.59      4529
weighted avg       0.83      0.86      0.84      4529



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [46]:
model_dt.fit(x_train, y_train)

# prediction
y_train_predict = model_dt.predict(x_train)
y_test_predict = model_dt.predict(x_test)

# Evaluation
print("-----------------------Training data-----------------------")
print(classification_report(y_train, y_train_predict))
print("-------------------------Test data-------------------------")
print(classification_report(y_test, y_test_predict))

-----------------------Training data-----------------------
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     29638
           1       0.53      0.02      0.04       393

    accuracy                           0.99     30031
   macro avg       0.76      0.51      0.52     30031
weighted avg       0.98      0.99      0.98     30031

-------------------------Test data-------------------------
              precision    recall  f1-score   support

           0       0.88      1.00      0.93      3976
           1       0.00      0.00      0.00       553

    accuracy                           0.87      4529
   macro avg       0.44      0.50      0.47      4529
weighted avg       0.77      0.87      0.82      4529



['saved_models/decision_tree.joblib']