# Data Scientist - P7 - Laurent Trichet

## Implémentez un modèle de scoring

## 4 Generate prediction pipeline and deployment model with MLflow

### Import required libraries

In [1]:
# Import default libraries
import pandas as pd
import numpy as np

# Import Garbage Collector (empty dataFrame memory)
import gc

# Import Imbalanced-learn necessary tools
import imblearn
from collections import Counter

# import for classification GradientBoostingClassifier & SVC
from sklearn import ensemble

# Import evaluation tool for classification optimisations
from sklearn.model_selection import GridSearchCV

# Import mlflow, serialization and model server
import mlflow.sklearn
import mlflow.pyfunc
from mlflow.models.signature import infer_signature

# tools for execution time estimates
from datetime import datetime

# Remove some warnings
import warnings
warnings.filterwarnings('ignore')

# Constants
DIRDATASET = './credithome_datasets/'
NUMROWS = 15000       # 1000000 = total dataset
# File names with NUMROWS lines, Fill nan with zeros and important features only
FILESTD_FNAN0_REDUCED = DIRDATASET+'Credit_Home_Junction_Std_Fnan0_Reduced_'+str(NUMROWS)+'.csv'


### 4.1 Load training and test sets, apply correction of imbalanced classes

#### Load data

In [2]:
df = pd.read_csv(FILESTD_FNAN0_REDUCED, encoding='Latin-1', sep='\t')

# Retrieve train and test datasets
df_train = df[df['TARGET']!=999]
df_test = df[df['TARGET']==999]
# Keep valid columns for features and result class in future classifications
c_features = [c for c in df.columns if c not in ['index', 'TARGET', 'SK_ID_CURR']]
c_class = 'TARGET'

del df
gc.collect()


140

#### Under sampling for imbalanced data

In [3]:
counter1 = Counter(df_train[c_class])
print(counter1)

Counter({1.0: 13826, 0.0: 1174})


In [4]:
undersample = imblearn.under_sampling.RandomUnderSampler(random_state=0)
df_X, df_y = undersample.fit_resample(df_train[c_features], df_train[c_class])
X = df_X.values
y = df_y.values

del df_X, df_y
gc.collect()

counter2 = Counter(y)
print(counter2)

Counter({0.0: 1174, 1.0: 1174})


### 4.2 Search for best hyperparameters

#### Pre-training, search best params

In [None]:
models=[]
iname, itype, iparam = 0, 1, 2
models.append(['GradBoostC', ensemble.GradientBoostingClassifier(),
               {
                'n_estimators': [200],
                'max_depth': [3],
                'criterion': ['friedman_mse'],
                'min_samples_split': [2, 3],
                'min_weight_fraction_leaf': [0.0, 0.2, 0.4],
                 }
               ])
for i, model in enumerate(models):
    mdl = GridSearchCV(model[itype], model[iparam], cv=5, scoring='roc_auc')
    datedeb = datetime.now()
    mdl.fit(X, y)
    duree = datetime.now() - datedeb
    print(f'{model[iname]} \tduree: {duree.seconds}s')

In [None]:
print(f'\tbest_score: {mdl.best_score_:4.3} \tbest_params: {mdl.best_params_}')

#### Result for 15000 training dataset : best_score: 0.733 	best_params: {'criterion': 'friedman_mse', 'max_depth': 3, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200}
#### Result for full dataset : best_score: 0.772 	best_params: {'criterion': 'friedman_mse', 'max_depth': 3, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200}

In [None]:
test_prob = mdl.predict_proba(df_test[c_features])
dtf = pd.DataFrame(test_prob)
dtf.describe()

### 4.3 Creation of a prediction pipeline

In [5]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([('classifier',
                   ensemble.GradientBoostingClassifier(
                                n_estimators=200,
                                max_depth=3,
                                criterion='friedman_mse',
                                min_samples_split = 2,
                                min_weight_fraction_leaf = 0.0,
                            )
                 )
                ]
               )

In [6]:
pipe.fit(X, y)

Pipeline(steps=[('classifier', GradientBoostingClassifier(n_estimators=200))])

In [7]:
del dtf
gc.collect()

NameError: name 'dtf' is not defined

### 4.4 Deployment API of sklearn model with MLflow

In [8]:
signature = infer_signature(X, y)
signature


inputs: 
  [Tensor('float64', (-1, 213))]
outputs: 
  [Tensor('float64', (-1,))]

##### Check that folder `./credithome_model` is deleted before lauching save_model

In [9]:
class ModelOut (mlflow.pyfunc.PythonModel):
     def __init__(self, model):
          self.model = model
     def predict (self, context, model_input):
          # here we can update the input
          # e.g model_input.columns= map(str.lower,model_input.columns)
          return self.model.predict_proba(model_input)[:,1]

mlflow.pyfunc.save_model(python_model=ModelOut(model=pipe,),
                         path='credithome_model',
                         signature=signature,
                        )

### 4.5 shell operations to launch MLflow server on the mlflow_model

AWS SERVER (ssh) :  
`cd environments/env_p7/projects/exercices`  
`mlflow models serve -m mlflow_model -h 0.0.0.0 --env-manager local`
> parameters :  
> `-h 0.0.0.0` to authorize http requests from anywhere on the internet (outside server)  
> `--env-manager local` to use the default python environment and not a default conda environment

LAPTOP (Anaconda powershell, environment Base) :  
`cd '.\OneDrive\OpenClassrooms\Parcours Data Scientist\P7 Implementez Modele Scoring\Code\HomeCredit\API\'`  
`mlflow models serve -m credithome_model -h 0.0.0.0 --env-manager local`


### 4.6 Test of the API (check if IP or server in model url needs to be changed)

In [None]:
import requests

headers = {"Content-Type": "application/json"}

# Test API AWS
# model_url = 'http://ec2-18-118-129-10.us-east-2.compute.amazonaws.com:5000/invocations'

# Test API LAPTOP
model_url = 'http://localhost:5000/invocations'

In [None]:
# data = df_test[c_features].iloc[np.random.randint(1,df_test[c_features].shape[0]),:].values
data = df_test[c_features].iloc[0,:].to_list()

In [None]:
max_val = 4
tab_rep = np.zeros(max_val)
for i in np.arange(0, max_val):
    data = df_test[c_features].iloc[30+i,:].to_list()
    data_json = {'data': [data]}
    response = requests.request(method='POST',
                                headers=headers,
                                url=model_url,
                                json=data_json
                            )
    if response.status_code != 200:
        print(f'HTTP error: {response.status_code}')
    else:
        tab_rep[i] = response.json()[0]
        print(f'{i:3} / {max_val-1:3} {response.json()}        ', end='\r')
print('\n')

In [None]:
dtf = pd.DataFrame(tab_rep)
dtf.describe()