# Library 


In [31]:
#library
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import random as rnd

from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.plotting import scatter_matrix

# Importing Models
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Importing other tools
from sklearn import model_selection
from sklearn.metrics import confusion_matrix, classification_report, make_scorer
from sklearn.metrics import accuracy_score, recall_score, precision_recall_curve
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
# from tpot import TPOTClassifier
from sklearn.model_selection import KFold, ShuffleSplit, StratifiedKFold
import warnings
from sklearn.preprocessing import MinMaxScaler
# import plotly.express as px
import itertools
from mlflow import sklearn
import mlflow.sklearn
import random

# Pre-Process data

In [32]:
# loading local functions
from src.features.build_features import prep_process

In [33]:
# Load the raw dataset. The Windows path is written as a raw string so the
# backslashes are never interpreted as escape sequences.
data = pd.read_csv(r"C:\mlops_plugin\src\data\data.csv")

In [34]:
# Define the MLflow tracking server. Ideally host it on a remote server (AWS, Azure or GCP - a free tier is enough for experiments).
# A local server setup will not store artifacts (models & data files) - you need a remote server or a database for that.
# Reference: https://www.mlflow.org/docs/latest/quickstart.html
import mlflow
mlflow.set_tracking_uri("http://kubernetes.docker.internal:5000/") 

In [35]:
# Make "Pre_Process" the active MLflow experiment for the runs that follow.
exp_name = "Pre_Process"
mlflow.set_experiment(exp_name)

In [36]:
# prep_process returns four values, bound here as the train/test features and labels.
# NOTE(review): prep_process is imported from src.features.build_features and is not
# shown in this notebook; presumably it opens an MLflow run, which would explain the
# end_run() call below - confirm.
x_train,x_test,y_train,y_test = prep_process(data)
# Close whichever MLflow run is currently active.
mlflow.end_run()

In [10]:
# Preview the first rows of the training features.
x_train.head()

Unnamed: 0,Pclass,Sex,Embarked,Title,IsAlone,FareBand,AgeBand
759,1,1,0,5,1,3,2
378,3,0,1,1,1,0,0
696,3,0,0,1,1,1,3
222,3,0,0,1,1,1,3
107,3,0,0,1,1,0,1


In [11]:
# Switch the active MLflow experiment to "Modeling" so the model runs below are tracked there.
exp_name = "Modeling"
mlflow.set_experiment(exp_name)

In [12]:
# Shared settings handed to BaseModellingHelper.
# The helper reads 'Split_type', 'folds', 'test_size', 'train_size' and 'seed' to
# build its cross-validation splitter. The remaining keys are not read by the code
# visible in this notebook.
basic_params = {
    'seed': 123,
    'folds': 10,
    'test_size': 0.3,
    'train_size': 0.7,
    'n_jobs': -1,
    'verbose': 3,
    'scoring': 'f1',
    'Split_type': 'ShuffleSplit',
    # Warning filter action; applied explicitly right after the dict instead of
    # being triggered as a side effect inside the literal (which stored None).
    'warning': 'ignore',
    'normalize': True,
}
warnings.filterwarnings(basic_params['warning'])

# Every estimator that accepts `random_state` is seeded from the shared seed so
# that repeated runs of the notebook produce the same scores.
_seed = basic_params['seed']

# Candidate baseline classifiers, keyed by a display name.
base_model = {
    'AdaBoostClassifier': AdaBoostClassifier(random_state=_seed),
    'BaggingClassifier': BaggingClassifier(random_state=_seed),
    'ExtraTreesClassifier': ExtraTreesClassifier(random_state=_seed),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=_seed),
    'RandomForestClassifier': RandomForestClassifier(random_state=_seed),
    'GaussianProcessClassifier': gaussian_process.GaussianProcessClassifier(random_state=_seed),
    'LogisticRegressionCV': linear_model.LogisticRegressionCV(random_state=_seed),
    'PassiveAggressiveClassifier': linear_model.PassiveAggressiveClassifier(random_state=_seed),
    'RidgeClassifierCV': linear_model.RidgeClassifierCV(),
    'SGDClassifier': linear_model.SGDClassifier(random_state=_seed),
    'Perceptron': linear_model.Perceptron(random_state=_seed),
    'BernoulliNB': naive_bayes.BernoulliNB(),
    'GaussianNB': naive_bayes.GaussianNB(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'SVC': svm.SVC(probability=True, random_state=_seed),
    'NuSVC': svm.NuSVC(probability=True, random_state=_seed),
    'LinearSVC': svm.LinearSVC(random_state=_seed),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=_seed),
    'LinearDiscriminantAnalysis': discriminant_analysis.LinearDiscriminantAnalysis(),
    'QuadraticDiscriminantAnalysis': discriminant_analysis.QuadraticDiscriminantAnalysis(),
    'XGBClassifier': XGBClassifier(random_state=_seed),
    'LGBMClassifier': LGBMClassifier(random_state=_seed),
}


In [14]:
class BaseModellingHelper:
    """Cross-validate a set of baseline classifiers and collect their scores.

    For every estimator in ``base_model`` the helper runs
    ``model_selection.cross_validate`` with a fixed panel of metrics, stores the
    fold-averaged train/test scores in ``base_model_output``, logs the test
    scores to MLflow and saves a model fitted on the full data. For the
    estimators named in ``FeatureImportanceAlgo`` it also records
    ``feature_importances_`` so the features can be ranked.
    """

    # Split types understood by ``_build_splitter``.
    _RANDOM_SPLITS = ('ShuffleSplit', 'StratifiedShuffleSplit')
    _KFOLD_SPLITS = ('KFold', 'StratifiedKFold')

    def __init__(self, std_param, base_model):
        """Store the settings and prepare the splitter, scorers and result holders.

        Parameters
        ----------
        std_param : dict
            Must contain 'Split_type', 'folds' and 'seed'; the shuffle-split
            types additionally need 'test_size' and 'train_size'. An optional
            'model_dir' overrides the directory fitted models are saved to.
        base_model : dict
            Maps a display name to an unfitted estimator.

        Raises
        ------
        ValueError
            If ``std_param['Split_type']`` is not a supported splitter name.
        """
        self.std_param = std_param
        # Built first so an unsupported Split_type fails here with a clear
        # message instead of as an AttributeError inside ModelLoop.
        self.cross_val = self._build_splitter(std_param)

        self.base_model = base_model
        self.base_model_output = {}
        self.feature_importance_df_sorted = pd.DataFrame()
        self.important_col = []

        # NOTE(review): make_scorer is called without a probability or
        # decision-function option, so average_precision, log_loss and roc_auc
        # are scored from predicted labels - confirm that this is intended.
        self.scoring = {
            'accuracy': make_scorer(metrics.accuracy_score),
            'precision': make_scorer(metrics.precision_score),
            'recall': make_scorer(metrics.recall_score),
            'f1_score': make_scorer(metrics.f1_score),
            'average_precision': make_scorer(metrics.average_precision_score),
            'balanced_accuracy': make_scorer(metrics.balanced_accuracy_score),
            'hamming_loss': make_scorer(metrics.hamming_loss),
            'jaccard_score': make_scorer(metrics.jaccard_score),
            'log_loss': make_scorer(metrics.log_loss),
            'roc_auc_score': make_scorer(metrics.roc_auc_score),
            'zero_one_loss': make_scorer(metrics.zero_one_loss, normalize=False),
        }

        self.scores_list = []
        self.feature_importance = {}
        self.FeatureImportanceAlgo = ['DecisionTreeClassifier', 'RandomForestClassifier', 'ExtraTreesClassifier', 'GradientBoostingClassifier', 'AdaBoostClassifier']

    @classmethod
    def _build_splitter(cls, std_param):
        """Return the cross-validation splitter named by ``std_param['Split_type']``."""
        split_type = std_param['Split_type']
        if split_type not in cls._RANDOM_SPLITS + cls._KFOLD_SPLITS:
            raise ValueError(
                "Unsupported Split_type %r; expected one of %s"
                % (split_type, ', '.join(cls._RANDOM_SPLITS + cls._KFOLD_SPLITS))
            )

        splitter_cls = getattr(model_selection, split_type)
        if split_type in cls._RANDOM_SPLITS:
            return splitter_cls(
                n_splits=std_param['folds'],
                test_size=std_param['test_size'],
                train_size=std_param['train_size'],
                random_state=std_param['seed'],
            )
        # K-fold variants: shuffle so that the seed actually takes effect.
        return splitter_cls(
            n_splits=std_param['folds'],
            shuffle=True,
            random_state=std_param['seed'],
        )

    def getScoreDictionary(self, base_model_output, modelname, basemodel_scores, score_type):
        """Store the fold-averaged scores of one model in ``self.base_model_output``.

        ``base_model_output`` is accepted for backward compatibility but not
        used; results always go to ``self.base_model_output[modelname]``.
        ``basemodel_scores`` is the mapping returned by ``cross_validate`` and
        ``score_type`` is the key prefix to read ('train' or 'test').
        Returns None.
        """
        model_scores = self.base_model_output.setdefault(modelname, {})
        model_scores['Time'] = basemodel_scores['fit_time'].mean()
        # Driven by the scorer names so this can never drift from self.scoring.
        for metric_name in self.scoring:
            score_key = '%s_%s' % (score_type, metric_name)
            model_scores[score_key] = basemodel_scores[score_key].mean()

        return None

    def ModelLoop(self, X, y, score_type=None):
        """Cross-validate, log and save every model in ``self.base_model``.

        Each model gets its own nested MLflow run. ``score_type`` is accepted
        but not used: the metrics logged to MLflow are always the test scores.
        """
        # Same default location as before; overridable through std_param.
        model_dir = self.std_param.get('model_dir', "C://mlops_plugin//models")

        for eachModel in self.base_model.values():
            with mlflow.start_run(nested=True):
                basemodel_scores = model_selection.cross_validate(
                    eachModel, X, y,
                    cv=self.cross_val,
                    return_train_score=True,
                    scoring=self.scoring,
                    pre_dispatch="2*n_jobs",
                    return_estimator=True,
                )
                modelname = eachModel.__class__.__name__
                self.base_model_output[modelname] = {}
                self.getScoreDictionary(self.base_model_output, modelname, basemodel_scores, 'train')
                self.getScoreDictionary(self.base_model_output, modelname, basemodel_scores, 'test')
                self.scores_list.append(basemodel_scores)

                mlflow.log_param("Model", modelname)
                mlflow.log_param("time", self.base_model_output[modelname]['Time'])
                score_type_mlflow = 'test'
                mlflow.log_metric("accuracy", basemodel_scores['%s_accuracy' % score_type_mlflow].mean())
                mlflow.log_metric("Precision", basemodel_scores['%s_precision' % score_type_mlflow].mean())
                mlflow.log_metric("Recall", basemodel_scores['%s_recall' % score_type_mlflow].mean())
                mlflow.log_metric("F1", basemodel_scores['%s_f1_score' % score_type_mlflow].mean())

                # Fit once on the full data. The random suffix keeps the save
                # paths of repeated runs from colliding.
                model = eachModel.fit(X, y)
                modelpath = "%s//model-%s%f" % (model_dir, modelname, random.random())
                mlflow.sklearn.save_model(model, modelpath)

                # Reuse the fit above so the recorded importances belong to the
                # model that was saved (previously the model was fitted again).
                if modelname in self.FeatureImportanceAlgo:
                    self.feature_importance[modelname] = model.feature_importances_

    def runBaseLineModel(self, X, y, score_type=None, auto_feature_eng=None, top_feature=None):
        """Run the model loop, optionally restricted to the top-ranked features.

        When ``top_feature`` is truthy the loop runs once on all columns to
        rank them, keeps the ``top_feature`` best columns (stored in
        ``self.important_col``) and runs again on that subset. Otherwise the
        loop runs once on ``X`` as given. ``auto_feature_eng`` is not used.
        """
        if top_feature:
            print ("Building model with only %s important feature" % top_feature)
            # Initial model loop, used only to extract the top features.
            self.ModelLoop(X, y, score_type)
            imp_df = self.getFeatureImportance(self.getFeatureImportanceDF(X, self.feature_importance))
            important_col = list(imp_df[:top_feature].index)
            self.important_col = important_col
            X = X[important_col]
            self.ModelLoop(X, y, score_type)
        else:
            print ("Building model without any important feature")
            self.ModelLoop(X, y, score_type)

    def getFeatureImportanceDF(self, X, feature_importance_dict, important_col=None):
        """Build a features-by-models frame from the recorded importances.

        Rows are labelled with ``important_col`` when it is given and
        non-empty, otherwise with ``X.columns``.
        """
        feature_names = important_col if important_col else X.columns
        feat_imp_df = pd.DataFrame.from_dict(feature_importance_dict)
        feat_imp_df.index = feature_names
        return feat_imp_df

    def getFeatureImportance(self, feat_imp_df):
        """Min-max scale each model's importances, sum them per feature and rank.

        Returns the scaled frame with an extra 'SumofImp' column, sorted by
        that column in descending order.
        """
        mms = MinMaxScaler()
        # Scale each model's column so the models contribute on a common scale.
        scaled_fi = pd.DataFrame(data=mms.fit_transform(feat_imp_df), columns=feat_imp_df.columns, index=feat_imp_df.index)
        # Add up the scaled importances to get a single score per feature.
        scaled_fi['SumofImp'] = scaled_fi.sum(axis=1)
        ordered_ranking = scaled_fi.sort_values('SumofImp', ascending=False)
        return ordered_ranking

    def getFeatureImportanceGraph(self, ordered_feature_importance_df):
        """Plot the aggregated importances as a horizontal bar chart; return the Axes.

        The frame is also accumulated in ``self.feature_importance_df_sorted``.
        """
        # DataFrame.append returned a new frame (and no longer exists in
        # pandas 2), so the old call stored nothing. Keep the result explicitly.
        if self.feature_importance_df_sorted.empty:
            self.feature_importance_df_sorted = ordered_feature_importance_df.copy()
        else:
            self.feature_importance_df_sorted = pd.concat(
                [self.feature_importance_df_sorted, ordered_feature_importance_df]
            )

        fig, ax = plt.subplots(figsize=(10, 7), dpi=80)
        sns.barplot(data=ordered_feature_importance_df, y=ordered_feature_importance_df.index, x='SumofImp', palette='magma', ax=ax)
        for side in ('right', 'top', 'bottom'):
            ax.spines[side].set_visible(False)
        ax.xaxis.set_visible(False)
        ax.grid(False)
        ax.set_title('Aggregated Feature Importances for Models')
        return ax

In [15]:
# Build the helper from the shared settings and the candidate-model dictionary.
ModelObject = BaseModellingHelper(basic_params,base_model)

In [16]:
# Run the baseline loop and keep the 5 top-ranked features (top_feature=5); %time reports the wall time.
%time ModelObject.runBaseLineModel(x_train,y_train,score_type='test',top_feature=5)
# Close whichever MLflow run is currently active.
mlflow.end_run()

Building model with only 5 important feature
Wall time: 1min 54s


In [40]:
# Dump the baseline results to a pickle for the Streamlit dashboard.
# The path is a raw string so the backslashes are never treated as escapes, and
# the `with` block closes the file even if pickling fails.
import pickle
with open(r"C:\mlops_plugin\src\models\pickle_temp\ModelBaseObject.pickle", "wb") as pickle_out:
    pickle.dump(ModelObject.base_model_output, pickle_out)

In [42]:
# Dump the ranked feature-importance frame to a pickle.
# The frame is computed before the file is opened, so a failure while building
# it does not leave an empty pickle behind; `with` guarantees the file is closed.
import pickle
imprtant_feature_df = ModelObject.getFeatureImportance(
    ModelObject.getFeatureImportanceDF(x_train, ModelObject.feature_importance, ModelObject.important_col)
)
with open(r"C:\mlops_plugin\src\models\pickle_temp\ModelBaseObject_featureimpdf.pickle", "wb") as pickle_out:
    pickle.dump(imprtant_feature_df, pickle_out)

In [17]:
# Inspect the fold-averaged cross-validation scores recorded for BaggingClassifier.
ModelObject.base_model_output['BaggingClassifier']

{'Time': 0.06555182933807373,
 'train_accuracy': 0.8481927710843372,
 'train_precision': 0.8279109550199907,
 'train_recall': 0.7581919438499525,
 'train_f1_score': 0.7901486145107256,
 'train_average_precision': 0.7176756355740715,
 'train_balanced_accuracy': 0.8300568577103077,
 'train_hamming_loss': 0.15180722891566267,
 'train_jaccard_score': 0.6532013719762055,
 'train_log_loss': 5.243284584854182,
 'train_roc_auc_score': 0.8300568577103077,
 'train_zero_one_loss': 75.6,
 'test_accuracy': 0.794392523364486,
 'test_precision': 0.7545932715898391,
 'test_recall': 0.6693334952903214,
 'test_f1_score': 0.7057037196913832,
 'test_average_precision': 0.6261328713512292,
 'test_balanced_accuracy': 0.7699038195664563,
 'test_hamming_loss': 0.20560747663551404,
 'test_jaccard_score': 0.5456602160703631,
 'test_log_loss': 7.101496421919821,
 'test_roc_auc_score': 0.7699038195664564,
 'test_zero_one_loss': 44.0}

In [None]:
#! cd C:\mlops_plugin\src\visualization
#! streamlit run baseline_model.py
# In a second terminal tab:
#! cd C:\mlops_plugin\src\visualization
#! streamlit run model_interpretation.py

In [None]:
# Production API: example input record.
# The keys are the same seven feature columns shown in the x_train.head() output above.
{
  "Pclass": 3,
  "Sex": 1,
  "Embarked": 1,
  "Title": 2,
  "IsAlone": 1,
  "FareBand": 0,
  "AgeBand": 0
}


In [18]:
# Save the training features to CSV (the DataFrame index is written as the first column).
x_train.to_csv("C:/mlops_plugin/src/data/x_train.csv")

In [40]:
# After all the analysis, train the final model and write it where the
# deployment service expects it.
model = BaggingClassifier()
model.fit(x_train, y_train)
# Use a context manager so the file handle is flushed and closed; the previous
# inline open() was never closed explicitly.
with open("C://mlops_plugin//_deploy//model//model.pkl", 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# To serve the model, change into C:\mlops_plugin\_deploy and run:
#   uvicorn main:app
# This starts the model server at http://127.0.0.1:8000