In [1]:
import mlflow as mlf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score


* 'schema_extra' has been renamed to 'json_schema_extra'


In [2]:
# Columns to read from the CSV, in three groups: the account/demographic
# fields, the PAY_* fields (numbered 0 and 2..6 in this selection), and the
# 'default.payment.next.month' column that is later used as the target.
rel_columns = (
    ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE']
    + [f'PAY_{idx}' for idx in (0, 2, 3, 4, 5, 6)]
    + ['default.payment.next.month']
)

In [3]:
# Load the credit-card CSV from the working directory, keeping only the
# columns named in rel_columns.
df_ccd = pd.read_csv('./UCI_Credit_Card.csv', usecols=rel_columns)

In [4]:
# Preview the first 10 rows of the loaded frame.
df_ccd.head(10)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,default.payment.next.month
0,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,1
1,120000.0,2,2,2,26,-1,2,0,0,0,2,1
2,90000.0,2,2,2,34,0,0,0,0,0,0,0
3,50000.0,2,2,1,37,0,0,0,0,0,0,0
4,50000.0,1,2,1,57,-1,0,-1,0,0,0,0
5,50000.0,1,1,2,37,0,0,0,0,0,0,0
6,500000.0,1,1,2,29,0,0,0,0,0,0,0
7,100000.0,2,2,2,23,0,-1,-1,0,0,-1,0
8,140000.0,2,3,1,28,0,0,2,0,0,0,0
9,20000.0,1,3,2,35,-2,-2,-2,-2,-1,-1,0


In [5]:
# Check the dtype pandas inferred for each column.
df_ccd.dtypes

LIMIT_BAL                     float64
SEX                             int64
EDUCATION                       int64
MARRIAGE                        int64
AGE                             int64
PAY_0                           int64
PAY_2                           int64
PAY_3                           int64
PAY_4                           int64
PAY_5                           int64
PAY_6                           int64
default.payment.next.month      int64
dtype: object

In [6]:
# Row count, per-column non-null counts, dtypes and memory footprint.
df_ccd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   LIMIT_BAL                   30000 non-null  float64
 1   SEX                         30000 non-null  int64  
 2   EDUCATION                   30000 non-null  int64  
 3   MARRIAGE                    30000 non-null  int64  
 4   AGE                         30000 non-null  int64  
 5   PAY_0                       30000 non-null  int64  
 6   PAY_2                       30000 non-null  int64  
 7   PAY_3                       30000 non-null  int64  
 8   PAY_4                       30000 non-null  int64  
 9   PAY_5                       30000 non-null  int64  
 10  PAY_6                       30000 non-null  int64  
 11  default.payment.next.month  30000 non-null  int64  
dtypes: float64(1), int64(11)
memory usage: 2.7 MB


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df_ccd.describe()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,default.payment.next.month
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,167484.322667,1.603733,1.853133,1.551867,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,-0.2662,-0.2911,0.2212
std,129747.661567,0.489129,0.790349,0.52197,9.217904,1.123802,1.197186,1.196868,1.169139,1.133187,1.149988,0.415062
min,10000.0,1.0,0.0,0.0,21.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,0.0
25%,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
50%,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1000000.0,2.0,6.0,3.0,79.0,8.0,8.0,8.0,8.0,8.0,8.0,1.0


In [8]:
# Replace every negative value in the PAY_* columns with 0 and leave values
# >= 0 unchanged. Series.clip(lower=0) does this in one vectorized call per
# column instead of invoking a Python lambda once per row.
# NOTE: this overwrites the columns of df_ccd in place, so outputs displayed
# by earlier cells (which still show negative values) are stale after it runs.
for col in ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']:
    df_ccd[col] = df_ccd[col].clip(lower=0)

In [9]:
# Display the frame after the PAY_* clean-up in the previous cell.
df_ccd

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,default.payment.next.month
0,20000.0,2,2,1,24,2,2,0,0,0,0,1
1,120000.0,2,2,2,26,0,2,0,0,0,2,1
2,90000.0,2,2,2,34,0,0,0,0,0,0,0
3,50000.0,2,2,1,37,0,0,0,0,0,0,0
4,50000.0,1,2,1,57,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
29995,220000.0,1,3,1,39,0,0,0,0,0,0,0
29996,150000.0,1,3,2,43,0,0,0,0,0,0,0
29997,30000.0,1,2,2,37,4,3,2,0,0,0,1
29998,80000.0,1,3,1,41,1,0,0,0,0,0,1


In [10]:
# Every column except the last one; the last column is
# 'default.payment.next.month', so this is the feature part of the frame.
df_ccd.iloc[:, 0:-1]

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6
0,20000.0,2,2,1,24,2,2,0,0,0,0
1,120000.0,2,2,2,26,0,2,0,0,0,2
2,90000.0,2,2,2,34,0,0,0,0,0,0
3,50000.0,2,2,1,37,0,0,0,0,0,0
4,50000.0,1,2,1,57,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
29995,220000.0,1,3,1,39,0,0,0,0,0,0
29996,150000.0,1,3,2,43,0,0,0,0,0,0
29997,30000.0,1,2,2,37,4,3,2,0,0,0
29998,80000.0,1,3,1,41,1,0,0,0,0,0


In [11]:
# Hold out 20% of the rows as the test set (fixed seed for reproducibility).
# The target is selected by name rather than by position, so the split does
# not silently depend on the column order of the CSV: everything except the
# target column is a feature.
target_col = 'default.payment.next.month'
X_train, X_test, y_train, y_test = train_test_split(
    df_ccd.drop(columns=target_col),
    df_ccd[target_col],
    test_size=0.2,
    random_state=100
)

In [12]:
# Split the remaining training data again to obtain a validation set:
# 25% of the 80% left after the test split, i.e. 20% of all rows, which
# leaves 60% of the rows for training.
# NOTE: X_train / y_train are rebound here, so re-running this cell on its own
# splits the already-reduced training set again; re-run the previous split
# cell first.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train, 
    test_size=0.25,
    random_state=100
)

In [15]:
# Make "Random-Forest" the active MLflow experiment (it is created if it does
# not exist yet); the runs started below are logged under it.
mlf.set_experiment("Random-Forest")

2023/11/21 06:37:25 INFO mlflow.tracking.fluent: Experiment with name 'Random-Forest' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/devendragovil/AllClassesMaterial/MLOps_Nico/Project/mlops-demo-project/mlruns/184022289043057730', creation_time=1700577445567, experiment_id='184022289043057730', last_update_time=1700577445567, lifecycle_stage='active', name='Random-Forest', tags={}>

In [16]:
# Grid over n_estimators x max_depth for a RandomForestClassifier. Each
# combination is trained on the training split and logged as its own MLflow
# run ("run-<count>") with its parameters and train/validation ROC AUC.
count = 0
n_estimators_permute = [100, 250, 500, 1000]
max_depth_permute = [None, 3, 8]
for n in n_estimators_permute:
    for d in max_depth_permute:
        # The context manager ends the run when the block exits, so no
        # explicit mlf.end_run() is needed afterwards.
        with mlf.start_run(run_name=f"run-{count}"):
            mlf.log_param(key='n_estimators', value=n)
            mlf.log_param(key='max_depth', value=d)

            rf_clf = RandomForestClassifier(n_estimators=n, max_depth=d, random_state=100)
            rf_clf.fit(X_train, y_train)

            # ROC AUC must be computed from a continuous score. Hard 0/1
            # labels from predict() reduce the curve to a single threshold,
            # so use the predicted probability of the positive class
            # (column 1 of predict_proba for the 0/1 target).
            proba_val = rf_clf.predict_proba(X_val)[:, 1]
            proba_train = rf_clf.predict_proba(X_train)[:, 1]
            auc_val = roc_auc_score(y_val, proba_val)
            auc_train = roc_auc_score(y_train, proba_train)

            mlf.log_metric(key='roc_auc_val', value=auc_val)
            mlf.log_metric(key='roc_auc_train', value=auc_train)
        count += 1

In [17]:
# Switch the active MLflow experiment to "LogisticRegression" (created if it
# does not exist yet) so the following runs are logged separately.
mlf.set_experiment(experiment_name="LogisticRegression")

2023/11/21 06:39:44 INFO mlflow.tracking.fluent: Experiment with name 'LogisticRegression' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/devendragovil/AllClassesMaterial/MLOps_Nico/Project/mlops-demo-project/mlruns/218007512282501820', creation_time=1700577584391, experiment_id='218007512282501820', last_update_time=1700577584391, lifecycle_stage='active', name='LogisticRegression', tags={}>

In [20]:
# Grid over penalty x max_iter for LogisticRegression. Each combination is
# trained on the training split and logged as its own MLflow run
# ("run-<count>") with its parameters and train/validation ROC AUC.
count = 0
penalties = ['l2', None]
max_iter_permute = [100, 200, 400, 1000]
for penalty in penalties:
    for max_iter in max_iter_permute:
        # The context manager ends the run when the block exits, so no
        # explicit mlf.end_run() is needed afterwards.
        with mlf.start_run(run_name=f"run-{count}"):
            mlf.log_param(key='penalty', value=penalty)
            mlf.log_param(key='max_iter', value=max_iter)

            lr_clf = LogisticRegression(penalty=penalty, max_iter=max_iter, random_state=100)
            lr_clf.fit(X_train, y_train)

            # ROC AUC must be computed from a continuous score. Hard 0/1
            # labels from predict() reduce the curve to a single threshold,
            # so use the predicted probability of the positive class
            # (column 1 of predict_proba for the 0/1 target).
            proba_val = lr_clf.predict_proba(X_val)[:, 1]
            proba_train = lr_clf.predict_proba(X_train)[:, 1]
            auc_val = roc_auc_score(y_val, proba_val)
            auc_train = roc_auc_score(y_train, proba_train)

            mlf.log_metric(key='roc_auc_val', value=auc_val)
            mlf.log_metric(key='roc_auc_train', value=auc_train)
        count += 1

In [22]:
# Final model: a random forest with n_estimators=250 and max_depth=8, one of
# the combinations covered by the Random-Forest grid above.
# NOTE(review): presumably picked from the logged MLflow metrics -- the
# comparison itself is not shown in this notebook; confirm.
final_model = RandomForestClassifier(n_estimators=250, max_depth=8, random_state=100)

In [23]:
# Fit on the training split only; the validation and test splits are not
# part of the data passed here.
final_model.fit(X_train, y_train)