In [48]:
import sys
import os

# Get the current working directory
current_dir = os.getcwd()

# Add the parent directory to sys.path
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
sys.path.insert(0, parent_dir)

In [49]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
import plotly.express as px
from configuration.config import DATA_PATH

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score, precision_recall_fscore_support

In [51]:
# import train/test splits for modeling the reduced and robust scaled data
X_train_rr_c = pd.read_csv(DATA_PATH + 'modeling/X_train_rr_c.csv')
X_test_rr_c = pd.read_csv(DATA_PATH + 'modeling/X_test_rr_c.csv')

# import train/test splits for modeling the reduced and power transformed data
X_train_rp_c = pd.read_csv(DATA_PATH + 'modeling/X_train_rp_c.csv')
X_test_rp_c = pd.read_csv(DATA_PATH + 'modeling/X_test_rp_c.csv')

y_train_c = pd.read_csv(DATA_PATH + 'modeling/y_train_c.csv').values.ravel()
y_test_c = pd.read_csv(DATA_PATH + 'modeling/y_test_c.csv').values.ravel()

In [52]:
logreg = LogisticRegression(penalty='l2', max_iter=500, multi_class='ovr', solver='liblinear')
logreg.fit(X_train_rr_c, y_train_c)

In [53]:
logreg_pred_prob = pd.DataFrame(logreg.predict_proba(X_test_rr_c))
logreg_pred = pd.DataFrame(logreg.predict(X_test_rr_c))
y_test_df = pd.DataFrame(y_test_c)


In [54]:
logreg_pred = pd.concat([y_test_df, logreg_pred_prob, logreg_pred], axis=1, ignore_index=True)

In [55]:
print(logreg_pred.shape)

(402, 10)


In [56]:
logreg_pred.columns = ['Actual','200K - 1M', '1M - 1.8M', '1.8M - 2.6M', '2.6M - 3.4M', '3.4M - 4.2M', '4.2M - 5M', '5M - 8.7M', 'Outlier','Y_pred']
logreg_pred.head()

Unnamed: 0,Actual,200K - 1M,1M - 1.8M,1.8M - 2.6M,2.6M - 3.4M,3.4M - 4.2M,4.2M - 5M,5M - 8.7M,Outlier,Y_pred
0,1.8M - 2.6M,0.173536,0.108105,0.21173,0.005694,0.226561,0.116303,0.135106,0.022965,3.4M - 4.2M
1,1.8M - 2.6M,0.18822,0.007658,0.189807,0.001303,0.295997,0.037974,0.273167,0.005874,3.4M - 4.2M
2,1.8M - 2.6M,0.069188,0.022921,0.076963,0.000629,0.251215,0.0893,0.482704,0.00708,5M - 8.7M
3,3.4M - 4.2M,0.207846,0.023187,0.206564,0.013034,0.21434,0.089532,0.231065,0.014431,5M - 8.7M
4,Outlier,0.003957,0.016893,0.075568,0.000138,0.135889,0.092842,0.624756,0.049958,5M - 8.7M


In [57]:
pd.crosstab(logreg_pred['Y_pred'],logreg_pred['Actual']) 
pd.crosstab(logreg_pred['Y_pred'],logreg_pred['Actual']).apply(lambda x: x/x.sum(), axis=1).round(2)


Actual,1.8M - 2.6M,1M - 1.8M,2.6M - 3.4M,200K - 1M,3.4M - 4.2M,4.2M - 5M,5M - 8.7M,Outlier
Y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1.8M - 2.6M,0.2,0.15,0.34,0.04,0.17,0.04,0.06,0.0
1M - 1.8M,0.26,0.39,0.18,0.08,0.04,0.02,0.03,0.0
2.6M - 3.4M,0.1,0.15,0.41,0.0,0.05,0.15,0.13,0.0
200K - 1M,0.33,0.0,0.33,0.33,0.0,0.0,0.0,0.0
3.4M - 4.2M,0.25,0.1,0.25,0.0,0.15,0.1,0.15,0.0
4.2M - 5M,0.0,0.0,0.0,0.0,0.12,0.12,0.62,0.12
5M - 8.7M,0.06,0.01,0.08,0.03,0.15,0.12,0.41,0.14
Outlier,0.06,0.0,0.0,0.0,0.0,0.12,0.19,0.62


In [47]:
# Modeling
dt_model = DecisionTreeClassifier(random_state=0).fit(X_train_rp_c, y_train_c)

# Prediction
def Prediction(model):
    pred_prob = pd.DataFrame(model.predict_proba(X_test_rp_c))
    pred = pd.DataFrame(model.predict(X_test_rp_c))
    pred = pd.concat([y_test_df,pred_prob,pred],axis=1, ignore_index=True)
    pred.columns = ['Actual','200K - 1M', '1M - 3.8M', '3.8M - 4.6M', '4.6M - 6M', '6M - 8.7M', 'Outlier', 'Y_pred']
    return(pred)
dt_pred = Prediction(dt_model)  

# Actual vs. Prediction
pd.crosstab(dt_pred['Y_pred'],dt_pred['Actual']).apply(lambda x: x/x.sum(), axis=1).round(2)

Actual,1M - 3.8M,200K - 1M,3.8M - 4.6M,4.6M - 6M,6M - 8.7M,Outliers
Y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1M - 3.8M,0.76,0.06,0.07,0.07,0.03,0.01
200K - 1M,0.8,0.16,0.0,0.0,0.04,0.0
3.8M - 4.6M,0.4,0.0,0.2,0.34,0.03,0.03
4.6M - 6M,0.44,0.0,0.22,0.19,0.11,0.03
6M - 8.7M,0.19,0.0,0.06,0.25,0.29,0.21
Outliers,0.23,0.0,0.04,0.08,0.19,0.46
