In [1]:
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

In [3]:
# Load the modelling table (features plus the "loan_status" target column).
# NOTE(review): read_pickle can execute arbitrary code while unpickling, so
# only point this at files produced by a trusted pipeline.
# Alternative input kept for reference — presumably a differently scaled
# variant of the same table; confirm before switching.
# df = pd.read_pickle("../data/prod/scaler_data.pkl")
df = pd.read_pickle("../data/prod/scaler_data_standard.pkl")

In [4]:
# Separate the feature matrix from the target: every column except
# "loan_status" is a feature, "loan_status" itself is the label.
y = df.loc[:, "loan_status"]
X = df.drop(columns=["loan_status"])

In [5]:
y.value_counts()

1.0    967612
0.0    262965
Name: loan_status, dtype: int64

In [6]:
def process_model(ml, X, y, test_size=0.3, random_state=42, stratify=False):
    """Fit ``ml`` on a train split of ``(X, y)`` and print hold-out metrics.

    Parameters
    ----------
    ml : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    X, y : array-like
        Features and labels; forwarded to ``train_test_split``.
    test_size : float or int, default 0.3
        Size of the hold-out set, forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Seed for the split, forwarded to ``train_test_split``.
    stratify : bool, default False
        When True the split is stratified on ``y`` so that both parts keep
        the class proportions (useful for imbalanced labels). The default
        keeps the original, unstratified behaviour.

    Returns
    -------
    estimator
        The same ``ml`` object, fitted on the training split only.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )
    ml.fit(X_train, y_train)

    # All metrics below are computed on the held-out split only.
    y_pred = ml.predict(X_test)
    print("Accuracy => ", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    # f1_score is called with its defaults here.
    print(f1_score(y_test, y_pred))
    return ml

In [7]:
# Baseline: logistic regression with scikit-learn's default settings,
# evaluated on the hold-out split inside process_model.
lm = process_model(ml=LogisticRegression(), X=X, y=y)



Accuracy =>  0.990803794416725
              precision    recall  f1-score   support

         0.0       0.99      0.97      0.98     79056
         1.0       0.99      1.00      0.99    290118

    accuracy                           0.99    369174
   macro avg       0.99      0.98      0.99    369174
weighted avg       0.99      0.99      0.99    369174

[[ 76462   2594]
 [   801 289317]]
0.9941669573165599


In [8]:
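# Disabled experiment: refit LogisticRegressionCV with 2 to 15 cross-validation folds.
# Note: the loop variable c is the fold count passed as cv=, not the
# regularisation strength C, despite the "C => " label printed below.
# for c in range(2,16):
#     lrcv = LogisticRegressionCV(cv=c)
#     lrcv = process_model(lrcv, X, y)
#     print("C => ", c)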

In [9]:
# Predict a class label for every row of X. process_model fitted lm on a
# train split drawn from this same X, so these predictions include rows the
# model was trained on (they are not a pure out-of-sample score).
y_pred = lm.predict(X)

In [10]:
# Class probabilities for every row of X; the result has one column per class.
# NOTE(review): later cells treat column 0 as class 0.0 and column 1 as
# class 1.0 — this should match lm.classes_; confirm.
y_pred_prob = lm.predict_proba(X)

In [11]:
# Loan identifiers for the scored rows. A later cell pairs these ids with the
# labels and predictions purely by position, so this file must be in the same
# row order as the feature table — TODO confirm upstream.
# NOTE(review): read_pickle can execute arbitrary code; load trusted files only.
ids_df = pd.read_pickle("../data/prod/ids.pkl")

In [12]:
ids_df.head()

Unnamed: 0,id
0,68407277
1,68355089
2,68341763
4,68476807
5,68426831


In [13]:
len(ids_df), len(y_pred), len(y_pred_prob)

(1230577, 1230577, 1230577)

In [14]:
type(y_pred), type(y_pred_prob), type(ids_df.values)

(numpy.ndarray, numpy.ndarray, numpy.ndarray)

In [15]:
ids_df.values.flatten()

array([68407277, 68355089, 68341763, ..., '89996426', '90006534',
       '88224441'], dtype=object)

In [16]:
y_pred

array([1., 1., 1., ..., 0., 1., 0.])

In [17]:
y_pred_prob

array([[1.65130525e-01, 8.34869475e-01],
       [3.58642354e-04, 9.99641358e-01],
       [1.27249201e-07, 9.99999873e-01],
       ...,
       [1.00000000e+00, 1.27634945e-65],
       [2.36615535e-06, 9.99997634e-01],
       [1.00000000e+00, 2.23159420e-46]])

In [18]:
y_pred_prob[:, 1]

array([8.34869475e-01, 9.99641358e-01, 9.99999873e-01, ...,
       1.27634945e-65, 9.99997634e-01, 2.23159420e-46])

In [19]:
y_pred_prob[:, 0]

array([1.65130525e-01, 3.58642354e-04, 1.27249201e-07, ...,
       1.00000000e+00, 2.36615535e-06, 1.00000000e+00])

In [20]:
# Columns of the results table, paired row-by-row by position (not by index).
# The raw id array has object dtype with a mix of int and str values, so it is
# coerced to a single numeric dtype here; pd.to_numeric raises ValueError if
# any id is not numeric, which surfaces bad ids before they reach the database.
combine = {
    "id": pd.to_numeric(ids_df.values.flatten()),
    "actual": y.values,
    "predict": y_pred,
    # Column 0 / column 1 of predict_proba: probability of class 0.0 / 1.0.
    "default": y_pred_prob[:, 0],
    "none_default": y_pred_prob[:, 1],
}

In [21]:
# Assemble the results table: one column per key of `combine`, with a default
# integer index (ids stay an ordinary "id" column at this point).
final_df = pd.DataFrame(combine)

In [22]:
final_df.head()

Unnamed: 0,id,actual,predict,default,none_default
0,68407277,1.0,1.0,0.1651305,0.834869
1,68355089,1.0,1.0,0.0003586424,0.999641
2,68341763,1.0,1.0,1.272492e-07,1.0
3,68476807,1.0,1.0,9.765632e-06,0.99999
4,68426831,1.0,1.0,4.739667e-05,0.999953


In [30]:
from sqlalchemy import create_engine

In [31]:
# Connection settings are read from the environment so that no credentials are
# stored in the notebook. LOAN_DB_PASSWORD is required (KeyError if unset);
# the other settings fall back to the values that used to be hard-coded here.
# User and password are URL-escaped so special characters cannot break the URL.
engine = create_engine(
    "mysql+pymysql://{user}:{pw}@{host}/{db}".format(
        user=quote_plus(os.environ.get("LOAN_DB_USER", "admin")),
        pw=quote_plus(os.environ["LOAN_DB_PASSWORD"]),
        host=os.environ.get("LOAN_DB_HOST", "localhost"),
        db=os.environ.get("LOAN_DB_NAME", "loan"),
    )
)

In [37]:
# Promote "id" to the index. Guarded so the cell can be re-run safely: after
# the first run "id" is no longer a column and an unguarded set_index("id")
# would raise KeyError.
if "id" in final_df.columns:
    final_df = final_df.set_index("id")

In [38]:
final_df.head()

Unnamed: 0_level_0,actual,predict,default,none_default
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
68407277,1.0,1.0,0.1651305,0.834869
68355089,1.0,1.0,0.0003586424,0.999641
68341763,1.0,1.0,1.272492e-07,1.0
68476807,1.0,1.0,9.765632e-06,0.99999
68426831,1.0,1.0,4.739667e-05,0.999953


In [39]:
# Write the results to the "loan" table in batches of 1000 rows.
# NOTE(review): if_exists="append" adds the rows again on every execution, so
# re-running this cell duplicates them in the table.
final_df.to_sql(
    name="loan",
    con=engine,
    if_exists="append",
    chunksize=1000,
)