In [1]:
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.preprocessing import StandardScaler
    
    from sklearn.ensemble import RandomForestClassifier
    
    from sklearn.metrics import classification_report
    import pickle
    import os
    import numpy as np

In [3]:
df_raw = pd.read_csv('Dataset.csv')

In [4]:
df = df_raw.copy()

In [5]:
df

Unnamed: 0,Account,Risk Cat,Avg Pay Days,Delayed Amt %,P30DTOA,90DSO,Decision
0,1,Moderate Risk,-16.98,2.91,0.00,13.18,Approved
1,2,Low Risk,-3.53,30.99,38.78,4.71,Approved
2,3,Moderate Risk,7.63,92.00,0.00,34.26,Rejected
3,4,Moderate Risk,-10.82,12.40,0.00,17.43,Approved
4,5,Moderate Risk,15.83,87.73,0.00,31.75,Rejected
...,...,...,...,...,...,...,...
897,898,Moderate Risk,12.69,72.31,0.00,36.80,Rejected
898,899,Moderate Risk,-7.83,31.91,0.00,0.00,Approved
899,900,Moderate Risk,0.00,0.00,0.00,90.00,Rejected
900,901,Moderate Risk,-24.77,0.02,0.93,20.11,Approved


In [6]:
print(df.columns)

Index(['Account', 'Risk Cat', 'Avg Pay Days', 'Delayed Amt %', 'P30DTOA',
       '90DSO', 'Decision'],
      dtype='object')


In [7]:
df = df.drop('Account', axis=1)

In [8]:
print(df.head())

        Risk Cat  Avg Pay Days  Delayed Amt %  P30DTOA  90DSO  Decision
0  Moderate Risk        -16.98           2.91     0.00  13.18  Approved
1       Low Risk         -3.53          30.99    38.78   4.71  Approved
2  Moderate Risk          7.63          92.00     0.00  34.26  Rejected
3  Moderate Risk        -10.82          12.40     0.00  17.43  Approved
4  Moderate Risk         15.83          87.73     0.00  31.75  Rejected


In [9]:
print(df.describe())

       Avg Pay Days  Delayed Amt %     P30DTOA       90DSO
count    902.000000     902.000000  902.000000  902.000000
mean      -2.766807      31.546652    7.698703   22.988337
std       20.130446      34.828138   21.262686   27.098783
min      -62.000000       0.000000   -0.430000    0.000000
25%      -12.630000       0.595000    0.000000    0.000000
50%       -5.940000      17.290000    0.000000   15.475000
75%        2.000000      59.292500    0.250000   32.102500
max      220.000000     100.000000  100.000000   90.000000


In [10]:
print(df.isnull().sum())

Risk Cat         0
Avg Pay Days     0
Delayed Amt %    0
P30DTOA          0
90DSO            0
Decision         0
dtype: int64


In [11]:
X = df.drop('Decision', axis=1)

In [14]:
y = df['Decision'] 
le_enc = LabelEncoder()

In [16]:
y = le_enc.fit_transform(y)
pickle.dump(le_enc, open('le_enc_tgt.sav', 'wb'))

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [18]:
ohe_risk_cat = OneHotEncoder(drop='first')
X_train_risk_enc = pd.DataFrame(ohe_risk_cat.fit_transform(X_train[['Risk Cat']]).toarray())

In [19]:
X_train_risk_enc

Unnamed: 0,0,1,2
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,0.0,0.0,0.0
3,0.0,1.0,0.0
4,1.0,0.0,0.0
...,...,...,...
806,0.0,1.0,0.0
807,0.0,1.0,0.0
808,1.0,0.0,0.0
809,1.0,0.0,0.0


In [20]:
X_train = X_train.reset_index().drop('index', axis=1)
X_train

Unnamed: 0,Risk Cat,Avg Pay Days,Delayed Amt %,P30DTOA,90DSO
0,Moderate Risk,1.57,55.89,0.00,45.09
1,Moderate Risk,3.67,12.50,0.00,0.00
2,High Risk,-22.39,0.00,0.00,42.91
3,Moderate Risk,-25.54,0.41,0.95,32.42
4,Low Risk,-9.00,0.00,0.00,0.00
...,...,...,...,...,...
806,Moderate Risk,-1.42,40.93,29.66,32.45
807,Moderate Risk,6.11,61.14,46.21,90.00
808,Low Risk,-8.35,4.51,0.00,31.40
809,Low Risk,-11.96,2.29,0.00,0.00


In [21]:
X_train = X_train.join(X_train_risk_enc)
X_train

Unnamed: 0,Risk Cat,Avg Pay Days,Delayed Amt %,P30DTOA,90DSO,0,1,2
0,Moderate Risk,1.57,55.89,0.00,45.09,0.0,1.0,0.0
1,Moderate Risk,3.67,12.50,0.00,0.00,0.0,1.0,0.0
2,High Risk,-22.39,0.00,0.00,42.91,0.0,0.0,0.0
3,Moderate Risk,-25.54,0.41,0.95,32.42,0.0,1.0,0.0
4,Low Risk,-9.00,0.00,0.00,0.00,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...
806,Moderate Risk,-1.42,40.93,29.66,32.45,0.0,1.0,0.0
807,Moderate Risk,6.11,61.14,46.21,90.00,0.0,1.0,0.0
808,Low Risk,-8.35,4.51,0.00,31.40,1.0,0.0,0.0
809,Low Risk,-11.96,2.29,0.00,0.00,1.0,0.0,0.0


In [22]:
X_train = X_train.drop('Risk Cat', axis=1)

In [23]:
X_train

Unnamed: 0,Avg Pay Days,Delayed Amt %,P30DTOA,90DSO,0,1,2
0,1.57,55.89,0.00,45.09,0.0,1.0,0.0
1,3.67,12.50,0.00,0.00,0.0,1.0,0.0
2,-22.39,0.00,0.00,42.91,0.0,0.0,0.0
3,-25.54,0.41,0.95,32.42,0.0,1.0,0.0
4,-9.00,0.00,0.00,0.00,1.0,0.0,0.0
...,...,...,...,...,...,...,...
806,-1.42,40.93,29.66,32.45,0.0,1.0,0.0
807,6.11,61.14,46.21,90.00,0.0,1.0,0.0
808,-8.35,4.51,0.00,31.40,1.0,0.0,0.0
809,-11.96,2.29,0.00,0.00,1.0,0.0,0.0


In [24]:
    
    X_test_risk_enc = pd.DataFrame(ohe_risk_cat.transform(X_test[['Risk Cat']]).toarray())
    X_test = X_test.reset_index().drop('index', axis=1)
    X_test = X_test.join(X_test_risk_enc)
    X_test = X_test.drop('Risk Cat', axis=1)
    

In [26]:
    pickle.dump(ohe_risk_cat,  open('ohe_risk_cat.sav', 'wb'))
    
    #Feature Scaling
    std_scaler = StandardScaler()
    X_train = std_scaler.fit_transform(X_train)
    X_test = std_scaler.transform(X_test)
    pickle.dump(std_scaler,  open('std_scaler.sav', 'wb'))

In [28]:
    rf_classifier = RandomForestClassifier(n_estimators = 200, 
                                           criterion = 'entropy', 
                                           random_state = 0)
    rf_classifier.fit(X_train, y_train)
    pickle.dump(rf_classifier,  open('rf_classifier.sav', 'wb'))
    #Validate using test set
    y_pred = rf_classifier.predict(X_test)
    y_prob = rf_classifier.predict_proba(X_test)
    y_conf = [(np.max(pred_err)*100) for pred_err in y_prob]
    print(y_conf)

[73.0, 96.5, 97.0, 75.5, 100.0, 98.5, 99.0, 81.5, 99.5, 99.5, 98.5, 97.0, 99.5, 97.0, 100.0, 80.5, 100.0, 94.5, 54.0, 100.0, 68.0, 100.0, 100.0, 100.0, 99.0, 100.0, 66.5, 74.0, 98.0, 100.0, 70.5, 98.5, 79.5, 90.0, 85.5, 84.0, 90.0, 79.0, 54.50000000000001, 91.5, 100.0, 100.0, 99.0, 99.5, 97.5, 84.0, 100.0, 77.0, 100.0, 51.0, 89.0, 79.5, 100.0, 99.0, 98.5, 98.5, 56.00000000000001, 99.5, 72.5, 78.0, 53.5, 100.0, 99.0, 100.0, 99.5, 80.5, 85.5, 81.0, 86.5, 92.0, 87.5, 94.0, 68.5, 100.0, 51.0, 95.0, 85.0, 100.0, 99.0, 93.0, 86.5, 89.0, 51.0, 99.5, 77.5, 100.0, 99.5, 92.0, 100.0, 66.0, 99.5]


In [29]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93        57
           1       0.57      0.67      0.62        12
           2       0.95      0.86      0.90        22

    accuracy                           0.88        91
   macro avg       0.82      0.82      0.82        91
weighted avg       0.89      0.88      0.88        91

