In [1]:
import numpy as np
import pandas as pd

### Import the data and turn the columns to lowercase to make them easier to work with

In [2]:
train = pd.read_csv('data/application_train.csv')

In [3]:
train.columns = map(str.lower, train.columns)

In [4]:
test = pd.read_csv('data/application_test.csv')

In [5]:
test.columns = map(str.lower, test.columns)

### Create copies of the data to use for the first model

In [6]:
train_1 = train.copy()
test_1 = test.copy()

In [7]:
train_1.shape

(307511, 122)

In [8]:
len(train_1)

307511

### Getting rid of the extra categorical values that aren't in both datasets
                          (see EDA workbook)

#### First, find the indexes

In [9]:
delete_from_train_columns = ['code_gender', 'name_income_type', 'name_family_status']
train_values_to_delete = ['XNA', 'Maternity leave', 'Unknown']

train_indexes_to_delete = []
for i, e in zip(delete_from_train_columns, train_values_to_delete):
    bad_train_data = train_1[i] == e
    train_indexes_to_delete.extend(train_1.index[bad_train_data].tolist())
print((f'Train indexes: {train_indexes_to_delete}'))



delete_from_test_columns = ['name_income_type', 'organization_type']
test_values_to_delete = ['Unemployed', 'XNA']

test_indexes_to_delete = []
for i, e in zip(delete_from_test_columns, test_values_to_delete):
    bad_test_data = test_1[i] == e
    test_indexes_to_delete.extend(test_1.index[bad_test_data].tolist())
print((f'Test indexes (length): {len(test_indexes_to_delete)} rows'))

# Dropped from test: name_income_type_Unemployed
# Dropped from test: organization_type_XNA

Train indexes: [35657, 38566, 83382, 189640, 48949, 109612, 218269, 291432, 295458, 41982, 187348]
Test indexes (length): 9275 rows


In [10]:
# test_1['name_income_type']

#### Check that we have the right rows
**Note 'XNA', 'Maternity leave', and 'Unknown' in train data**

In [11]:
train_1.iloc[train_indexes_to_delete].loc[:,['code_gender', 'name_income_type', 'name_family_status']]



Unnamed: 0,code_gender,name_income_type,name_family_status
35657,XNA,Working,Married
38566,XNA,Working,Married
83382,XNA,Working,Married
189640,XNA,Commercial associate,Civil marriage
48949,M,Maternity leave,Married
109612,F,Maternity leave,Married
218269,F,Maternity leave,Married
291432,F,Maternity leave,Married
295458,F,Maternity leave,Married
41982,M,Commercial associate,Unknown


In [12]:
train_1.drop(train_indexes_to_delete, inplace=True)

#### Note 'XNA' and 'Unknown' in test data
#### However, since there are 9,274 rows with XNA in the organization type column, let's delete just that data later after using get_dummies, so we can preserve the rest of those data in those rows. 

In [13]:
test_1.iloc[test_indexes_to_delete].loc[:,['name_income_type']]
test_1['name_income_type'].value_counts()


Working                 24533
Commercial associate    11402
Pensioner                9273
State servant            3532
Student                     2
Businessman                 1
Unemployed                  1
Name: name_income_type, dtype: int64

In [14]:
test_1.iloc[test_indexes_to_delete].loc[:,['organization_type']]
test_1['organization_type'].value_counts()


Business Entity Type 3    10840
XNA                        9274
Self-employed              5920
Other                      2707
Medicine                   1716
Government                 1508
Business Entity Type 2     1479
Trade: type 7              1303
School                     1287
Construction               1039
Kindergarten               1038
Business Entity Type 1      887
Transport: type 4           884
Trade: type 3               578
Military                    530
Industry: type 9            499
Industry: type 3            489
Security                    472
Transport: type 2           448
Police                      441
Housing                     435
Industry: type 11           416
Bank                        374
Security Ministries         341
Services                    302
Postal                      294
Agriculture                 292
Restaurant                  284
Trade: type 2               242
University                  221
Industry: type 7            217
Industry

In [15]:
# test_1.drop(test_indexes_to_delete, inplace=True)

### Getting rid of all the rows that have more than 50% null values

In [16]:
null_threshold = 0.5

column_list = train_1.isnull().sum(axis = 0) / train_1.count(axis = 0)

train_1 = train_1.loc[:,list(column_list.apply(lambda x: x < null_threshold))]

#column_list.apply(lambda x: x < 0.5).head(50)

### Dropping null values from the columns with categorical data

In [17]:
object_columns = [column for column in train_1.columns if train_1[column].dtype=='object']

# train_copy_1 = train_copy_1.loc[train_copy_1[object_columns].notnull()]

train_1.dropna(subset=object_columns, inplace=True)
# train_copy_1['amt_req_credit_bureau_hour'].value_counts()

In [18]:
train_1.shape

(210207, 73)

In [19]:
test_1.shape

(48744, 121)

In [20]:
# Create a list of the target values to use as labels
labels = train_1['target'].values

In [21]:
len(labels)

210207

In [22]:
# Need to make the columns the same in the training and testing datasets
train_columns = list(train_1.columns)
test_columns = list(test_1.columns)

In [23]:
for column in train_columns:
    if column not in test_columns:
#         if column == 'target':
#             pass
#         else:
        print(f'Dropped from train: {column}')
        train_1.drop(column, axis=1, inplace=True)
        
for column in test_columns:
    if column not in train_columns:
        print(f'Dropped from test: {column}')
        test_1.drop(column, axis=1, inplace=True)

Dropped from train: target
Dropped from test: own_car_age
Dropped from test: ext_source_1
Dropped from test: apartments_avg
Dropped from test: basementarea_avg
Dropped from test: years_beginexpluatation_avg
Dropped from test: years_build_avg
Dropped from test: commonarea_avg
Dropped from test: elevators_avg
Dropped from test: entrances_avg
Dropped from test: floorsmax_avg
Dropped from test: floorsmin_avg
Dropped from test: landarea_avg
Dropped from test: livingapartments_avg
Dropped from test: livingarea_avg
Dropped from test: nonlivingapartments_avg
Dropped from test: nonlivingarea_avg
Dropped from test: apartments_mode
Dropped from test: basementarea_mode
Dropped from test: years_beginexpluatation_mode
Dropped from test: years_build_mode
Dropped from test: commonarea_mode
Dropped from test: elevators_mode
Dropped from test: entrances_mode
Dropped from test: floorsmax_mode
Dropped from test: floorsmin_mode
Dropped from test: landarea_mode
Dropped from test: livingapartments_mode
Dropp

In [24]:
features = list(train_1.columns)
len(features)

72

### Use get_dummies to turn categorical variables into binary columns

In [25]:
train_1.shape

(210207, 72)

In [26]:
test_1.shape

(48744, 72)

In [27]:
train_1 = pd.get_dummies(train_1)

In [28]:
test_1 = pd.get_dummies(test_1)

In [29]:
train_1.shape

(210207, 179)

In [30]:
test_1.shape

(48744, 181)

### There are now a couple extra columns in test because it had categorical variables that train didn't have.
**These should be the variables in organization type and income type that we decided not to delete the rows of before**

In [31]:
train_columns2 = list(train_1.columns)
test_columns2 = list(test_1.columns)
for column in train_columns2:
    if column not in test_columns2:
#         if column == 'target':
#             pass
#         else:
        print(f'Dropped from train: {column}')
        train_1.drop(column, axis=1, inplace=True)
        
for column in test_columns2:
    if column not in train_columns2:
        print(f'Dropped from test: {column}')
        test_1.drop(column, axis=1, inplace=True)

Dropped from test: name_income_type_Unemployed
Dropped from test: organization_type_XNA


#### Now they're the same size

In [32]:
train_1.shape

(210207, 179)

In [33]:
test_1.shape

(48744, 179)

### Use imputer to replace remaining nulls in numerical data with the mean of their respective columns

In [34]:
train_columns_3 = list(train_1.columns)
test_columns_3 = list(test_1.columns)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

imputer.fit(train_1)
train_1 = pd.DataFrame(imputer.transform(train_1))
train_1.columns = train_columns_3

imputer.fit(test_1)
test_1 = pd.DataFrame(imputer.transform(test_1))
test_1.columns = test_columns_3

In [35]:
train_1.head()

Unnamed: 0,sk_id_curr,cnt_children,amt_income_total,amt_credit,amt_annuity,amt_goods_price,region_population_relative,days_birth,days_employed,days_registration,...,organization_type_Trade: type 3,organization_type_Trade: type 4,organization_type_Trade: type 5,organization_type_Trade: type 6,organization_type_Trade: type 7,organization_type_Transport: type 1,organization_type_Transport: type 2,organization_type_Transport: type 3,organization_type_Transport: type 4,organization_type_University
0,100002.0,0.0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461.0,-637.0,-3648.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100003.0,0.0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765.0,-1188.0,-1186.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004.0,0.0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046.0,-225.0,-4260.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006.0,0.0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005.0,-3039.0,-9833.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100007.0,0.0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932.0,-3038.0,-4311.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Double check if any nulls in train data
print('Column  |   Nulls')
for column in list(train_1.columns):
    if train_1[column].isnull().sum() > 0:
        print(column,  '|',  train_1[column].isnull().sum())

Column  |   Nulls


In [37]:
train_1.head()

Unnamed: 0,sk_id_curr,cnt_children,amt_income_total,amt_credit,amt_annuity,amt_goods_price,region_population_relative,days_birth,days_employed,days_registration,...,organization_type_Trade: type 3,organization_type_Trade: type 4,organization_type_Trade: type 5,organization_type_Trade: type 6,organization_type_Trade: type 7,organization_type_Transport: type 1,organization_type_Transport: type 2,organization_type_Transport: type 3,organization_type_Transport: type 4,organization_type_University
0,100002.0,0.0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461.0,-637.0,-3648.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100003.0,0.0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765.0,-1188.0,-1186.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004.0,0.0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046.0,-225.0,-4260.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006.0,0.0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005.0,-3039.0,-9833.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100007.0,0.0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932.0,-3038.0,-4311.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
train_1.dtypes.value_counts()

float64    179
dtype: int64

In [39]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [40]:
scaler.fit(train_1)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [41]:
scaler.transform(train_1)
scaler.transform(test_1)

array([[-2.80699391e-06,  0.00000000e+00,  9.34820326e-04, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 8.42098172e-06,  0.00000000e+00,  6.27060548e-04, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 3.08769330e-05,  0.00000000e+00,  1.51186991e-03, ...,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       ...,
       [ 9.99910176e-01,  5.26315789e-02,  1.51186991e-03, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 9.99912983e-01,  0.00000000e+00,  1.70421977e-03, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 9.99985965e-01,  0.00000000e+00,  9.34820326e-04, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

In [42]:
train_1.shape

(210207, 179)

In [43]:
test_1.shape

(48744, 179)

In [44]:
train_1.head()

Unnamed: 0,sk_id_curr,cnt_children,amt_income_total,amt_credit,amt_annuity,amt_goods_price,region_population_relative,days_birth,days_employed,days_registration,...,organization_type_Trade: type 3,organization_type_Trade: type 4,organization_type_Trade: type 5,organization_type_Trade: type 6,organization_type_Trade: type 7,organization_type_Transport: type 1,organization_type_Transport: type 2,organization_type_Transport: type 3,organization_type_Transport: type 4,organization_type_University
0,100002.0,0.0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461.0,-637.0,-3648.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100003.0,0.0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765.0,-1188.0,-1186.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004.0,0.0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046.0,-225.0,-4260.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006.0,0.0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005.0,-3039.0,-9833.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100007.0,0.0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932.0,-3038.0,-4311.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
test_1.head()

Unnamed: 0,sk_id_curr,cnt_children,amt_income_total,amt_credit,amt_annuity,amt_goods_price,region_population_relative,days_birth,days_employed,days_registration,...,organization_type_Trade: type 3,organization_type_Trade: type 4,organization_type_Trade: type 5,organization_type_Trade: type 6,organization_type_Trade: type 7,organization_type_Transport: type 1,organization_type_Transport: type 2,organization_type_Transport: type 3,organization_type_Transport: type 4,organization_type_University
0,100001.0,0.0,135000.0,568800.0,20560.5,450000.0,0.01885,-19241.0,-2329.0,-5170.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005.0,0.0,99000.0,222768.0,17370.0,180000.0,0.035792,-18064.0,-4469.0,-9118.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100013.0,0.0,202500.0,663264.0,69777.0,630000.0,0.019101,-20038.0,-4458.0,-2175.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,100028.0,2.0,315000.0,1575000.0,49018.5,1575000.0,0.026392,-13976.0,-1866.0,-2000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100038.0,1.0,180000.0,625500.0,32067.0,625500.0,0.010032,-13040.0,-2191.0,-4000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
test_1.shape

(48744, 179)

### One more check for nulls in test data

In [47]:
test_1.isnull().sum()

sk_id_curr                             0
cnt_children                           0
amt_income_total                       0
amt_credit                             0
amt_annuity                            0
                                      ..
organization_type_Transport: type 1    0
organization_type_Transport: type 2    0
organization_type_Transport: type 3    0
organization_type_Transport: type 4    0
organization_type_University           0
Length: 179, dtype: int64

In [48]:
print('Column  |   Nulls')
for column in list(test_1.columns):
    if test_1[column].isnull().sum() > 0:
        print(column,  '|',  test_1[column].isnull().sum())

Column  |   Nulls


#### Making a copy to run a traintestsplit and determine accuracy of models 1 and 2

In [49]:
# For logistic regression
train_1_split = train_1.copy()

# For logistic regression with gridsearch
train_2_split = train_1.copy()

# For random forrest
train_3_split = train_1.copy()

# For
train_4_split = train_1.copy()

#### Making a copy of the data so that we can rerun the regression using different models

In [50]:
# For logistic regression with gridsearch
train_2 = train_1.copy()
test_2 = test_1.copy()

# For random forrest
train_3 = train_1.copy()
test_3 = train_1.copy()

# For
train_4 = train_1.copy()
test_4 = train_1.copy()

## Running the first version of the model

### This first part is for the official submission

In [51]:
from sklearn.linear_model import LogisticRegression

In [52]:
LRM = LogisticRegression()

In [53]:
print(len(train_1))
print(len(labels))

210207
210207


In [54]:
LRM.fit(train_1, labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [55]:
LRM_prediction = LRM.predict_proba(test_1)[:, 1]

In [56]:
solution_1 = pd.DataFrame(test_1[['sk_id_curr']])
solution_1['target'] = LRM_prediction

# solution_1['target'].value_counts()

In [57]:
solution_1.head()

Unnamed: 0,sk_id_curr,target
0,100001.0,0.090598
1,100005.0,0.096364
2,100013.0,0.059543
3,100028.0,0.030318
4,100038.0,0.081135


### Now run version 1 on the train_test_split of the training dataset to check performance

In [58]:
from sklearn.model_selection import train_test_split
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(train_1_split, labels, test_size=0.25, random_state=42)

In [59]:
LRM.fit(X_train_1, y_train_1)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [60]:
predictions_1 = LRM.predict(X_test_1)

In [61]:
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
print("Model 1 (Basic logistic regression) Classification Report:")
print(classification_report(y_test_1, predictions_1,
                            target_names=['No default', 'Default']))
print('\n\n')

print("Model 1 (Basic logistic regression) Accuracy Score:")
print(accuracy_score(y_test_1, LRM.predict(X_test_1)))
print('\n\n')

print("Model 1 (Basic logistic regression) AUC Score:")
print(roc_auc_score(y_test_1, LRM.predict_proba(X_test_1)[:,1]))

Model 1 (Basic logistic regression) Classification Report:
              precision    recall  f1-score   support

  No default       0.91      1.00      0.95     47895
     Default       0.00      0.00      0.00      4657

    accuracy                           0.91     52552
   macro avg       0.46      0.50      0.48     52552
weighted avg       0.83      0.91      0.87     52552




Model 1 (Basic logistic regression) Accuracy Score:
0.9113830111128025



Model 1 (Basic logistic regression) AUC Score:
0.6264590404852538


  'precision', 'predicted', average, warn_for)


# Now we're going to try another logistic regression using grid search to improve the regression parameters

### Below is the official submission of the second model version

In [63]:
from sklearn.model_selection import GridSearchCV

grid_values = {'penalty': ['l1', 'l2'],'C':[0.001,.009,0.01,.09,1,5,10,25]}
grid_LRM = GridSearchCV(LRM, param_grid = grid_values,scoring = 'recall', n_jobs=-1)
grid_LRM.fit(train_1, labels)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.001, 0.009, 0.01, 0.09, 1, 5, 10, 25],
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='recall', verbose=0)

In [64]:
grid_LRM_prediction = grid_LRM.predict_proba(test_2)[:, 1]

In [65]:
solution_2 = pd.DataFrame(test_1[['sk_id_curr']])
solution_2['target'] = grid_LRM_prediction

In [66]:
solution_2.head()

Unnamed: 0,sk_id_curr,target
0,100001.0,0.072988
1,100005.0,0.221456
2,100013.0,0.038188
3,100028.0,0.032357
4,100038.0,0.100813


### Now rerun the second model on a train_test_split to check the performance

In [85]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(train_2_split, labels, test_size=0.25, random_state=42)

In [86]:
grid_values = {'penalty': ['l1', 'l2'],'C':[0.001,.009,0.01,.09,1,5,10,25]}
LRM_split_grid = GridSearchCV(LRM, param_grid = grid_values,scoring = 'roc_auc', n_jobs=-1)
LRM_split_grid.fit(X_train_2, y_train_2)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.001, 0.009, 0.01, 0.09, 1, 5, 10, 25],
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=0)

In [87]:
predictions_2 = LRM_split_grid.predict(X_test_2)

In [88]:
print("Model 2 (Logistic regression using grid search) Classification Report:")
print(classification_report(y_test_2, predictions_2,
                            target_names=['No default', 'Default']))
print('\n\n')

print("Model 2 (Logistic regression using grid search) Accuracy Score:")
print(accuracy_score(y_test_2, LRM_split_grid.predict(X_test_2)))
print('\n\n')

print("Model 2 (Logistic regression using grid search) AUC Score:")
print(roc_auc_score(y_test_2, LRM_split_grid.predict_proba(X_test_2)[:,1]))

Model 2 (Logistic regression using grid search) Classification Report:
              precision    recall  f1-score   support

  No default       0.91      1.00      0.95     47895
     Default       0.48      0.01      0.02      4657

    accuracy                           0.91     52552
   macro avg       0.69      0.50      0.49     52552
weighted avg       0.87      0.91      0.87     52552




Model 2 (Logistic regression using grid search) Accuracy Score:
0.91128786725529



Model 2 (Logistic regression using grid search) AUC Score:
0.7433466885893991


**The performance in model 2 is better than model 1**

## Version 3--using a random forrest algorithm

In [71]:
from sklearn.ensemble import RandomForestClassifier

In [72]:
RFM = RandomForestClassifier(n_estimators=100, max_leaf_nodes=None, n_jobs=None)

In [73]:
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(train_3_split, labels, test_size=0.25, random_state=42)

In [74]:
RFM.fit(X_train_3, y_train_3)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [75]:
RFM_predictions = RFM.predict(X_test_3)

In [76]:
print("Model 3 (random forrest) Classification Report:")
print(classification_report(y_test_3, RFM_predictions,
                            target_names=['No default', 'Default']))
print('\n\n')

print("Model 3 (random forrest) Accuracy Score:")
print(accuracy_score(y_test_1, RFM_predictions))
print('\n\n')

print("Model 3 (random forrest) AUC Score:")
print(roc_auc_score(y_test_3, RFM.predict_proba(X_test_3)[:,1]))

Model 3 (random forrest) Classification Report:
              precision    recall  f1-score   support

  No default       0.91      1.00      0.95     47895
     Default       1.00      0.00      0.00      4657

    accuracy                           0.91     52552
   macro avg       0.96      0.50      0.48     52552
weighted avg       0.92      0.91      0.87     52552




Model 3 (random forrest) Accuracy Score:
0.9114210686558076



Model 3 (random forrest) AUC Score:
0.7134299241798866


## Version 4--random forrest algorithm with gridsearch

In [77]:
X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(train_4_split, labels, test_size=0.25, random_state=42)

In [78]:
RFM_param_grid = { 
    'n_estimators': [20, 50, 100],
    'max_features': [1,3,10, 'auto'], #['auto', 'sqrt', 'log2'],
    'max_depth' : [5,8,10,15],
#     'min_samples_leaf' : [1,3,5],
    'criterion' : ['gini', 'entropy'],
    'bootstrap' : [True, False]
}

In [79]:
RFM_2 = RandomForestClassifier()
grid_RFM = GridSearchCV(RFM_2, param_grid=RFM_param_grid, scoring='roc_auc', cv=3, n_jobs=-1, verbose=3)


In [80]:
print(len(X_train_4))
print(len(y_train_4))

157655
157655


In [81]:
grid_RFM.fit(X_train_4, y_train_4)

Fitting 3 folds for each of 192 candidates, totalling 576 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 576 out of 576 | elapsed: 14.3min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [82]:
grid_RFM_predictions = grid_RFM.predict(X_test_4)

In [83]:
print("Model 4 (random forrest with gridsearch) Classification Report:")
print(classification_report(y_test_4, grid_RFM_predictions,
                            target_names=['No default', 'Default']))
print('\n\n')

print("Model 4 (random forrest with gridsearch) Accuracy Score:")
print(accuracy_score(y_test_4, grid_RFM_predictions))
print('\n\n')

print("Model 4 (random forrest with gridsearch) AUC Score:")
print(roc_auc_score(y_test_4, grid_RFM.predict_proba(X_test_4)[:,1]))

Model 4 (random forrest with gridsearch) Classification Report:
              precision    recall  f1-score   support

  No default       0.91      1.00      0.95     47895
     Default       0.00      0.00      0.00      4657

    accuracy                           0.91     52552
   macro avg       0.46      0.50      0.48     52552
weighted avg       0.83      0.91      0.87     52552




Model 4 (random forrest with gridsearch) Accuracy Score:
0.9113830111128025



Model 4 (random forrest with gridsearch) AUC Score:


  'precision', 'predicted', average, warn_for)


0.7382156672215496
