# <img style="float: left; padding-right: 10px; width: 45px" src="https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/iacs.png"> CS109A Introduction to Data Science: 

## Lending Club Project


**Harvard University**<br/>
**Fall 2018**<br/>

<hr style="height:2pt">



In [1]:
#RUN THIS CELL 
import requests
from IPython.core.display import HTML
# Download the course stylesheet and render it so the notebook picks up the CS109 look.
# The timeout stops the kernel from hanging forever if the host cannot be reached.
styles = requests.get("https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/cs109.css", timeout=10).text
HTML(styles)

In [2]:
import numpy as np
import pandas as pd
import datetime
import warnings
warnings.filterwarnings('ignore')

import statsmodels.api as sm
from statsmodels.api import OLS

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.pipeline import make_pipeline
from sklearn.datasets import make_blobs

from sklearn.preprocessing import StandardScaler
import time

import math
from scipy.special import gamma

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline

import seaborn as sns
sns.set()
import matplotlib.style
# matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and later
# dropped the old names, so pick whichever spelling this installation provides
# (falling back to the original name on older versions).
_whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in matplotlib.style.available),
    'seaborn-whitegrid',
)
matplotlib.style.use(_whitegrid_style)
sns.set_style("white")
from IPython.display import display

<hr style="height:2pt">

<div class='theme'> Overview </div>

### This notebook contains the following sections:
* **Part 1**: Cleaning up some data
* **Part 2**: Reducing number of predictors
* **Part 3**: Exploring the data


<div class='exercise'><b> Part 1: Cleaning up some data </b></div>


This notebook uses the cleaned CSV data file downloaded from https://s3.amazonaws.com/109a/data_cleaned_df.csv. <br><br>
We're going to remove some rows that still show NaN values, because these would otherwise interfere with fitting some models later.

In [3]:
# Widen pandas' display limits: show every column and up to 150 rows.
pd.options.display.max_columns = None
pd.set_option('display.max_rows', 150)

In [42]:
# read in the data sets from local files

def _read_loan_stats(path):
    """Read one LoanStats CSV, skipping its first line, with low_memory=False."""
    return pd.read_csv(path, low_memory=False, skiprows=1)

# 2017, one file per quarter
Q1_2017 = _read_loan_stats('../data/LoanStats_2017Q1.csv')
Q2_2017 = _read_loan_stats('../data/LoanStats_2017Q2.csv')
Q3_2017 = _read_loan_stats('../data/LoanStats_2017Q3.csv')
Q4_2017 = _read_loan_stats('../data/LoanStats_2017Q4.csv')

# 2016, one file per quarter
Q1_2016 = _read_loan_stats('../data/LoanStats_2016Q1.csv')
Q2_2016 = _read_loan_stats('../data/LoanStats_2016Q2.csv')
Q3_2016 = _read_loan_stats('../data/LoanStats_2016Q3.csv')
Q4_2016 = _read_loan_stats('../data/LoanStats_2016Q4.csv')

# Earlier years
Year_2015 = _read_loan_stats('../data/LoanStats3d.csv')
Year_2014 = _read_loan_stats('../data/LoanStats3c.csv')
Year_2012_2013 = _read_loan_stats('../data/LoanStats3b.csv')


In [43]:
# Concatenate all the data
# Currently uses all years
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it every
# source file keeps its own 0-based labels, so row labels repeat across files and
# label-based operations (.loc, .drop(index=...)) become ambiguous.
data_df = pd.concat(
    [Q1_2017, Q2_2017, Q3_2017, Q4_2017,
     Q1_2016, Q2_2016, Q3_2016, Q4_2016,
     Year_2015, Year_2014, Year_2012_2013],
    ignore_index=True,
)


In [52]:
# Just 2016 and 2017
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each quarterly file's own 0-based row labels.
data_2016_2017 = pd.concat(
    [Q1_2017, Q2_2017, Q3_2017, Q4_2017,
     Q1_2016, Q2_2016, Q3_2016, Q4_2016],
    ignore_index=True,
)


**Summary of edits to the cleaning function**
* Remove the following columns, since they would not be known at the time of making the loan:
    * `hardship_flag`
    * `hardship_reason`

* Recode NaNs as zero where it makes sense:
    * `inq_last_6mths`
    * `inq_last_12m`
    * `open_acc_6m`
    * `open_act_il`
    * `open_il_12m`
    * `open_il_24m`
    * `open_rv_12m`
    * `open_rv_24m`
    * `inq_fi`
    * `pct_tl_nvr_dlq`
    * `num_accts_ever_120_pd`
    * `num_actv_bc_tl`
    * `num_actv_rev_tl`
    * `num_bc_sats` 
    * `num_bc_tl`
    * `num_il_tl` 
    * `num_op_rev_tl`
    * `num_rev_accts`
    * `num_rev_tl_bal_gt_0`
    * `num_sats`
    * `num_tl_30dpd`
    * `num_tl_90g_dpd_24m`
    * `num_tl_op_past_12m`
    * `acc_open_past_24mths`

* Remove the following columns, as they refer to the current status of the loan (again these would not be known at the time of making the loan) **to check with team**:
    * `all_util`
    * `max_bal_bc`
    * `total_bal_il`
    * `total_cu_tl`
    * `avg_cur_bal`
    * `tot_hi_cred_lim`
    * `total_bal_ex_mort`
    * `total_bc_limit`
    * `total_il_high_credit_limit`
    * `tot_coll_amt`
    * `tot_cur_bal`
    * `total_rev_hi_lim`

* Fill missing values with data from joint application column:
    * `mort_acc` filled from `sec_app_mort_acc` or 0 if that column is also NaN

* Recode the other months-since columns as 0/1 indicators (1 = a value is present, 0 = NaN):
    * `mo_sin_old_rev_tl_op`
    * `mo_sin_old_il_acct`
    * `mo_sin_rcnt_rev_tl_op`
    * `mo_sin_rcnt_tl`

* Took out function to drop rows - this can be done manually afterwards
* Kept `purpose` and dropped `title`:
    * On closer inspection, `purpose` actually seems cleaner - my bad!


<font color = "red">
Not sure about some of these (e.g. `pct_tl_nvr_dlq`?)
</font>


In [44]:
# Edit GG: see above
def _percent_to_float(series):
    """Convert values such as '15.99%' to floats (15.99); missing values come out as NaN."""
    return series.astype(str).str.strip("%").astype(float)


def clean_up(df):
    """Return a cleaned copy of raw Lending Club loan data.

    The input frame is left untouched, so the cell calling this function can be
    re-run safely. Steps, in order:

    1. Keep only rows whose ``loan_status`` is 'Fully Paid' or 'Charged Off'.
    2. Drop the columns listed in ``cols_to_drop``.
    3. Parse ``issue_d`` and ``earliest_cr_line`` as datetimes.
    4. Replace NaN with 0 in the columns listed in ``fill_zero_cols``.
    5. Convert ``int_rate`` / ``revol_util`` from percent strings to floats, then
       fill missing ``revol_util``, ``dti`` and ``mort_acc`` from the corresponding
       joint-application column (already zero-filled in step 4).
    6. Recode ``term``, ``initial_list_status``, ``debt_settlement_flag`` and
       ``application_type`` to 0/1.
    7. Turn every column in ``mths_since_cols`` into a 0/1 indicator
       (1 = a value was present, 0 = NaN).
    8. Map ``emp_length`` to a number of years (missing -> 0).

    Rows that still contain NaN are not dropped here; do that afterwards if needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw loan data. Must contain every column named in the lists below;
        a missing column raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    """
    # Explicitly list the columns by cleaning action
    cols_to_drop = ['id', 'member_id', 'url', 'desc',
                    'debt_settlement_flag_date', 'settlement_status', 'settlement_amount', 'settlement_percentage', 'settlement_term', 'settlement_date',
                    'hardship_flag', 'hardship_reason', 'hardship_type', 'hardship_status', 'deferral_term', 'hardship_amount', 'hardship_start_date', 'hardship_end_date', 'payment_plan_start_date', 'hardship_length', 'hardship_dpd', 'hardship_loan_status', 'orig_projected_additional_accrued_interest', 'hardship_payoff_balance_amount', 'hardship_last_payment_amount',
                    'policy_code', 'il_util', 'num_tl_120dpd_2m', 'bc_util', 'percent_bc_gt_75', 'bc_open_to_buy',
                    'emp_title', 'verification_status_joint', 'pymnt_plan', 'disbursement_method',
                    'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d', 'sec_app_earliest_cr_line',
                    'all_util', 'max_bal_bc', 'total_bal_il', 'total_cu_tl', 'avg_cur_bal',
                    'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bc_limit', 'total_il_high_credit_limit', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim',
                    'title']
    date_cols_to_keep = ['issue_d', 'earliest_cr_line']
    fill_zero_cols = ["sec_app_revol_util", "sec_app_collections_12_mths_ex_med", "sec_app_chargeoff_within_12_mths", "sec_app_num_rev_accts", "sec_app_open_act_il", "sec_app_open_acc", "sec_app_mort_acc", "sec_app_inq_last_6mths", "revol_bal_joint", "annual_inc_joint", "dti_joint",
                      'inq_last_6mths', 'inq_last_12m', 'open_acc_6m', 'open_act_il', 'open_il_12m', 'open_il_24m', 'open_rv_12m', 'open_rv_24m', 'inq_fi', 'pct_tl_nvr_dlq',
                      'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl', 'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_30dpd', 'num_tl_90g_dpd_24m', 'num_tl_op_past_12m', 'acc_open_past_24mths']
    # Each column must appear exactly once: recoding an already-recoded column
    # would find no NaN left and turn the whole indicator into a constant 1.
    mths_since_cols = ["mths_since_last_record", "mths_since_recent_bc_dlq", "mths_since_last_major_derog", "mths_since_recent_revol_delinq", "mths_since_last_delinq", "mths_since_recent_inq",
                       "mo_sin_old_il_acct", "mths_since_rcnt_il", "mths_since_recent_bc", "sec_app_mths_since_last_major_derog",
                       "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op", "mo_sin_rcnt_tl"]

    # Dictionary for recoding employment length
    emp_dict = {"< 1 year": 0.5,  # Choose 0.5 instead of 0 to distinguish from NaN
                "1 year": 1,
                "2 years": 2,
                "3 years": 3,
                "4 years": 4,
                "5 years": 5,
                "6 years": 6,
                "7 years": 7,
                "8 years": 8,
                "9 years": 9,
                "10+ years": 10,
                "n/a": 0}

    # Keep only 'Fully Paid' / 'Charged Off' rows and remove the unwanted columns.
    # Both .loc[...] and .drop(...) return new objects, so the caller's frame is not modified.
    is_finished = df['loan_status'].isin(['Fully Paid', 'Charged Off'])
    df = df.loc[is_finished].drop(columns=cols_to_drop)

    # Convert date columns to datetime
    for col in date_cols_to_keep:
        df[col] = pd.to_datetime(df[col])

    # Fill missing values for selected columns
    df[fill_zero_cols] = df[fill_zero_cols].fillna(0)

    # Strip out % signs and change to floats
    df["int_rate"] = _percent_to_float(df["int_rate"])
    revol_util = _percent_to_float(df["revol_util"])

    # Fall back to the joint-application value where the primary value is missing.
    # Results are assigned back explicitly: calling fillna(..., inplace=True) on a
    # selected column does not reliably write through to the frame.
    df["revol_util"] = revol_util.fillna(df["sec_app_revol_util"])
    df["dti"] = df["dti"].fillna(df["dti_joint"])
    df["mort_acc"] = df["mort_acc"].fillna(df["sec_app_mort_acc"]).fillna(0)

    # Recode binary variables
    df["term"] = df["term"].replace({" 36 months": 0, " 60 months": 1})
    df["initial_list_status"] = df["initial_list_status"].replace({"f": 0, "w": 1})
    df["debt_settlement_flag"] = df["debt_settlement_flag"].replace({"N": 0, "Y": 1})
    df["application_type"] = df["application_type"].replace({"Individual": 0, "Joint App": 1})

    # Months-since columns become presence indicators: 1 if a value exists, 0 if NaN
    df[mths_since_cols] = df[mths_since_cols].notna().astype("int64")

    # Change emp_length to numeric
    df["emp_length"] = df["emp_length"].replace(emp_dict).fillna(0)

    return df

In [45]:
# Run the cleaning function on the concatenated all-years frame.
df = clean_up(data_df)

In [46]:
# Show the cleaned frame's (rows, columns) shape, then preview its first rows.
display(df.shape)
df.head()

(1070492, 93)

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,purpose,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,collections_12_mths_ex_med,mths_since_last_major_derog,application_type,annual_inc_joint,dti_joint,acc_now_delinq,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,open_rv_12m,open_rv_24m,inq_fi,inq_last_12m,acc_open_past_24mths,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,pub_rec_bankruptcies,tax_liens,revol_bal_joint,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,debt_settlement_flag
15,14000.0,14000.0,14000.0,1,15.99,340.38,C,C5,10.0,RENT,43000.0,Source Verified,2017-03-01,Charged Off,debt_consolidation,367xx,AL,21.8,1.0,1995-10-01,0.0,1,0,3.0,0.0,18537.0,99.1,8.0,1,0.0,0.0,4804.1,4804.1,1460.6,1577.95,0.0,1765.55,317.799,340.38,0.0,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,1,0.0,1,0,0,0,0.0,1.0,1.0,1.0,2.0,5.0,1.0,3.0,1.0,3.0,0.0,0.0,0.0,87.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
20,5000.0,5000.0,5000.0,0,14.99,173.31,C,C4,10.0,RENT,68000.0,Not Verified,2017-03-01,Fully Paid,debt_consolidation,945xx,CA,22.5,0.0,2003-04-01,0.0,1,0,6.0,0.0,10276.0,90.1,18.0,0,0.0,0.0,5168.906377,5168.91,5000.0,168.91,0.0,0.0,0.0,5003.93,0.0,0,0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,1,0.0,0.0,1.0,0.0,2.0,0.0,0.0,1,1,1,1,0.0,1,0,1,0,0.0,4.0,4.0,4.0,6.0,8.0,4.0,9.0,4.0,6.0,0.0,0.0,0.0,94.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
21,10150.0,10150.0,10150.0,0,7.24,314.52,A,A3,8.0,MORTGAGE,50000.0,Not Verified,2017-03-01,Fully Paid,debt_consolidation,773xx,TX,29.6,0.0,2002-06-01,1.0,0,0,9.0,0.0,21845.0,56.0,21.0,1,0.0,0.0,10941.319984,10941.32,10150.0,791.32,0.0,0.0,0.0,6231.69,0.0,0,0,0.0,0.0,0.0,1.0,3.0,1.0,1.0,1,0.0,0.0,1.0,2.0,2.0,0.0,0.0,1,1,1,1,3.0,1,0,1,0,0.0,3.0,4.0,3.0,5.0,8.0,5.0,10.0,4.0,9.0,0.0,0.0,2.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
22,8400.0,8400.0,8400.0,0,11.39,276.56,B,B3,8.0,MORTGAGE,50000.0,Source Verified,2017-03-01,Charged Off,other,454xx,OH,15.63,0.0,2005-04-01,0.0,0,0,14.0,0.0,12831.0,30.3,30.0,1,0.0,0.0,1925.29,1925.29,1417.67,507.62,0.0,0.0,0.0,276.56,0.0,0,0,0.0,0.0,0.0,3.0,2.0,1.0,2.0,1,4.0,8.0,4.0,7.0,10.0,0.0,0.0,1,1,1,1,4.0,1,0,1,0,0.0,4.0,5.0,7.0,11.0,9.0,11.0,16.0,5.0,14.0,0.0,0.0,5.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
26,10000.0,10000.0,10000.0,0,12.74,335.69,C,C1,10.0,OWN,40000.0,Not Verified,2017-03-01,Fully Paid,debt_consolidation,324xx,FL,8.85,0.0,1997-03-01,0.0,0,0,7.0,0.0,9227.0,55.9,15.0,0,0.0,0.0,10970.713713,10970.71,10000.0,970.71,0.0,0.0,0.0,7963.66,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.0,3.0,0.0,1.0,3.0,0.0,0.0,1,1,1,1,2.0,1,0,1,0,0.0,2.0,4.0,2.0,3.0,2.0,7.0,11.0,4.0,7.0,0.0,0.0,1.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0


In [54]:
# Save the cleaned all-years frame as CSV; index=False leaves the row index out of the file.
df.to_csv("../data/data_cleaned_df.csv", index = False)

In [48]:
# show all predictors that still have at least one NaN
def show_NaN(df):
    """Print one (column name, NaN count) tuple per column of ``df`` that has missing values.

    Columns without any NaN are skipped, so nothing is printed for a complete frame.
    Returns None.
    """
    nan_counts = df.isna().sum()
    with_missing = nan_counts[nan_counts != 0]
    for name_and_count in zip(with_missing.index, with_missing.values):
        print(name_and_count)

In [49]:
# Print every column of the cleaned frame that still contains NaN (prints nothing if there are none).
show_NaN(df)

In [59]:
# Number of columns that contain at least one missing value.
df.isnull().any().sum()

0

In [53]:
# Clean just 2016/2017: run the same cleaning function on the 2016-2017 subset.
df_2016_2017 = clean_up(data_2016_2017)

In [55]:
# Save the cleaned 2016-2017 frame as CSV; index=False leaves the row index out of the file.
df_2016_2017.to_csv("../data/data_cleaned_df_2016_2017.csv", index = False)

In [60]:
# Number of columns in the 2016-2017 frame that contain at least one missing value.
df_2016_2017.isnull().any().sum()

0

<font color = "red">
 Got to this point - if we remove all of the rows with NaN, it doesn't seem to be much of the data!

 **Roland's original code below this point...**
</font>

In [None]:
# Display the rows where num_rev_accts is missing.
df[df['num_rev_accts'].isna()]

In [None]:
# remove some rows with NaN
# Select the rows by condition rather than by hard-coded index labels: the labels
# depend on how the data was loaded and raise KeyError when they are not present.
# NOTE(review): this replaces df.drop(index=[407707, 1049426, 1065252]); those labels
# were presumably the rows with a missing num_rev_accts - confirm with the author.
df = df.loc[df['num_rev_accts'].notna()]

In [None]:
# Fill title with values from purpose, where we can


In [None]:
# Print the columns that still contain NaN at this point.
show_NaN(df)

<font color=red>Ask team how to best treat above NaN predictors.</font>

<div class='exercise'><b> Part 2: Reducing number of predictors </b></div>


In [None]:
def make_scatterplot(x, y):
    """Draw a scatter plot of ``y`` against ``x`` and show it.

    Parameters
    ----------
    x, y : pandas.Series
        Values to plot; each Series' ``name`` is used as the axis label.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(x, y, alpha=0.8)
    ax.set_title('Compare two variables', fontsize=16)
    ax.set_xlabel(x.name, fontsize=14)
    ax.set_ylabel(y.name, fontsize=14)
    # No legend: the single series carries no label, so a legend call would have
    # nothing to show and only triggers matplotlib's "no labelled artists" message.
    plt.show()

In [None]:
# When comparing predictors we can see that some of them are highly correlated, those should be removed from the data.
# make_scatterplot() shows the figure itself and returns None, so it is called directly;
# wrapping it in display() would only add a stray "None" to the cell output.
make_scatterplot(df['loan_amnt'], df['funded_amnt']) # same for funded_amnt_inv
make_scatterplot(df['total_pymnt'], df['total_pymnt_inv'])

Based on the above analysis we can probably also remove the following columns:

In [None]:
# Columns to remove following the scatterplot comparison above.
cols_to_remove = ['funded_amnt', 'funded_amnt_inv', 'total_pymnt_inv']

In [None]:
# Drop the columns listed in cols_to_remove and show the resulting shape.
# NOTE: df is rebound here, so running this cell a second time raises KeyError
# because the columns are already gone.
df = df.drop(columns=cols_to_remove)
df.shape

<font color=red>`sns.pairplot()` runs forever with 100 predictors. The loop below compares one variable to all the others, but it doesn't finish due to NaN in the data. What other ways are there to do this more efficiently?</font>

In [None]:
#for col in sample_df:
#    display(make_scatterplot(sample_df['loan_status'], sample_df[col]))

<div class='exercise'><b> Part 3: Exploring the data </b></div>


In [None]:
# Summarize columns, dtypes and non-null counts; memory_usage='deep' also measures
# the actual memory held by object (string) columns.
df.info(memory_usage='deep')

The original df is so big that we want to sample maybe 10% of it, while making sure we keep the proportions of "loan_status".

In [None]:
# Count the rows per loan_status value and show the counts as a one-column frame.
gr = df["loan_status"].value_counts().to_frame()
gr

Let's define ratios for splitting sample to make sure we get a similar distribution of fully paid and charged off observations.<br>
<font color=red>Not sure if there's an easier way to do this but this works.</font>

In [None]:
# Share of each loan_status class in the full frame. Looked up by label so the
# result does not depend on which class happens to be the most frequent.
status_share = df["loan_status"].value_counts(normalize=True)
fp = status_share["Fully Paid"]
co = status_share["Charged Off"]

# select 10% of the original df
fractions = {'Fully Paid': fp/10, 'Charged Off': co/10}
# Draw each class's quota from that class's own rows (dff), not from the whole
# frame: this keeps the sample stratified and prevents a row from being drawn
# twice. The fixed random_state makes the sample reproducible across runs.
sample_df = pd.concat(
    dff.sample(n=int(fractions.get(status) * len(df)), random_state=42)
    for status, dff in df.groupby('loan_status')
)
grp = sample_df["loan_status"].value_counts().to_frame()
grp

In [None]:
# Summary statistics of the sampled frame, transposed so that each column becomes a row.
sample_df.describe().T

<font color=red>We may be able to use PCA or forward/backward selection to reduce the number of columns but for that we would have to fit models. The function "step_forwards_backwards" below fails due to NaN in the data.</font>

In [None]:
# Hold out 20% of the sample for testing, stratified on loan_status; fixed seed for reproducibility.
data_train, data_test = train_test_split(
    sample_df,
    test_size=.2,
    stratify=sample_df['loan_status'],
    random_state=42,
)

In [None]:
# Predictors: every column except the response, with a constant column added.
# NOTE(review): step_forwards_backwards prepends its own column of ones, so passing
# these frames to it would presumably duplicate the intercept - confirm before use.
X_train = sm.add_constant(data_train.drop(columns='loan_status'))
X_test = sm.add_constant(data_test.drop(columns='loan_status'))

# Response as an (n, 1) column vector.
# NOTE(review): loan_status still appears to hold text labels at this point; OLS
# presumably needs a numeric response - confirm.
y_train = data_train['loan_status'].values[:, np.newaxis]
y_test = data_test['loan_status'].values[:, np.newaxis]

Using the HW3 solution code for forward selection of predictors to narrow down the many predictors we currently have. This uses OLS and BIC to decide which predictors to keep. However, for this to work, there can't be any NaN in the data.

In [None]:
def step_forwards_backwards(df, y, direction='forward'):
    """Greedy stepwise predictor selection using the BIC of OLS fits.

    Every candidate model is an OLS regression of ``y`` on a column of ones plus a
    subset of the columns of ``df``.

    * ``'forward'``: start with no predictors; at each step add the remaining
      predictor whose inclusion gives the lowest BIC, until all are included.
    * ``'backward'``: start with all predictors; at each step remove the predictor
      whose removal gives the lowest BIC, until none are left.

    The predictor set recorded after each step is kept together with its BIC, and
    the set with the lowest BIC over all steps is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate predictors, one per column.
    y : array-like
        Response passed to OLS as ``endog``.
    direction : {'forward', 'backward'}
        Search direction; anything else fails the assertion.

    Returns
    -------
    list
        Names of the predictors in the best-scoring recorded model.
    """
    assert direction in ['forward', 'backward']

    def bic_of(design):
        # BIC of an OLS fit of y on the given design matrix
        return OLS(endog=y, exog=design).fit().bic

    def as_column(name):
        # One predictor as an (n, 1) array
        return df[name].values.reshape(-1, 1)

    all_predictors = set(df.columns)
    n_obs = df.shape[0]
    intercept = np.ones(n_obs).reshape(-1, 1)

    bic_per_step = []
    model_per_step = []

    if direction == 'forward':
        chosen = set()
        # Design matrix of the current model: intercept plus the chosen predictors
        design = np.concatenate([intercept, df[list(chosen)].values], axis=1)

        candidates = list(chosen ^ all_predictors)
        while len(candidates) > 0:
            scores = [bic_of(np.concatenate([design, as_column(name)], axis=1))
                      for name in candidates]

            winner = candidates[np.argmin(scores)]
            bic_per_step.append(np.min(scores))

            chosen.add(winner)
            design = np.concatenate([design, as_column(winner)], axis=1)
            model_per_step.append(list(chosen))

            candidates = list(chosen ^ all_predictors)

    else:
        chosen = set(all_predictors)

        candidates = list(chosen)
        while len(candidates) > 0:
            scores = []
            for name in candidates:
                # Model with this one predictor left out
                reduced = np.concatenate(
                    [intercept, df[list(chosen - set([name]))].values], axis=1)
                scores.append(bic_of(reduced))

            drop_ix = np.argmin(scores)
            chosen.discard(candidates[drop_ix])

            bic_per_step.append(scores[drop_ix])
            model_per_step.append(list(chosen))

            candidates = list(chosen)

    return model_per_step[np.argmin(bic_per_step)]

In [None]:
#predictors_forward = step_forwards_backwards(X_train, y_train, direction='forward')