# Feature Engineering

In [11]:
import pandas as pd
# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.options.display.max_columns = None

In [12]:
# Load the interim credit-score dataset and preview its first rows.
interim_csv = '../data/interim/credit-score.csv'
df = pd.read_csv(interim_csv)
df.head()

Unnamed: 0,ID,Customer_ID,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Monthly_Balance,Credit_Score,Credit_History_Age_Formated
0,0x1602,CUS_0xd40,January,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,7.0,11.27,4.0,Good,809.98,26.82262,No,49.574949,80.415295,312.494089,Good,265.0
1,0x1603,CUS_0xd40,February,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,"Auto Loan, Credit-Builder Loan, Personal Loan,...",-1,6.0,11.27,4.0,Good,809.98,31.94496,No,49.574949,118.280222,284.629162,Good,266.0
2,0x1604,CUS_0xd40,March,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,7.0,10.27,4.0,Good,809.98,28.609352,No,49.574949,81.699521,331.209863,Good,267.0
3,0x1605,CUS_0xd40,April,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,"Auto Loan, Credit-Builder Loan, Personal Loan,...",5,4.0,6.27,4.0,Good,809.98,31.377862,No,49.574949,199.458074,223.45131,Good,268.0
4,0x1606,CUS_0xd40,May,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,"Auto Loan, Credit-Builder Loan, Personal Loan,...",6,6.0,11.27,4.0,Good,809.98,24.797347,No,49.574949,41.420153,341.489231,Good,269.0


I want to read the column `Type_of_Loan` and separate out the individual loans so I can build dummy variables. First I collect the distinct loan types in a list; then, for each loan type, I check whether the customer has that loan: if they do, the corresponding loan column gets a value of 1, otherwise it stays 0.

In [13]:
# Collect every distinct loan type, in order of first appearance.
# `Type_of_Loan` is a ", "-separated string; "and " is removed before splitting
# (presumably it prefixes the last item of each list), so each element is a
# bare loan name.
loans_per_row = (
    df['Type_of_Loan']
    .str.replace('and ', '', regex=False)
    .str.split(', ')
)
# explode() gives one entry per loan and unique() keeps first-appearance order,
# so the list matches what a row-by-row scan would build. dropna() skips rows
# with a missing `Type_of_Loan` instead of failing on them.
loan_types = loans_per_row.explode().dropna().unique().tolist()

In [14]:
# Create one 0/1 indicator column per loan type; spaces in the loan name become
# underscores in the column name.
# The loan strings are parsed once, outside the loop, rather than once per
# (loan type, row) pair.
loans_per_row = (
    df['Type_of_Loan']
    .str.replace('and ', '', regex=False)
    .str.split(', ')
)
for loan in loan_types:
    # Exact membership in the row's loan list (not a substring match).
    # Rows with a missing `Type_of_Loan` are not lists and get 0.
    has_loan = loans_per_row.map(
        lambda loans, loan=loan: isinstance(loans, list) and loan in loans
    )
    df[loan.replace(' ', '_')] = has_loan.astype('int64')

In [15]:
# Preview the loan indicator columns. They are selected by name (derived from
# `loan_types`) rather than by a hard-coded column count, so the preview stays
# correct if the set of loan types changes.
loan_columns = [loan.replace(' ', '_') for loan in loan_types]
df[loan_columns].head()

Unnamed: 0,Auto_Loan,Credit-Builder_Loan,Personal_Loan,Home_Equity_Loan,Not_Specified,Mortgage_Loan,Student_Loan,Debt_Consolidation_Loan,Payday_Loan
0,1,1,1,1,0,0,0,0,0
1,1,1,1,1,0,0,0,0,0
2,1,1,1,1,0,0,0,0,0
3,1,1,1,1,0,0,0,0,0
4,1,1,1,1,0,0,0,0,0


## Replacing values

Instead of making dummies out of the months, I convert them to their numerical counterpart.

In [16]:
# Month name -> calendar month number. All twelve months are listed so the
# same mapping also works on data that extends past August.
month = {
    'January': 1,
    'February': 2,
    'March': 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    'December': 12
}
# Assign the result back to the column. Calling `replace(..., inplace=True)` on
# `df.Month` acts on a selected Series and does not update `df` when pandas
# Copy-on-Write is active, which would leave month names in place and make the
# integer cast fail.
df['Month'] = df['Month'].replace(month).astype(int)

`Credit_Mix` and `Credit_Score` are ordinal categories, so I'll replace their categories with -1, 0, and 1, from worst to best (Bad/Poor → -1, Standard → 0, Good → 1).

In [17]:
# Ordinal encoding of `Credit_Mix`: Bad < Standard < Good.
credit_mix = {
    'Standard': 0,
    'Good': 1,
    'Bad': -1
}
# Assign the result back to the column. An inplace `replace` on `df.Credit_Mix`
# acts on a selected Series and does not update `df` when pandas Copy-on-Write
# is active, which would make the integer cast below fail.
df['Credit_Mix'] = df['Credit_Mix'].replace(credit_mix).astype(int)

In [18]:
# Ordinal encoding of `Credit_Score`: Poor < Standard < Good.
credit_score = {
    'Standard': 0,
    'Poor': -1,
    'Good': 1
}
# Assign the result back to the column. An inplace `replace` on
# `df.Credit_Score` acts on a selected Series and does not update `df` when
# pandas Copy-on-Write is active, which would make the integer cast below fail.
df['Credit_Score'] = df['Credit_Score'].replace(credit_score).astype(int)

I'm creating a new feature, `Missed_Payment_Day`, to indicate whether the payment was made after its due date, ignoring how many days late it was.

In [19]:
# 1 when the payment was made after its due date (`Delay_from_due_date` > 0),
# otherwise 0. Zero, negative and missing delays all compare as False and so
# map to 0. The comparison runs on the whole column at once rather than row by row.
df['Missed_Payment_Day'] = (df['Delay_from_due_date'] > 0).astype('int64')

Just save the dataset so we can start with the new features in the next notebook!

In [20]:
# Write the engineered dataset out for the next notebook; the DataFrame index is not saved.
processed_csv = '../data/processed/credit-score.csv'
df.to_csv(processed_csv, index=False)