In [1]:
import pandas as pd
import numpy as np

In [2]:
# read imputed data
train = pd.read_csv('../../data/processed/imputed_data/train_imputed.csv')
test = pd.read_csv('../../data/processed/imputed_data/test_imputed.csv')

# save the length of the train data so the combined frame can later be
# split back into train / test by position
train_len = len(train)

# concatenate train and test data (train rows first, then test rows).
# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating each file's own row labels, so later label-based alignment is safe.
data = pd.concat([train, test], sort=False, ignore_index=True)

# drop the 'Unnamed: 0' column
# NOTE(review): presumably the row index written out when the imputed CSVs
# were saved - confirm against the imputation notebook
data = data.drop(columns = ['Unnamed: 0'])

In [3]:
# list the column names of the combined train + test frame
data.columns

Index(['RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents', 'SeriousDlqin2yrs'],
      dtype='object')

In [4]:
# Report the cardinality of every column, one "name count" pair per line.
for column_name, distinct_count in data.nunique().items():
    print(column_name, distinct_count)

RevolvingUtilizationOfUnsecuredLines 125728
age 89
NumberOfTime30-59DaysPastDueNotWorse 16
DebtRatio 114194
MonthlyIncome 147820
NumberOfOpenCreditLinesAndLoans 58
NumberOfTimes90DaysLate 19
NumberRealEstateLoansOrLines 28
NumberOfTime60-89DaysPastDueNotWorse 13
NumberOfDependents 3834
SeriousDlqin2yrs 2


## Feature descriptions

Each feature in the dataset is described below:

1. **RevolvingUtilizationOfUnsecuredLines**: This feature represents the ratio of the total amount of credit used by the borrower to the total credit limit across all unsecured lines of credit, such as credit cards. It indicates how much of the available credit a borrower is utilizing.

2. **age**: This feature denotes the age of the borrower. Age can be a significant factor in assessing credit risk, as younger individuals might have less stable financial situations or credit history compared to older individuals.

3. **NumberOfTime30-59DaysPastDueNotWorse**: This feature counts the number of times a borrower has been 30-59 days past due on any credit account, but no worse (i.e., the delinquency did not extend beyond 59 days).

4. **DebtRatio**: The debt ratio is calculated by dividing the total monthly debt payments by the borrower's total monthly income. It provides insight into how much of a borrower's income is being used to repay debts, which is crucial for assessing their financial health.

5. **MonthlyIncome**: This feature represents the monthly income of the borrower. Monthly income is a fundamental factor in determining a borrower's ability to repay debts.

6. **NumberOfOpenCreditLinesAndLoans**: It counts the total number of open credit lines and loans that a borrower has, including credit cards, installment loans, and mortgages. This can reflect the borrower's credit utilization and their ability to manage multiple credit accounts.

7. **NumberOfTimes90DaysLate**: Similar to the 'NumberOfTime30-59DaysPastDueNotWorse', this feature counts the number of times a borrower has been 90 days or more past due on any credit account.

8. **NumberRealEstateLoansOrLines**: This feature represents the number of real estate loans or lines of credit that the borrower has. Real estate loans include mortgages and home equity lines of credit.

9. **NumberOfTime60-89DaysPastDueNotWorse**: Similar to the 'NumberOfTime30-59DaysPastDueNotWorse' and 'NumberOfTimes90DaysLate', this feature counts the number of times a borrower has been 60-89 days past due on any credit account.

10. **NumberOfDependents**: This feature indicates the number of dependents that the borrower financially supports. It can be a factor in assessing the borrower's financial stability and their ability to manage additional financial responsibilities.

In [5]:
# Income per dependent: monthly income divided by (number of dependents + 1).
# The +1 keeps the denominator non-zero for borrowers with no dependents
# (presumably it also counts the borrower themselves).
dependents_plus_one = data['NumberOfDependents'].add(1)
data['income_dependent'] = data['MonthlyIncome'].div(dependents_plus_one)

In [6]:
# Delinquency ratio: total number of past-due events relative to open credit lines.
# DelinquencyRatio = (NumberOfTime30-59DaysPastDueNotWorse + NumberOfTimes90DaysLate + NumberOfTime60-89DaysPastDueNotWorse) / (NumberOfOpenCreditLinesAndLoans + 1)
# The +1 in the denominator avoids dividing by zero when a borrower has no open lines.
past_due_total = (
    data['NumberOfTime30-59DaysPastDueNotWorse']
    .add(data['NumberOfTimes90DaysLate'])
    .add(data['NumberOfTime60-89DaysPastDueNotWorse'])
)
open_lines_plus_one = data['NumberOfOpenCreditLinesAndLoans'].add(1)
data['DelinquencyRatio'] = past_due_total.div(open_lines_plus_one)

In [7]:
# Undo the earlier concatenation by position: the first `train_len` rows are
# the training set, everything after them is the test set.
train, test = data.iloc[:train_len], data.iloc[train_len:]

# Write both engineered splits to disk without the index column.
engineered_outputs = (
    (train, '../../data/processed/engineered_features_data/train_imputed_engineered.csv'),
    (test, '../../data/processed/engineered_features_data/test_imputed_engineered.csv'),
)
for split_frame, csv_path in engineered_outputs:
    split_frame.to_csv(csv_path, index=False)

In [8]:
from sklearn.preprocessing import PolynomialFeatures

In [9]:
# Columns that get expanded into degree-2 polynomial terms
# (the originals, their squares and their pairwise products).
features_to_poly = [
    'RevolvingUtilizationOfUnsecuredLines',
    'age',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'DebtRatio',
    'MonthlyIncome',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents',
]

# Degree-2 expansion without the constant (bias) column.
poly = PolynomialFeatures(degree=2, include_bias=False)

# Everything that is not expanded is carried over unchanged.
data_non_poly = data.drop(columns=features_to_poly)

# Expand the selected columns and label the result with the generated term names.
poly_values = poly.fit_transform(data[features_to_poly].copy())
poly_term_names = poly.get_feature_names_out(features_to_poly)
data_poly = pd.DataFrame(poly_values, columns=poly_term_names)

# Append the pass-through columns by position (raw values, not index labels):
# data_poly has a fresh 0..n-1 index, so rows are matched purely by order.
data_poly[data_non_poly.columns] = data_non_poly.values

In [10]:
# preview the first rows of the polynomial-expanded frame
data_poly.head()

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,...,NumberOfTimes90DaysLate NumberOfDependents,NumberRealEstateLoansOrLines^2,NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse,NumberRealEstateLoansOrLines NumberOfDependents,NumberOfTime60-89DaysPastDueNotWorse^2,NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents,NumberOfDependents^2,SeriousDlqin2yrs,income_dependent,DelinquencyRatio
0,0.034949,59.0,0.0,0.004933,7959.688894,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7959.688894,0.0
1,0.155308,47.0,0.0,881.0,1468.106859,6.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1468.106859,0.0
2,0.165166,62.0,1.0,0.020327,2851.722407,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2851.722407,0.111111
3,0.010886,61.0,0.0,0.642979,1115.657341,6.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1115.657341,0.0
4,0.000717,49.0,0.0,3603.0,1674.378775,15.0,0.0,3.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,1674.378775,0.0


In [11]:
# Split the polynomial-expanded frame by position: the first `train_len` rows
# are the training set, the remaining rows are the test set.
train, test = data_poly.iloc[:train_len], data_poly.iloc[train_len:]

# Write both polynomial-feature splits to disk without the index column.
poly_outputs = (
    (train, '../../data/processed/engineered_features_data/train_imputed_engineered_poly.csv'),
    (test, '../../data/processed/engineered_features_data/test_imputed_engineered_poly.csv'),
)
for split_frame, csv_path in poly_outputs:
    split_frame.to_csv(csv_path, index=False)

In [12]:
# count missing values per column of the expanded frame
# NOTE(review): the NaNs reported for SeriousDlqin2yrs presumably belong to the
# test rows, which carry no target value - confirm before modelling
data_poly.isna().sum()

RevolvingUtilizationOfUnsecuredLines                           0
age                                                            0
NumberOfTime30-59DaysPastDueNotWorse                           0
DebtRatio                                                      0
MonthlyIncome                                                  0
                                                           ...  
NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents        0
NumberOfDependents^2                                           0
SeriousDlqin2yrs                                           37500
income_dependent                                               0
DelinquencyRatio                                               0
Length: 68, dtype: int64