In [1]:
# Attach Google Drive to the Colab filesystem at the mount point below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [2]:
import os

# Project folder on the mounted Drive for the day-3 feature-engineering work.
path = (
    "/content/drive/MyDrive/data-science-30-day-portfolio"
    "/week1_finance_foundations/day03_feature_engineering"
)

# Create the folder together with any missing parents; an already-existing
# folder is not treated as an error.
os.makedirs(name=path, exist_ok=True)

# Echo the folder as the cell's output.
path


'/content/drive/MyDrive/data-science-30-day-portfolio/week1_finance_foundations/day03_feature_engineering'

In [9]:
import pandas as pd
import kagglehub
import os

# Download the latest version of the dataset.
# The result is stored under its own name so that `path` (the Drive output
# folder set by the earlier cell) is not overwritten by the download location.
dataset_dir = kagglehub.dataset_download("yasserh/loan-default-dataset")

# Full path to the CSV file inside the downloaded directory.
df_path = os.path.join(dataset_dir, 'Loan_Default.csv')

# Load the dataset into a pandas DataFrame.
df = pd.read_csv(df_path)

print("Path to dataset files:", dataset_dir)
print("DataFrame loaded successfully. First 5 rows:")

# Leave the preview as the cell's last expression so the notebook renders it
# as a table instead of wrapped plain text.
df.head()

Using Colab cache for faster access to the 'loan-default-dataset' dataset.
Path to dataset files: /kaggle/input/loan-default-dataset
DataFrame loaded successfully. First 5 rows:
      ID  year loan_limit             Gender approv_in_adv loan_type  \
0  24890  2019         cf  Sex Not Available         nopre     type1   
1  24891  2019         cf               Male         nopre     type2   
2  24892  2019         cf               Male           pre     type1   
3  24893  2019         cf               Male         nopre     type1   
4  24894  2019         cf              Joint           pre     type1   

  loan_purpose Credit_Worthiness open_credit business_or_commercial  ...  \
0           p1                l1        nopc                  nob/c  ...   
1           p1                l1        nopc                    b/c  ...   
2           p1                l1        nopc                  nob/c  ...   
3           p4                l1        nopc                  nob/c  ...   
4        

In [11]:
# Pull the `Status` column out of the loaded frame as its own Series; the
# next cell removes the same column from the feature frame.
y = df.loc[:, 'Status']

In [13]:
# Feature frame: every column of `df` except `Status`.
X = df.drop(columns=['Status'])

# Number of missing values in each feature column.
X.isna().sum()

Unnamed: 0,0
ID,0
year,0
loan_limit,3344
Gender,0
approv_in_adv,908
loan_type,0
loan_purpose,134
Credit_Worthiness,0
open_credit,0
business_or_commercial,0


In [16]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Split the feature columns by dtype: numeric columns get median imputation,
# object (string) columns get most-frequent imputation followed by one-hot
# encoding.
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(include='object').columns

num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# NOTE(review): both imputers and the encoder are fitted on every row of X.
# Once a train/test split exists, fit them on the training rows only to avoid
# leaking test-set statistics.
#
# The imputed values go into new frames instead of being written back into X,
# so X keeps the raw values and this cell can be re-run without side effects.
# X's index is carried through so the result stays label-aligned with X and y.
X_num_imputed = pd.DataFrame(
    num_imputer.fit_transform(X[num_cols]),
    columns=num_cols,
    index=X.index,
)
X_cat_imputed = pd.DataFrame(
    cat_imputer.fit_transform(X[cat_cols]),
    columns=cat_cols,
    index=X.index,
)

# Dense one-hot encoding; categories unseen at fit time are encoded as all
# zeros at transform time rather than raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_cat = encoder.fit_transform(X_cat_imputed)

encoded_cat_df = pd.DataFrame(
    encoded_cat,
    columns=encoder.get_feature_names_out(cat_cols),
    index=X.index,
)

# NOTE(review): `ID` looks like a row identifier and `year` looks constant in
# the preview; confirm whether they should be dropped before modelling.
X_final = pd.concat([X_num_imputed, encoded_cat_df], axis=1)

# Sanity checks on the transformed frame.
assert len(X_final) == len(X), "row count changed during preprocessing"
assert X_final.index.equals(X.index), "feature index no longer matches X"
assert not X_final.isna().any().any(), "missing values remain after imputation"

X_final.head()

Unnamed: 0,ID,year,loan_amount,rate_of_interest,Interest_rate_spread,Upfront_charges,term,property_value,income,Credit_Score,...,age_<25,age_>74,submission_of_application_not_inst,submission_of_application_to_inst,Region_North,Region_North-East,Region_central,Region_south,Security_Type_Indriect,Security_Type_direct
0,24890.0,2019.0,116500.0,3.99,0.3904,2596.45,360.0,118000.0,1740.0,758.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,24891.0,2019.0,206500.0,3.99,0.3904,2596.45,360.0,418000.0,4980.0,552.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
2,24892.0,2019.0,406500.0,4.56,0.2,595.0,360.0,508000.0,9480.0,834.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
3,24893.0,2019.0,456500.0,4.25,0.681,2596.45,360.0,658000.0,11880.0,587.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,24894.0,2019.0,696500.0,4.0,0.3042,0.0,360.0,758000.0,10440.0,602.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


## Feature Engineering Notes

### Decisions Made
- Missing numerical values were filled using the median; missing categorical values were filled using the most frequent category.
- Categorical variables were encoded using One-Hot Encoding.
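- Feature scaling with StandardScaler is intended, but no scaling step is applied in the cells shown above (TODO: add the step or remove this note).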

### Assumptions
- Median imputation does not distort financial distributions.
- One-hot encoding captures categorical impact adequately.

### Limitations
- Feature interactions are not explicitly modeled.
- Encoding increases dimensionality.

### Future Improvements
- Use pipelines to avoid data leakage.
- Explore feature interaction terms.
