# Instructor Do: Dealing with Categorical Data in ML

In [1]:
# initial imports
import pandas as pd
from path import Path
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder


## Dataset Information

The file `loans_data.csv` contains simulated data about loans; there are 500 records in total. Each row represents a loan application made during an arbitrary year, and each column holds the following information about that application.

* `amount`: The loan amount in USD.
* `term`: The loan term in months.
* `month`: The month of the year when the loan was requested.
* `age`: Age of the loan applicant.
* `education`: Educational level of the loan applicant.
* `gender`: Gender of the loan applicant.
* `bad`: Stands for a bad or good loan applicant (`1` - bad, `0` - good).

In [2]:
# Read the loans CSV (path is relative to this notebook) into a DataFrame
# and preview the first rows.
file_path = Path("../Resources/loans_data.csv")
loans_df = pd.read_csv(filepath_or_buffer=file_path)
loans_df.head()

Unnamed: 0,amount,term,month,age,education,gender,bad
0,1000,30,June,45,High School or Below,male,0
1,1000,30,July,50,Bachelor,female,0
2,1000,30,August,33,Bachelor,female,0
3,1000,15,September,27,college,male,0
4,1000,30,October,28,college,female,0


In [3]:
# Inspect column dtypes and non-null counts. The object-dtype columns
# (month, education, gender) are the ones that still need a numeric encoding.
loans_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   amount     500 non-null    int64 
 1   term       500 non-null    int64 
 2   month      500 non-null    object
 3   age        500 non-null    int64 
 4   education  500 non-null    object
 5   gender     500 non-null    object
 6   bad        500 non-null    int64 
dtypes: int64(4), object(3)
memory usage: 27.5+ KB


In [4]:
# One-hot encode a single categorical column with pandas.
# "gender" is replaced by one indicator column per category
# (gender_female, gender_male), so scikit-learn receives numeric input.
loans_binary_encoded = loans_df.pipe(pd.get_dummies, columns=["gender"])
loans_binary_encoded.head()

Unnamed: 0,amount,term,month,age,education,bad,gender_female,gender_male
0,1000,30,June,45,High School or Below,0,0,1
1,1000,30,July,50,Bachelor,0,1,0
2,1000,30,August,33,Bachelor,0,1,0
3,1000,15,September,27,college,0,0,1
4,1000,30,October,28,college,0,1,0


In [5]:
# Check the schema after encoding: the original gender column is gone and
# the indicator columns gender_female / gender_male have been appended.
loans_binary_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   amount         500 non-null    int64 
 1   term           500 non-null    int64 
 2   month          500 non-null    object
 3   age            500 non-null    int64 
 4   education      500 non-null    object
 5   bad            500 non-null    int64 
 6   gender_female  500 non-null    uint8 
 7   gender_male    500 non-null    uint8 
dtypes: int64(4), object(2), uint8(2)
memory usage: 24.5+ KB


In [6]:
# One-hot encode several categorical columns in a single call; every listed
# column is replaced by one indicator column per category it contains.
loans_binary_encoded = pd.get_dummies(
    data=loans_df,
    columns=["education", "gender"],
)
loans_binary_encoded.head()

Unnamed: 0,amount,term,month,age,bad,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,June,45,0,0,1,0,0,0,1
1,1000,30,July,50,0,1,0,0,0,1,0
2,1000,30,August,33,0,1,0,0,0,1,0
3,1000,15,September,27,0,0,0,0,1,0,1
4,1000,30,October,28,0,0,0,0,1,1,0


In [12]:
# Label-encode the text columns on a copy so loans_df itself stays untouched.
df2 = loans_df.copy()

# A single encoder is refit for each column in turn, so once the loop ends
# `le` only remembers the classes of the last column ("gender").
le = LabelEncoder()
for column in ("education", "gender"):
    df2[column] = le.fit_transform(df2[column])

df2.head()

Unnamed: 0,amount,term,month,age,education,gender,bad
0,1000,30,June,45,1,1,0
1,1000,30,July,50,0,0,0
2,1000,30,August,33,0,0,0
3,1000,15,September,27,3,1,0
4,1000,30,October,28,3,0,0


# Custom Encoding

In [13]:
# Integer-encode the month names with a LabelEncoder.
# The resulting codes do not follow calendar order (in the preview below
# June -> 6, July -> 5, August -> 1), which is why a custom encoding follows.
label_encoder = LabelEncoder()
label_encoder.fit(loans_df["month"])
loans_df["month_le"] = label_encoder.transform(loans_df["month"])
loans_df.head()

Unnamed: 0,amount,term,month,age,education,gender,bad,month_le
0,1000,30,June,45,High School or Below,male,0,6
1,1000,30,July,50,Bachelor,female,0,5
2,1000,30,August,33,Bachelor,female,0,1
3,1000,15,September,27,college,male,0,11
4,1000,30,October,28,college,female,0,10


In [14]:
# Calendar-ordered lookup table: month name -> month number (1 to 12)
months_num = {
    month_name: month_number
    for month_number, month_name in enumerate(
        [
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ],
        start=1,
    )
}



In [15]:
# Encode month names as calendar month numbers using the months_num dictionary.
# Series.map with a dict is the vectorized pandas lookup (no per-row lambda).
# Unlike direct dict indexing it yields NaN for names missing from the
# dictionary, so unmapped values are checked explicitly: the cell still fails
# with a KeyError, but now reports every offending value at once.
month_numbers = loans_df["month"].map(months_num)
unmapped_rows = month_numbers.isna()
if unmapped_rows.any():
    unknown_months = sorted(loans_df.loc[unmapped_rows, "month"].astype(str).unique())
    raise KeyError(f"Month names missing from months_num: {unknown_months}")

# Every value was mapped, so the column can safely be stored as integers.
loans_df["month_num"] = month_numbers.astype("int64")
loans_df.head()



Unnamed: 0,amount,term,month,age,education,gender,bad,month_le,month_num
0,1000,30,June,45,High School or Below,male,0,6,6
1,1000,30,July,50,Bachelor,female,0,5,7
2,1000,30,August,33,Bachelor,female,0,1,8
3,1000,15,September,27,college,male,0,11,9
4,1000,30,October,28,college,female,0,10,10


In [16]:
# Keep only the dictionary-based month encoding: remove the original month
# names and the LabelEncoder-derived month_le column.
loans_df = loans_df.drop(columns=["month", "month_le"])
loans_df.head()

Unnamed: 0,amount,term,age,education,gender,bad,month_num
0,1000,30,45,High School or Below,male,0,6
1,1000,30,50,Bachelor,female,0,7
2,1000,30,33,Bachelor,female,0,8
3,1000,15,27,college,male,0,9
4,1000,30,28,college,female,0,10


In [17]:
# Integer-encode the remaining text columns so every feature is numeric.
# Each column gets its own encoder created in this cell, which
#   * removes the dependency on an encoder object defined in an earlier cell
#     (the cell now runs on its own after the imports and data preparation), and
#   * keeps both fitted mappings available, e.g. for
#     education_encoder.inverse_transform(...), instead of overwriting the
#     education mapping when the gender column is fitted.
# NOTE: the integer codes carry no meaningful order (in the preview below
# Bachelor -> 0, High School or Below -> 1, college -> 3).
education_encoder = LabelEncoder()
gender_encoder = LabelEncoder()

loans_df["education"] = education_encoder.fit_transform(loans_df["education"])
loans_df["gender"] = gender_encoder.fit_transform(loans_df["gender"])
loans_df.head()

Unnamed: 0,amount,term,age,education,gender,bad,month_num
0,1000,30,45,1,1,0,6
1,1000,30,50,0,0,0,7
2,1000,30,33,0,0,0,8
3,1000,15,27,3,1,0,9
4,1000,30,28,3,0,0,10


In [18]:
# Final check: every remaining column should now have a numeric dtype,
# with no object-dtype (text) columns left.
loans_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   amount     500 non-null    int64
 1   term       500 non-null    int64
 2   age        500 non-null    int64
 3   education  500 non-null    int64
 4   gender     500 non-null    int64
 5   bad        500 non-null    int64
 6   month_num  500 non-null    int64
dtypes: int64(7)
memory usage: 27.5 KB
