In [1]:
import pandas as pd

In [2]:
# Load the sample loan records from the project-relative CSV and render the whole table.
# NOTE(review): the rendered table shows an `Unnamed: 0` column, which looks like a row
# index that was saved into the CSV; as loaded it is picked up as an ordinary numeric
# column by the later numeric steps - confirm whether `index_col=0` is wanted here.
loan_data = pd.read_csv(filepath_or_buffer='datasets/loan_small.csv')
display(loan_data)


Unnamed: 0,Loan_ID,Gender,ApplicantIncome,CoapplicantIncome,LoanAmount,Area,Loan_Status
0,LP001002,,5849.0,0.0,,urban,Y
1,LP001003,Male,4583.0,,128.0,semi,N
2,LP001005,Male,3000.0,0.0,66.0,,Y
3,LP001006,Female,2583.0,2358.0,120.0,semi,
4,LP001008,Male,,0.0,141.0,urban,Y
5,LP001011,Male,5417.0,4196.0,267.0,semi,Y
6,LP001013,Male,2333.0,1516.0,,rural,Y
7,LP001014,Female,3036.0,2504.0,158.0,semi,N
8,LP001018,Male,4006.0,1526.0,168.0,rural,Y
9,LP001020,Male,12841.0,10968.0,349.0,semi,N


In [3]:
# Count the rows that are an exact copy (every column equal) of an earlier row
loan_data.duplicated(keep='first').sum()

np.int64(0)

In [5]:
# Show the duplicated rows themselves, if there are any.
# Only the repeated occurrences are returned, not the rest of the dataset.
loan_data[loan_data.duplicated()]

Unnamed: 0,Loan_ID,Gender,ApplicantIncome,CoapplicantIncome,LoanAmount,Area,Loan_Status


In [6]:
# Drop exact duplicate rows, keeping the first occurrence of each (keep='last' keeps the last).
# The result is reassigned rather than using inplace=True, so this cell never mutates a
# frame object that another name may still refer to.
loan_data = loan_data.drop_duplicates(keep='first')

In [None]:
# First quartile (25th percentile) of every numeric column
Q1 = loan_data.select_dtypes(include='number').quantile(q=0.25)

print('First_Quartile :\n', Q1)

In [None]:
# Third quartile (75th percentile) of every numeric column
Q3 = loan_data.select_dtypes(include='number').quantile(q=0.75)

print('Third_Quartile :\n', Q3)

The 25th (Q1) and 75th (Q3) percentiles, i.e. the first and third quartiles, are obtained, and the interquartile range (IQR) is computed as Q3 - Q1.

In [None]:
# Interquartile range per numeric column: the distance from Q1 up to Q3
IQR = Q3.sub(Q1)

print('Inter Quartile Range :\n', IQR)

Outliers are detected using the IQR method, i.e. an outlier is a point that falls more than 1.5 times the interquartile range above the third quartile or below the first quartile.


In [None]:
# Select the rows in which at least one numeric value lies outside the IQR fences,
# i.e. below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
numeric_values = loan_data.select_dtypes(include='number')
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier_row = ((numeric_values < lower_fence) | (numeric_values > upper_fence)).any(axis=1)
loan_data_outliers = loan_data[is_outlier_row]

loan_data_outliers

In [None]:
# Shape before the outlier rows are filtered out, for comparison with the shape printed after removal
print(f"Initial shape: {loan_data.shape}")

The final dataframe is obtained after removing the outlier rows

In [None]:
# Keep a row only when none of its numeric values falls outside the IQR fences.
# The comparisons are negated rather than flipped to >= / <= so that missing values,
# for which every comparison is False, are still kept.
loan_data = loan_data[
    (
        ~(loan_data.select_dtypes('number') < (Q1 - 1.5 * IQR))
        &
        ~(loan_data.select_dtypes('number') > (Q3 + 1.5 * IQR))
    ).all(axis = 1)
]

print(f"Shape after removing outliers: {loan_data.shape}")

In [None]:
loan_data.isna().sum()  # Number of missing values in each column

For numbers - either remove the records with null values or substitute them with the mean of the column. Since it's a small dataset, we can go ahead with substituting the mean value of the column.

In [None]:
# Replace missing numeric values with the mean of their own column.
# fillna returns a new frame, so the result must be assigned back or nothing is imputed.
# Passing the Series of per-column means fills only the columns named in it; the
# non-numeric columns are left untouched.
loan_data = loan_data.fillna(loan_data.select_dtypes('number').mean())

# Show the numeric columns after imputation
loan_data.select_dtypes('number')

Loan_ID is not of any importance, so we drop it

In [54]:
# Remove the Loan_ID column from the frame
loan_data = loan_data.drop(columns=['Loan_ID'])

For categorical columns<br>

Option 1- Convert to dummy variables. `pandas.get_dummies()` is a function in the Pandas library that converts categorical data (variables with a fixed number of distinct categories) into a format that can be provided to machine learning models. This process is called one-hot encoding. It takes a column with categorical values (e.g., ["Male", "Female"]) and creates separate columns for each unique category (e.g., Male, Female), with binary values (1/True or 0/False) indicating the presence of that category in the data.<br>

Option 2 - Use the most frequently occurring value to fill, i.e. mode()<br>

Here we will go ahead with option 1<br>


In [None]:
# One-hot encode into a throwaway frame for inspection; loan_data itself is not reassigned here
for_temporary_observation = pd.get_dummies(data=loan_data)
print(for_temporary_observation)

If you're using one-hot encoding in regression or other linear models, you can set drop_first=True to avoid multicollinearity (avoiding redundancy)

In [None]:
# Same throwaway encoding, this time asking get_dummies to drop the first level of each encoded column
for_temporary_observation = pd.get_dummies(data=loan_data, drop_first=True)
print(for_temporary_observation)

As you notice, even though we have used `drop_first=True` to avoid redundancy, both the Gender_Male and Gender_Female columns still exist (only one should be present). The same applies to Loan_Status and Area.

`pd.get_dummies` may not have correctly recognized it as a categorical column. Instead, it could have treated it as a set of independent binary columns (Gender_Female and Gender_Male), preventing one from being dropped.

In [57]:
# Cast each label column to the pandas 'category' dtype, one column at a time
for column_name in ('Gender', 'Loan_Status', 'Area'):
    loan_data[column_name] = loan_data[column_name].astype('category')

In [None]:
# Replace loan_data with its one-hot encoded form, dropping the first level of each encoded column
loan_data = pd.get_dummies(data=loan_data, drop_first=True)
print(loan_data)

In [60]:
# Drop every row that still contains a missing value.
# Reassigned rather than using inplace=True, so the cell does not mutate a frame
# object that another name may still refer to.
loan_data = loan_data.dropna()

In [None]:
display(loan_data) # Render the frame as a rich table after encoding and dropping missing rows

In [None]:
# Summary statistics of the cleaned frame
loan_data.describe()