In [233]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

01. Read the attached CSV file, which contains customer data for a bank’s loan approvals; the target is whether the loan was approved or not.
Columns:
    ID: Unique identifier,
    Age: Age of the applicant,
    Income: Monthly income in INR,
    Credit_Score: Creditworthiness on a scale of 300 to 900,
    Loan_Approved: 1 for approved, 0 for not approved (imbalance: 70% not approved)

In [235]:
# Load the loan-approval dataset from its CSV file and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs on a machine where this exact file exists.
csv_path = "C:/Users/Arjun/06_JN/loan_approved_data.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,ID,Age,Income,Credit_Score,Loan_Approved
0,1,25,30000,650,0
1,2,32,50000,700,0
2,3,45,80000,750,1
3,4,28,35000,600,0
4,5,50,120000,800,1


02. Draw a random sample of 10 rows from the dataset, with replacement.

In [237]:
# Draw 10 rows with replacement, so the same row may be picked more than once.
# random_state pins the draw so that Restart & Run All reproduces the same
# sample (42 matches the seed used by the resamplers later in this notebook).
Random_sampling_with_replacement = df.sample(n=10, replace=True, random_state=42)
print("Random sampling with replacement:\n", Random_sampling_with_replacement)

Random sampling with replacement:
     ID  Age  Income  Credit_Score  Loan_Approved
16  17   30   45000           680              0
16  17   30   45000           680              0
3    4   28   35000           600              0
7    8   38   65000           680              0
10  11   27   32000           620              0
14  15   26   31000           640              0
10  11   27   32000           620              0
3    4   28   35000           600              0
1    2   32   50000           700              0
7    8   38   65000           680              0


03. Draw a random sample of 5 rows from the dataset, without replacement.

In [239]:
# Draw 5 distinct rows (replace=False means no row can appear twice).
# random_state pins the draw so that Restart & Run All reproduces the same
# sample (42 matches the seed used by the resamplers later in this notebook).
Random_sampling_without_replacement = df.sample(n=5, replace=False, random_state=42)
print("Random sampling without replacement:\n", Random_sampling_without_replacement)

Random sampling without replacement:
     ID  Age  Income  Credit_Score  Loan_Approved
16  17   30   45000           680              0
0    1   25   30000           650              0
8    9   48   90000           780              1
17  18   36   62000           700              0
10  11   27   32000           620              0


04. Use systematic sampling to select every 4th row of the dataset.

In [241]:
# Systematic sampling: start at the first row and keep every 4th row after it
# (positions 0, 4, 8, ...).
step = 4
systematic_sample = df.iloc[::step]
print("Systematic sampling:\n", systematic_sample)

Systematic sampling:
     ID  Age  Income  Credit_Score  Loan_Approved
0    1   25   30000           650              0
4    5   50  120000           800              1
8    9   48   90000           780              1
12  13   35   60000           710              0
16  17   30   45000           680              0


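05. Use clustered sampling on the following CIBIL (credit score) bands, drawing 2 samples from each band:
(a). High CIBIL score (750 and above)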

In [244]:
# Keep only the high band (Credit_Score >= 750), then draw 2 distinct rows from it.
high_cibil = df[df['Credit_Score'] >= 750]
# random_state pins the draw so a fresh kernel reproduces the same two rows
# (42 matches the seed used by the resamplers later in this notebook).
clustered_sample_high = high_cibil.sample(n=2, replace=False, random_state=42)
print("Clustered sampling (High CIBIL):\n", clustered_sample_high)

Clustered sampling (High CIBIL):
    ID  Age  Income  Credit_Score  Loan_Approved
4   5   50  120000           800              1
2   3   45   80000           750              1


(b). Medium CIBIL score (650 to 749)

In [246]:
# Keep only the medium band (650 <= Credit_Score < 750), then draw 2 distinct rows.
medium_cibil = df[(df['Credit_Score'] >= 650) & (df['Credit_Score'] < 750)]
# random_state pins the draw so a fresh kernel reproduces the same two rows
# (42 matches the seed used by the resamplers later in this notebook).
clustered_sample_medium = medium_cibil.sample(n=2, replace=False, random_state=42)
print("Clustered sampling (Medium CIBIL):\n", clustered_sample_medium)

Clustered sampling (Medium CIBIL):
    ID  Age  Income  Credit_Score  Loan_Approved
0   1   25   30000           650              0
6   7   33   55000           720              0


(c). Low CIBIL score (less than 650)

In [248]:
# Keep only the low band (Credit_Score < 650), then draw 2 distinct rows from it.
low_cibil = df[df['Credit_Score'] < 650]
# random_state pins the draw so a fresh kernel reproduces the same two rows
# (42 matches the seed used by the resamplers later in this notebook).
clustered_sample_low = low_cibil.sample(n=2, replace=False, random_state=42)
print("Clustered sampling (Low CIBIL):\n", clustered_sample_low)

Clustered sampling (Low CIBIL):
     ID  Age  Income  Credit_Score  Loan_Approved
14  15   26   31000           640              0
10  11   27   32000           620              0


06. From the dataset, it is clear that not-approved vs. approved loans follow a 70:30 ratio (14 vs. 6 rows), which is an imbalance.
Solve the imbalance using:

In [250]:
# Handling imbalance: split the frame into the inputs and the label that the
# resamplers below operate on.
target_col = 'Loan_Approved'
# X keeps every column except the target (the ID column is still included).
X = df.drop(columns=[target_col])
# y is the 0/1 approval label.
y = df[target_col]

(a). Random oversampling

In [252]:
# Random oversampling with a fixed seed; the class counts printed below show
# whether the two classes ended up balanced.
ros = RandomOverSampler(random_state=42)
X_resampled_ros, y_resampled_ros = ros.fit_resample(X, y)

# Put the resampled features and label back side by side in one frame.
df_resampled_ros = pd.concat((X_resampled_ros, y_resampled_ros), axis="columns")

ros_class_counts = df_resampled_ros.loc[:, 'Loan_Approved'].value_counts()
print("Random oversampling:\n", ros_class_counts)

Random oversampling:
 Loan_Approved
0    14
1    14
Name: count, dtype: int64


(b). Random undersampling

In [254]:
# Random undersampling with a fixed seed; the class counts printed below show
# whether the two classes ended up balanced.
rus = RandomUnderSampler(random_state=42)
X_resampled_rus, y_resampled_rus = rus.fit_resample(X, y)

# Put the resampled features and label back side by side in one frame.
df_resampled_rus = pd.concat((X_resampled_rus, y_resampled_rus), axis="columns")

rus_class_counts = df_resampled_rus.loc[:, 'Loan_Approved'].value_counts()
print("Random undersampling:\n", rus_class_counts)

Random undersampling:
 Loan_Approved
0    6
1    6
Name: count, dtype: int64


(c). SMOTE

In [256]:
# SMOTE creates synthetic minority rows by interpolating between existing
# feature vectors. The ID column is a row identifier, not a feature: left in,
# it would enter the neighbour distances and come out as meaningless
# interpolated IDs. It is therefore removed before resampling, so the
# resulting frame has no ID column.
# errors='ignore' keeps this cell working if ID was already removed from X.
smote = SMOTE(random_state=42)
X_resampled_smote, y_resampled_smote = smote.fit_resample(
    X.drop(columns=['ID'], errors='ignore'), y
)
df_resampled_smote = pd.concat([X_resampled_smote, y_resampled_smote], axis=1)
print("SMOTE:\n", df_resampled_smote['Loan_Approved'].value_counts())

SMOTE:
 Loan_Approved
0    14
1    14
Name: count, dtype: int64
