In [1]:
# Step 1 : Install Pandas
!pip install pandas



In [2]:
# import required libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization
%matplotlib inline
# import plotly.express as px
# import plotly.graph_objects as go
# import plotly.io as pio
# import itertools



In [3]:
from sklearn.model_selection import train_test_split , GridSearchCV, KFold, cross_val_score, RepeatedStratifiedKFold
from sklearn.preprocessing import MinMaxScaler , StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay # Import ConfusionMatrixDisplay
from sklearn import metrics

In [4]:
import os

# Print the full path of every file found under the Kaggle input directory.
kaggle_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in kaggle_file_paths:
    print(file_path)

In [5]:
import warnings

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas FutureWarnings and scikit-learn
# convergence warnings raised by later cells; consider a narrower filter.
warnings.filterwarnings('ignore')

**Data Understanding**

In [6]:
# Step 3 : import data
# The bare relative path fails with FileNotFoundError when the CSV is not in
# the working directory, so look there first and then anywhere under the
# Kaggle input folder, and fail with an explicit message if it is in neither.
from pathlib import Path

DATA_FILENAME = 'loan_data_2007_2014.csv'
KAGGLE_INPUT_DIR = Path('/kaggle/input')

csv_candidates = [Path(DATA_FILENAME)]
if KAGGLE_INPUT_DIR.is_dir():
    csv_candidates.extend(sorted(KAGGLE_INPUT_DIR.rglob(DATA_FILENAME)))

csv_path = next((candidate for candidate in csv_candidates if candidate.is_file()), None)
if csv_path is None:
    raise FileNotFoundError(
        f"{DATA_FILENAME!r} was not found in the working directory or under {KAGGLE_INPUT_DIR}"
    )

data_df = pd.read_csv(csv_path)

FileNotFoundError: [Errno 2] No such file or directory: 'loan_data_2007_2014.csv'

In [None]:
# display first 5 rows
data_df.head()

In [None]:
data_df.tail()

In [None]:
# display info
data_df.info()

In [None]:
# check which variables still contain null values (null count per column)
data_df.isnull().sum()

**Exploratory Data Analysis**

In [None]:
data_df.shape

In [None]:
data_df.head()

In [None]:
col_names = data_df.columns
col_names

In [None]:
# find categorical variables
# A column counts as categorical when it is stored with the object ('O') dtype.
categorical = [name for name, dtype in data_df.dtypes.items() if dtype == 'O']

print(f'There are {len(categorical)} categorical variables\n')

print('The categorical variables are :', categorical)

In [None]:
# view the categorical variables

data_df[categorical].head()


*   ada informasi variabel "term" yang menjelaskan tentang jangka waktu pelunasan peminjaman
*   ada 9 variabel categorical yang diantaranya term, grade, sub_grade, emp_length, home_ownership, verification_status, loan_status, purpose, last_credit_pull_d
*   terdapat 3 variabel categorical binary: home_ownership, verification_status, loan_status
*   loan_status adalah variabel target/dependen





Explore problems within categorical variables



*   missing values in categorical variables



In [None]:
# check missing values in categorical variables

data_df[categorical].isnull().sum()


In [None]:
# print categorical variables containing missing values
# Collect the categorical columns that have at least one null entry.
cat1 = []
for column_name in categorical:
    if data_df[column_name].isnull().any():
        cat1.append(column_name)

print(data_df[cat1].isnull().sum())

In [None]:
# view frequency of categorical variables
# One absolute-count table per categorical column.
for column_name in categorical:
    label_counts = data_df[column_name].value_counts()
    print(label_counts)

In [None]:
# view frequency distribution of categorical variables
# Counts are divided by the total number of rows, so missing values stay in the denominator.
total_rows = float(len(data_df))

for column_name in categorical:
    relative_frequency = data_df[column_name].value_counts() / total_rows
    print(relative_frequency)

In [None]:
# check for cardinality in categorical variables
# Missing values are counted as a label of their own (dropna=False).
for column_name in categorical:
    label_count = data_df[column_name].nunique(dropna=False)
    print(column_name, ' contains ', label_count, ' labels')

In [None]:
# find numerical variables
# Every column that is not stored with the object ('O') dtype is treated as numerical.
numerical = [name for name, dtype in data_df.dtypes.items() if dtype != 'O']

print(f'There are {len(numerical)} numerical variables\n')

print('The numerical variables are :', numerical)

In [None]:
# print numerical variables containing missing values
# Collect the numerical columns that have at least one null entry
# (the result reuses the name `cat1`, as in the categorical check).
cat1 = []
for column_name in numerical:
    if data_df[column_name].isnull().any():
        cat1.append(column_name)

print(data_df[cat1].isnull().sum())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram (30 bins, KDE overlay) of the loan amount column of data_df.
fig, ax = plt.subplots()
sns.histplot(data_df['loan_amnt'], bins=30, kde=True, ax=ax)
ax.set_title('Distribusi Jumlah Pinjaman')
ax.set_xlabel('Jumlah Pinjaman')
ax.set_ylabel('Frekuensi')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Plot a 20% random sample (fixed seed) to reduce overplotting.
data_df_sampled = data_df.sample(frac=0.2, random_state=42)

fig, ax = plt.subplots(figsize=(10, 6))

# Small, semi-transparent blue markers keep overlapping points readable.
sns.scatterplot(
    data=data_df_sampled,
    x='loan_amnt',
    y='int_rate',
    alpha=0.5,
    s=30,
    color='blue',
    ax=ax,
)

# Add only the fitted trend line (scatter=False) on the same axes.
sns.regplot(
    data=data_df_sampled,
    x='loan_amnt',
    y='int_rate',
    scatter=False,
    color='red',
    line_kws={'linewidth': 2},
    ax=ax,
)

# Title and axis labels.
ax.set_title('Hubungan antara Jumlah Pinjaman dan Tingkat Bunga')
ax.set_xlabel('Jumlah Pinjaman')
ax.set_ylabel('Tingkat Bunga (%)')

plt.show()


In [None]:
# Box plot of the loan amount for each loan status.
ax = sns.boxplot(data=data_df, x='loan_status', y='loan_amnt')
ax.set(
    title='Distribusi Jumlah Pinjaman berdasarkan Status Pinjaman',
    xlabel='Status Pinjaman',
    ylabel='Jumlah Pinjaman',
)
plt.show()

In [None]:
# Horizontal bars: mean interest rate for each loan purpose.
ax = sns.barplot(data=data_df, x='int_rate', y='purpose', estimator=np.mean)
ax.set(
    title='Rata-rata Tingkat Bunga per Tujuan Pinjaman',
    xlabel='Rata-rata Tingkat Bunga (%)',
    ylabel='Tujuan Pinjaman',
)
plt.show()

In [None]:
# Pairwise relationship grid for four numeric variables.
# NOTE(review): this runs on the full frame; sample first if it is too slow.
sns.pairplot(data_df[['loan_amnt', 'int_rate', 'annual_inc', 'dti']])
# y=1.02 places the figure title slightly above the grid.
plt.suptitle('Pair Plot Variabel Utama', y=1.02)
plt.show()


In [None]:
# Histogram (30 bins, KDE overlay) of the debt-to-income ratio.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data_df['dti'], bins=30, kde=True, ax=ax)
ax.set(
    title='Distribution of Debt-to-Income Ratio (DTI)',
    xlabel='Debt-to-Income Ratio',
    ylabel='Frequency',
)
ax.grid()
plt.show()

In [None]:
# Borrower counts per loan purpose, most frequent purpose first.
purpose_order = data_df['purpose'].value_counts().index

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=data_df, y='purpose', order=purpose_order, ax=ax)
ax.set(
    title='Number of Borrowers by Loan Purpose',
    xlabel='Number of Borrowers',
    ylabel='Loan Purpose',
)
ax.grid()
plt.show()

In [None]:
# Borrower counts per verification status.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=data_df, x='verification_status', ax=ax)
ax.set(
    title='Number of Borrowers by Verification Status',
    xlabel='Verification Status',
    ylabel='Number of Borrowers',
)
ax.grid()
plt.show()

Data Preparation

In [None]:
data_df = data_df.dropna(axis=1, how='all')

In [None]:
data_df.info()

In [None]:
# Drop the columns that are not used further on.
# A single call is atomic: if any name is missing the KeyError is raised before
# anything is removed, instead of leaving data_df half-modified as a chain of
# separate in-place drops would.
columns_to_drop = [
    'application_type',
    'zip_code',
    'desc',
    'title',
    'pymnt_plan',
    'member_id',
    'id',
    'Unnamed: 0',
    'url',
]
data_df = data_df.drop(columns=columns_to_drop)

In [None]:
data_df.info()

FEATURE ENGINEERING

In [None]:
# Show the dtype of all five date-like columns in one table; written as five
# separate bare expressions, only the last one would be rendered by the notebook.
data_df[['issue_d', 'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d', 'earliest_cr_line']].dtypes

In [None]:
# Convert the date-like columns to datetime.
# Values that do not match the '%b-%y' format become NaT (errors='coerce').
# NOTE(review): '%y' is a two-digit year, so old years may be parsed into the
# wrong century - confirm before using the year part of these columns.
for date_column in ('issue_d', 'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d', 'earliest_cr_line'):
    data_df[date_column] = pd.to_datetime(data_df[date_column], errors='coerce', format='%b-%y')


In [None]:
# Extract the month number of every parsed date column into '<column>_month'.
for date_column in ('issue_d', 'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d', 'earliest_cr_line'):
    data_df[date_column + '_month'] = data_df[date_column].dt.month


In [None]:
# List of the derived month columns to display
columns_to_display = ['issue_d_month', 'last_pymnt_d_month', 'next_pymnt_d_month', 'last_credit_pull_d_month', 'earliest_cr_line_month']

In [None]:
# Keep only the columns listed in columns_to_display
selected_data = data_df[columns_to_display]

# Show the result
print(selected_data)

In [None]:
data_df.info()

Labelling Variable Target

In [None]:
data_df['loan_status'].value_counts()

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Build an example DataFrame with one row per loan status and a hard-coded count
# (presumably copied from the value_counts() output of data_df - verify).
# It is independent of data_df and serves only as an illustration.
data = {
    'loan_status': [
        'Current',
        'Fully Paid',
        'Charged Off',
        'Late (31-120 days)',
        'In Grace Period',
        'Does not meet the credit policy. Status:Fully Paid',
        'Late (16-30 days)',
        'Default',
        'Does not meet the credit policy. Status:Charged Off'
    ],
    'count': [224226, 184739, 42475, 6900, 3146, 1988, 1218, 832, 761]
}

df = pd.DataFrame(data)

In [None]:
df

In [None]:
data_df['loan_status'].tail()

In [None]:
# Collapse the raw loan statuses into the binary target labels 'good' / 'bad'.
status_mapping = {
    'Current': 'good',
    'Fully Paid': 'good',
    'Charged Off': 'bad',
    'Late (31-120 days)': 'bad',
    'In Grace Period': 'bad',
    'Does not meet the credit policy. Status:Fully Paid': 'good',
    'Late (16-30 days)': 'bad',
    'Default': 'bad',
    'Does not meet the credit policy. Status:Charged Off': 'bad'
}

# Apply the mapping to data_df (not to the example frame df).
# Values that are already 'good' / 'bad' are kept as they are, so re-running the
# cell does not turn the whole column into NaN as a plain `.map()` would.
# Statuses missing from the mapping still become NaN.
target_labels = set(status_mapping.values())
raw_status = data_df['loan_status']
data_df['loan_status'] = raw_status.where(
    raw_status.isin(target_labels),
    raw_status.map(status_mapping),
)

# Show the result
print(data_df['loan_status'])

In [None]:
data_df['loan_status'].head()

In [None]:
data_df

Feature Engineering

ordinal encoder

In [None]:
data_df['term'].value_counts()

In [None]:
data_df['grade'].value_counts()


In [None]:
data_df['sub_grade'].value_counts()


In [None]:
data_df['emp_length'].value_counts()


In [None]:
data_df['verification_status'].value_counts()

In [None]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Build an example DataFrame to illustrate ordinal encoding.
data = {
    'term': ['36 months', '60 months', '36 months', '60 months', '36 months',
             '60 months', '36 months', '60 months', '36 months', '60 months',
             '36 months', '60 months', '36 months', '60 months', '36 months',
             '60 months', '36 months', '60 months', '36 months', '60 months',
             '36 months', '60 months', '36 months', '60 months', '36 months',
             '60 months', '36 months', '60 months', '36 months', '60 months',
             '36 months', '60 months', '36 months', '60 months', '36 months'],
    'grade': ['B', 'C', 'D', 'A', 'E', 'F', 'G', 'B', 'C', 'D',
              'A', 'E', 'F', 'G', 'B', 'C', 'D', 'A', 'E', 'F',
              'G', 'B', 'C', 'D', 'A', 'E', 'F', 'G', 'B', 'C',
              'D', 'A', 'E', 'F', 'G'],
    'sub_grade': ['B3', 'B4', 'C1', 'C2', 'B2', 'C3', 'B5', 'B3', 'B4', 'C1',
                  'C2', 'B2', 'C3', 'B5', 'B3', 'B4', 'C1', 'C2', 'B2', 'C3',
                  'B5', 'B3', 'B4', 'C1', 'C2', 'B2', 'C3', 'B5', 'B3', 'B4',
                  'C1', 'C2', 'B2', 'C3', 'B5'],
    'emp_length': ['10+ years', '2 years', '3 years', '< 1 year', '5 years',
                   '1 year', '4 years', '10+ years', '2 years', '3 years',
                   '< 1 year', '10+ years', '2 years', '3 years', '< 1 year',
                   '5 years', '1 year', '4 years', '10+ years', '2 years',
                   '3 years', '< 1 year', '10+ years', '2 years', '3 years',
                   '< 1 year', '5 years', '1 year', '4 years', '10+ years',
                   '2 years', '3 years', '< 1 year', '5 years', '1 year'],
    'verification_status': ['Verified', 'Not Verified', 'Verified', 'Verified',
                            'Not Verified', 'Verified', 'Verified', 'Not Verified',
                            'Verified', 'Verified', 'Not Verified', 'Verified',
                            'Verified', 'Not Verified', 'Verified', 'Verified',
                            'Not Verified', 'Verified', 'Verified', 'Not Verified',
                            'Verified', 'Verified', 'Not Verified', 'Verified',
                            'Verified', 'Not Verified', 'Verified', 'Verified',
                            'Not Verified', 'Verified', 'Verified', 'Not Verified',
                            'Verified', 'Verified', 'Not Verified']
}

df = pd.DataFrame(data)

# Show the DataFrame before encoding
print("DataFrame Sebelum Encoding:")
print(df)

# 1. OrdinalEncoder
# With default settings the categories are ranked in sorted (lexicographic)
# order. That is the intended order for term, grade, sub_grade and
# verification_status, but not for emp_length ('10+ years' would sort before
# '2 years' and '< 1 year' would come last), so emp_length gets an explicit order.
EMP_LENGTH_ORDER = [
    '< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years',
    '6 years', '7 years', '8 years', '9 years', '10+ years',
]
ordinal_encoder = OrdinalEncoder()
emp_length_encoder = OrdinalEncoder(categories=[EMP_LENGTH_ORDER])

for column_name in ['term', 'grade', 'sub_grade', 'emp_length', 'verification_status']:
    encoder = emp_length_encoder if column_name == 'emp_length' else ordinal_encoder
    # fit_transform returns an (n, 1) array; flatten it to a single column.
    df[column_name + '_ordinal'] = encoder.fit_transform(df[[column_name]]).ravel()

print("\nSetelah Ordinal Encoding:")
for column_name in ['term', 'grade', 'sub_grade', 'emp_length', 'verification_status']:
    print(df[[column_name, column_name + '_ordinal']])


In [None]:
# Ordinal-encode the ordered categorical columns of data_df.
# The encoder is created here so the cell does not rely on an encoder object
# left over from an earlier cell.
ordinal_encoder = OrdinalEncoder()

# Sorted (lexicographic) order is the intended rank for these columns.
for column_name in ['term', 'grade', 'sub_grade']:
    data_df[column_name + '_ordinal'] = ordinal_encoder.fit_transform(data_df[[column_name]]).ravel()

# emp_length needs an explicit order: sorted order would rank '10+ years' before
# '2 years' and put '< 1 year' last. Missing values stay NaN.
EMP_LENGTH_ORDER = [
    '< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years',
    '6 years', '7 years', '8 years', '9 years', '10+ years',
]
emp_length_rank = {label: float(rank) for rank, label in enumerate(EMP_LENGTH_ORDER)}
unranked_labels = set(data_df['emp_length'].dropna().unique()) - set(EMP_LENGTH_ORDER)
if unranked_labels:
    # Surface unexpected labels instead of silently turning them into NaN.
    print('emp_length labels without a rank (encoded as NaN):', sorted(unranked_labels))
data_df['emp_length_ordinal'] = data_df['emp_length'].map(emp_length_rank)

data_df['verification_status_ordinal'] = ordinal_encoder.fit_transform(data_df[['verification_status']]).ravel()

# data_df now holds one '<column>_ordinal' column per encoded variable.

In [None]:
data_df.info()

In [None]:
# Remove the original text columns now that their '_ordinal' versions exist.
# One atomic call: a missing column raises before anything is dropped, instead
# of leaving data_df half-modified as a chain of in-place drops would.
data_df = data_df.drop(columns=['term', 'grade', 'sub_grade', 'emp_length', 'verification_status'])

In [None]:
data_df['home_ownership'].value_counts()

In [None]:
data_df['purpose'].value_counts()


In [None]:
data_df['addr_state'].value_counts()


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [None]:
# Build an example DataFrame listing sample labels for four nominal columns.
# The lists have different lengths, so they are padded below before being
# combined; rows of the resulting frame are therefore not real records.
data = {
    'home_ownership': ['MORTGAGE', 'RENT', 'OWN', 'OTHER', 'NONE', 'ANY'],
    'purpose': ['debt_consolidation', 'credit_card', 'home_improvement',
                'other', 'major_purchase', 'small_business', 'car', 'medical',
                'moving', 'vacation', 'wedding', 'house', 'educational',  'renewable_energy'],
    'addr_state': ['CA', 'NY', 'TX', 'FL', 'IL', 'NJ', 'PA', 'OH', 'GA', 'VA',
                  'NC', 'MI', 'MA', 'MD', 'AZ', 'WA', 'CO', 'MN', 'MO', 'CT',
                  'IN', 'NV', 'TN', 'OR', 'WI', 'AL', 'SC', 'LA', 'KY', 'KS',
                  'OK', 'AR', 'UT', 'NM', 'HI', 'WV', 'NH', 'RI', 'DC', 'MT',
                  'DE', 'AK', 'MS', 'WY', 'SD', 'VT', 'IA', 'NE', 'ID', 'ME'],
    'initial_list_status': ['f', 'w'],

}

# Find the maximum length among all lists
max_len = max(len(value) for value in data.values())

# Pad shorter lists with None to match the maximum length
padded_data = {
    key: value + [None] * (max_len - len(value))
    for key, value in data.items()
}

df = pd.DataFrame(padded_data)
# Show the DataFrame before encoding
print("DataFrame Sebelum Encoding:")
print(df)

# LabelEncoder for the 'home_ownership' column
# NOTE(review): the padded None entries are passed to the encoder as well.
label_encoder_home_ownership = LabelEncoder()
df['home_ownership_label'] = label_encoder_home_ownership.fit_transform(df['home_ownership'])

# LabelEncoder for the 'purpose' column
label_encoder_purpose = LabelEncoder()
df['purpose_label'] = label_encoder_purpose.fit_transform(df['purpose'])

# LabelEncoder for the 'addr_state' column
label_encoder_addr_state = LabelEncoder()
df['addr_state_label'] = label_encoder_addr_state.fit_transform(df['addr_state'])

# LabelEncoder for the 'initial_list_status' column
label_encoder_initial_list_status = LabelEncoder()
df['initial_list_status_label'] = label_encoder_initial_list_status.fit_transform(df['initial_list_status'])





In [None]:
# Show the encoded demo columns one after another.
print("\nSetelah Label Encoder:")
for label_column in (
    'home_ownership_label',
    'purpose_label',
    'addr_state_label',
    'initial_list_status_label',
):
    print(df[label_column])

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder instance; it is refitted for every column below.
label_encoder = LabelEncoder()

# LabelEncoder expects a 1-D array-like, so each column is passed as a Series
# (single brackets). A one-column DataFrame only works through a
# column-vector conversion that scikit-learn warns about.
for column_name in ['home_ownership', 'purpose', 'addr_state', 'initial_list_status']:
    data_df[column_name + '_label'] = label_encoder.fit_transform(data_df[column_name])

In [None]:
data_df.info()

In [None]:
# Remove the original text columns now that their '_label' versions exist.
# One atomic call: a missing column raises before anything is dropped, instead
# of leaving data_df half-modified as a chain of in-place drops would.
data_df = data_df.drop(columns=['home_ownership', 'purpose', 'addr_state', 'initial_list_status'])

In [None]:
data_df.info()

In [None]:
# Remove the datetime columns; only their '<column>_month' derivatives are kept.
# One atomic call: a missing column raises before anything is dropped, instead
# of leaving data_df half-modified as a chain of in-place drops would.
data_df = data_df.drop(
    columns=['last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d', 'earliest_cr_line', 'issue_d']
)

In [None]:
# find numerical variables
# Recomputed after encoding: every column whose dtype is not object ('O').
numerical = [name for name, dtype in data_df.dtypes.items() if dtype != 'O']

print(f'There are {len(numerical)} numerical variables\n')

print('The numerical variables are :', numerical)

In [None]:
data_df[numerical].head()

In [None]:
data_df[numerical].tail()

In [None]:
data_df[numerical].isnull().sum()

**Data Preparation**

In [None]:
data_df

In [None]:
data_df.isnull().sum()

Impute missing values

In [None]:
# Mean-impute the numeric columns listed below.
# The filled Series is assigned back to the frame: `data_df[col].fillna(...,
# inplace=True)` acts on an intermediate Series, which recent pandas versions
# warn about and which leaves data_df unchanged under Copy-on-Write.
mean_imputed_columns = [
    'delinq_2yrs',
    'inq_last_6mths',
    'mths_since_last_delinq',
    'mths_since_last_record',
    'open_acc',
    'pub_rec',
    'revol_util',
    'total_acc',
    'collections_12_mths_ex_med',
    'mths_since_last_major_derog',
    'acc_now_delinq',
    'tot_coll_amt',
    'tot_cur_bal',
    'total_rev_hi_lim',
    'last_pymnt_d_month',
    'next_pymnt_d_month',
    'last_credit_pull_d_month',
    'earliest_cr_line_month',
    'emp_length_ordinal',
]
for column_name in mean_imputed_columns:
    data_df[column_name] = data_df[column_name].fillna(data_df[column_name].mean())

In [None]:
# emp_title is text, so it is filled with its most frequent value; annual_inc
# is filled with its mean. Results are assigned back because an in-place
# fillna on `data_df[col]` is not guaranteed to modify data_df (it warns in
# recent pandas and does nothing under Copy-on-Write).
data_df['emp_title'] = data_df['emp_title'].fillna(data_df['emp_title'].mode()[0])
data_df['annual_inc'] = data_df['annual_inc'].fillna(data_df['annual_inc'].mean())

In [None]:
data_df.isnull().sum()

In [None]:
data_df.info()

In [None]:
# Correlation matrix of the numeric columns only.
numeric_df = data_df.select_dtypes(include=['number'])
correlation_matrix = numeric_df.corr()

# Annotated heatmap on a large canvas so every cell value stays legible.
fig, ax = plt.subplots(figsize=(44, 33))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', square=True, ax=ax)
ax.set_title('Heatmap Korelasi Variabel Numerik')
plt.show()

In [None]:
# view summary statistics in numerical variables
# The precision belongs inside round(); with the parenthesis closed too early
# the table was rounded to 0 decimals and a stray "4" was printed after it.
print(round(data_df.describe(), 4))

In [None]:
# draw boxplots to visualize outliers
# One box plot per variable, placed in the first four cells of a 4x2 grid.
plt.figure(figsize=(15,10))

for position, column_name in enumerate(
    ['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'annual_inc'], start=1
):
    plt.subplot(4, 2, position)
    fig = data_df.boxplot(column=column_name)
    fig.set_title('')
    fig.set_ylabel(column_name)



In [None]:
# plot histogram to check distribution
# Each panel is labelled with the column it actually shows. Previously the
# x-labels named other columns (e.g. 'loan_amnt' on the annual_inc histogram)
# and the y-axis was labelled 'loan_status' although it shows bin counts.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

for ax, column_name in zip(axes.ravel(), ['annual_inc', 'delinq_2yrs', 'revol_bal', 'revol_util']):
    data_df[column_name].hist(bins=10, ax=ax)
    ax.set_xlabel(column_name)
    ax.set_ylabel('Frequency')

plt.show()

In [None]:
# find outliers for loan_amnt variable
# Fences are placed 3 * IQR below the first and above the third quartile.
first_quartile = data_df['loan_amnt'].quantile(0.25)
third_quartile = data_df['loan_amnt'].quantile(0.75)
IQR = third_quartile - first_quartile
Lower_fence = first_quartile - (IQR * 3)
Upper_fence = third_quartile + (IQR * 3)
print(f'loan_amnt outliers are values < {Lower_fence} or > {Upper_fence}')

In [None]:
# find outliers for funded_amnt variable
# Fences are placed 3 * IQR below the first and above the third quartile.
first_quartile = data_df['funded_amnt'].quantile(0.25)
third_quartile = data_df['funded_amnt'].quantile(0.75)
IQR = third_quartile - first_quartile
Lower_fence = first_quartile - (IQR * 3)
Upper_fence = third_quartile + (IQR * 3)
print(f'funded_amnt outliers are values < {Lower_fence} or > {Upper_fence}')

In [None]:
# find outliers for funded_amnt_inv variable
# Fences are placed 3 * IQR below the first and above the third quartile.
first_quartile = data_df['funded_amnt_inv'].quantile(0.25)
third_quartile = data_df['funded_amnt_inv'].quantile(0.75)
IQR = third_quartile - first_quartile
Lower_fence = first_quartile - (IQR * 3)
Upper_fence = third_quartile + (IQR * 3)
print(f'funded_amnt_inv outliers are values < {Lower_fence} or > {Upper_fence}')

In [None]:
# find outliers for revol_util variable
# Fences are placed 3 * IQR below the first and above the third quartile.
first_quartile = data_df['revol_util'].quantile(0.25)
third_quartile = data_df['revol_util'].quantile(0.75)
IQR = third_quartile - first_quartile
Lower_fence = first_quartile - (IQR * 3)
Upper_fence = third_quartile + (IQR * 3)
print(f'revol_util outliers are values < {Lower_fence} or > {Upper_fence}')

In [None]:
# find outliers for annual_inc variable
# Fences are placed 3 * IQR below the first and above the third quartile.
first_quartile = data_df['annual_inc'].quantile(0.25)
third_quartile = data_df['annual_inc'].quantile(0.75)
IQR = third_quartile - first_quartile
Lower_fence = first_quartile - (IQR * 3)
Upper_fence = third_quartile + (IQR * 3)
print(f'annual_inc outliers are values < {Lower_fence} or > {Upper_fence}')

one hot encoding

In [None]:
data_df['emp_title'].value_counts()

In [None]:
data_df['loan_status'].value_counts()

In [None]:
# data_df = pd.concat([data_df, Emp_title, Loan_status], axis=1)

**Data Preparation**

Work with the emp_title and loan_status variables

In [None]:
# Build an example DataFrame holding the two target labels.
data = {
    'loan_status': ['good', 'bad']
}

# Find the maximum length among all lists
max_len = max(len(value) for value in data.values())

# Pad shorter lists with None to match the maximum length
# (with a single list here, nothing is actually padded)
padded_data = {
    key: value + [None] * (max_len - len(value))
    for key, value in data.items()
}

df = pd.DataFrame(padded_data)
# Show the DataFrame before encoding
print("DataFrame Sebelum Encoding:")
print(df)

# LabelEncoder for the 'loan_status' column
label_encoder_loan_status = LabelEncoder()
df['loan_status_label'] = label_encoder_loan_status.fit_transform(df['loan_status'])


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
print("\nSetelah Label Encoder:")
print(df['loan_status_label'])

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Encode the target with an encoder created in this cell. Previously a new
# `label_encoder` was instantiated but never used, and the encoding silently
# relied on `label_encoder_loan_status` left over from the example cell.
# LabelEncoder numbers the classes in sorted order, so 'bad' -> 0 and 'good' -> 1.
label_encoder_loan_status = LabelEncoder()

data_df['loan_status_label'] = label_encoder_loan_status.fit_transform(data_df['loan_status'])


In [None]:
data_df.info()

In [None]:
# Remove the remaining text columns: emp_title and the un-encoded target.
# One atomic call: a missing column raises before anything is dropped, instead
# of leaving data_df half-modified as two separate in-place drops would.
data_df = data_df.drop(columns=['emp_title', 'loan_status'])

In [None]:
data_df.info()

In [None]:
# Separate the target vector y from the feature matrix X (all other columns).
# NOTE(review): X keeps every remaining column; confirm none of them is only
# known after the loan outcome (possible target leakage).
target_column = 'loan_status_label'

y = data_df[target_column]

X = data_df.drop(columns=[target_column])

In [None]:
# split X and y into training and testing sets
# 20% of the rows go to the test set; random_state fixes the shuffle so the
# partition is reproducible.
# NOTE(review): the split is not stratified; consider `stratify=y` if the
# classes are imbalanced.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
# check the shape of X_train and X_test

X_train.shape, X_test.shape

In [None]:
# check data types in X_train

X_train.dtypes

In [None]:
# display categorical variables
# Columns of the training features that are still stored as object ('O').
categorical = [name for name, dtype in X_train.dtypes.items() if dtype == 'O']

categorical

In [None]:
# display numerical variables
# Columns of the training features whose dtype is not object ('O').
numerical = [name for name, dtype in X_train.dtypes.items() if dtype != 'O']

numerical

Feature Scaling

In [None]:
X_train.describe()

In [None]:
X_train = pd.DataFrame(X_train, columns=X_train.columns) # Convert X_train back to DataFrame

In [None]:
cols = X_train.columns


In [None]:
print(X_train.columns)
print(X_test.columns)

In [None]:
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
X_test = X_test[X_train.columns]

In [None]:
data_df.info()

In [None]:
data_df

In [None]:
data_df.info()

In [None]:
X_train.info()

In [None]:
# MinMaxScaler is already imported at the top of the notebook.
# from sklearn.preprocessing import MinMaxScaler

# The scaler is fitted on the training split only; the test split is then
# transformed with the parameters learned from the training data.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
print("X_train_scaled:")
print(X_train_scaled)


In [None]:
print("\nX_test_scaled:")
print(X_test_scaled)

In [None]:
X_train.describe()

Model Training

In [None]:
data_df['loan_status_label'].value_counts()

In [None]:
y_train.value_counts()

In [None]:
y_test.value_counts()

In [None]:
y_train.head()

In [None]:
y_test.head()

In [None]:
# Logistic regression is sensitive to feature scale, so both models below are
# trained and evaluated on the min-max scaled matrices (X_train_scaled /
# X_test_scaled) rather than on the raw features, which left the scaled
# matrices computed earlier unused.
# NOTE: `logreg` and `logistic_regression_model` therefore expect scaled input.

# Model 1: liblinear solver
logreg = LogisticRegression(solver='liblinear', random_state=0)
logreg.fit(X_train_scaled, y_train)
y_pred_test = logreg.predict(X_test_scaled)
print('Training set score: {:.4f}'.format(logreg.score(X_train_scaled, y_train)))
print('Test set score: {:.4f}'.format(logreg.score(X_test_scaled, y_test)))

# Model 2: default solver
logistic_regression_model = LogisticRegression(random_state=0)
logistic_regression_model.fit(X_train_scaled, y_train)
y_pred_lr = logistic_regression_model.predict(X_test_scaled)

# Evaluate the model
print("Logistic Regression Classifier:")
print(classification_report(y_test, y_pred_lr))

# Confusion matrix of the liblinear model. scikit-learn puts true labels on the
# rows and predicted labels on the columns in sorted label order, so with
# class 1 as the positive class the flattened 2x2 matrix is (TN, FP, FN, TP).
cm = confusion_matrix(y_test, y_pred_test, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

print('Confusion matrix\n\n', cm)

print('\nTrue Positives(TP) = ', tp)

print('\nTrue Negatives(TN) = ', tn)

print('\nFalse Positives(FP) = ', fp)

print('\nFalse Negatives(FN) = ', fn)

In [None]:
# y_pred_test

In [None]:
# probability of getting output as 0 (class 0 of loan_status_label)

# logreg.predict_proba(X_test)[:,0]

In [None]:
# probability of getting output as 1 (class 1 of loan_status_label)

# logreg.predict_proba(X_test)[:,1]

In [None]:
from sklearn.metrics import accuracy_score

# print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred_test)))

In [None]:
# y_pred_train = logreg.predict(X_train)

# y_pred_train

In [None]:
# print the scores on training and test set



In [None]:
# fit the Logistic Regression model with C=0.01

# instantiate the model
# logreg001 = LogisticRegression(C=0.01, solver='liblinear', random_state=0)


# # fit the model
# logreg001.fit(X_train, y_train)

In [None]:
# print the scores on training and test set

# print('Training set score: {:.4f}'.format(logreg001.score(X_train, y_train)))

# print('Test set score: {:.4f}'.format(logreg001.score(X_test, y_test)))

In [None]:
# check class distribution in test set

# y_test.value_counts()

In [None]:
# check null accuracy score

# null_accuracy = (82206/(82206+11051))

# print('Null accuracy score: {0:0.4f}'. format(null_accuracy))

1. random forest

In [None]:
# train a random forest model on the training set
# from sklearn.ensemble import RandomForestClassifier # Import RandomForestClassifier from sklearn.ensemble

# # instantiate the model
# random_forest = RandomForestClassifier(random_state=0) # Remove solver='liblinear'

# # fit the model
# random_forest.fit(X_train, y_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train the forest once with a fixed seed. The cell used to fit two forests
# (one unseeded) on the same data; both original names are kept as aliases.
random_forest_model = RandomForestClassifier(random_state=0)
random_forest_model.fit(X_train, y_train)
rf_clf = random_forest_model

# Make predictions
y_pred_rf = random_forest_model.predict(X_test)
y_pred = y_pred_rf

print("Accuracy on Traing set: ", random_forest_model.score(X_train, y_train))
print("Accuracy on Testing set: ", random_forest_model.score(X_test, y_test))

# Evaluate the model
print("Random Forest Classifier:")
print(classification_report(y_test, y_pred_rf))

# Confusion matrix of the random-forest predictions (it was previously built
# from the logistic-regression predictions `y_pred_test`). scikit-learn puts
# true labels on the rows and predicted labels on the columns in sorted label
# order, so with class 1 as the positive class the flattened 2x2 matrix is
# (TN, FP, FN, TP).
cm = confusion_matrix(y_test, y_pred_rf, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

print('Confusion matrix\n\n', cm)

print('\nTrue Positives(TP) = ', tp)

print('\nTrue Negatives(TN) = ', tn)

print('\nFalse Positives(FP) = ', fp)

print('\nFalse Negatives(FN) = ', fn)

2. XGBoost Classifier


In [None]:
from xgboost import XGBClassifier

# Train the model once. The cell used to fit two identically configured
# classifiers; both original names are kept as aliases of the same model.
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_clf.fit(X_train, y_train)
xgboost_model = xgb_clf

# Make predictions
y_pred_xgb = xgb_clf.predict(X_test)

# Evaluate the model
print("XGBoost Classifier:")
print("Accuracy on Training set: ", xgb_clf.score(X_train, y_train))
print("Accuracy on Testing set: ", xgb_clf.score(X_test, y_test))

print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb))

# Confusion matrix of the XGBoost predictions (it was previously built from
# the logistic-regression predictions `y_pred_test`). scikit-learn puts true
# labels on the rows and predicted labels on the columns in sorted label
# order, so with class 1 as the positive class the flattened 2x2 matrix is
# (TN, FP, FN, TP).
cm = confusion_matrix(y_test, y_pred_xgb, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

print('Confusion matrix\n\n', cm)

print('\nTrue Positives(TP) = ', tp)

print('\nTrue Negatives(TN) = ', tn)

print('\nFalse Positives(FP) = ', fp)

print('\nFalse Negatives(FN) = ', fn)

3. LightGBM

In [None]:
from lightgbm import LGBMClassifier

# Train the model once. The cell used to fit two identically configured
# classifiers; both original names are kept as aliases of the same model.
lgbm_clf = LGBMClassifier()
lgbm_clf.fit(X_train, y_train)
lightgbm_model = lgbm_clf

# Make predictions
y_pred_lgbm = lgbm_clf.predict(X_test)
y_pred_lgb = y_pred_lgbm

# Evaluate the model. The scores are taken from `lgbm_clf`; the original code
# referenced an undefined `lgb_reg` and raised NameError, and its heading
# called the classifier a regressor.
print("LightGBM Classifier:")
print("Accuracy on Training set: ", lgbm_clf.score(X_train, y_train))
print("Accuracy on Testing set: ", lgbm_clf.score(X_test, y_test))
print(classification_report(y_test, y_pred_lgbm))

# Confusion matrix of the LightGBM predictions (it was previously built from
# the logistic-regression predictions `y_pred_test`). scikit-learn puts true
# labels on the rows and predicted labels on the columns in sorted label
# order, so with class 1 as the positive class the flattened 2x2 matrix is
# (TN, FP, FN, TP).
cm = confusion_matrix(y_test, y_pred_lgbm, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

print('Confusion matrix\n\n', cm)

print('\nTrue Positives(TP) = ', tp)

print('\nTrue Negatives(TN) = ', tn)

print('\nFalse Positives(FP) = ', fp)

print('\nFalse Negatives(FN) = ', fn)

4. Decision Tree Classifier


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train the tree once with a fixed seed. The cell used to fit two trees (one
# unseeded) on the same data; both original names are kept as aliases.
decision_tree_model = DecisionTreeClassifier(random_state=0)
decision_tree_model.fit(X_train, y_train)
dt_clf = decision_tree_model

# Make predictions
y_pred_dt = decision_tree_model.predict(X_test)

# Evaluate the model (the heading used to call this classifier a regressor).
print("Decision Tree Classifier:")
print("Accuracy on Training set: ", decision_tree_model.score(X_train, y_train))
print("Accuracy on Testing set: ", decision_tree_model.score(X_test, y_test))
print(classification_report(y_test, y_pred_dt))

# Confusion matrix of the decision-tree predictions (it was previously built
# from the logistic-regression predictions `y_pred_test`). scikit-learn puts
# true labels on the rows and predicted labels on the columns in sorted label
# order, so with class 1 as the positive class the flattened 2x2 matrix is
# (TN, FP, FN, TP).
cm = confusion_matrix(y_test, y_pred_dt, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

print('Confusion matrix\n\n', cm)

print('\nTrue Positives(TP) = ', tp)

print('\nTrue Negatives(TN) = ', tn)

print('\nFalse Positives(FP) = ', fp)

print('\nFalse Negatives(FN) = ', fn)