<a href="https://colab.research.google.com/github/chereunii/chereunii.github.io/blob/main/3%EC%A3%BC%EC%B0%A8_%EC%8B%A4%EC%8A%B5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 대출 가능성 예측하기

**About the loan_data.csv file:** \
- Loan_ID: A unique loan ID.

- Gender: Either male or female.

- Married: Whether the applicant is married (Yes) or not married (No).

- Dependents: Number of persons depending on the client
  - [부양가족 수: 신청인에게 의존하는 사람의 수]

- Education: Applicant education (Graduate or Not Graduate).

- Self_Employed: Self-employed (Yes/No).

  - [자영업 여부: 자영업자인지 여부 (Yes: 자영업, No: 자영업 아님)]

- ApplicantIncome: Applicant income.

  - [신청자 소득: 신청인의 소득]

- CoapplicantIncome: Co-applicant income.

  - [공동 신청자 소득: 공동 신청자의 소득]

- LoanAmount: Loan amount in thousands.
- Loan_Amount_Term: Terms of the loan in months.
- Credit_History: Credit history meets guidelines.

  - [신용 기록: 신용 기록이 대출 기준을 충족하는지 여부]

- Property_Area: Area where the applicant lives (Urban, Semiurban, or Rural).
- Loan_Status: Loan approved (Y/N).

In [2]:
import pandas as pd
import warnings
# Silence every warning for the rest of the session so cell output stays clean.
# NOTE(review): this also hides deprecation and library-usage warnings.
warnings.filterwarnings('ignore')

In [3]:
# Load the loan dataset from the CSV file and preview its first rows.
data = pd.read_csv('/content/loan_data.csv')
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128,360.0,1.0,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66,360.0,1.0,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120,360.0,1.0,Urban,Y
3,LP001008,Male,No,0,Graduate,No,6000,0.0,141,360.0,1.0,Urban,Y
4,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95,360.0,1.0,Urban,Y


#### 데이터 탐색 및 전처리

In [4]:
# Inspect column dtypes and non-null counts to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            381 non-null    object 
 1   Gender             376 non-null    object 
 2   Married            381 non-null    object 
 3   Dependents         373 non-null    object 
 4   Education          381 non-null    object 
 5   Self_Employed      360 non-null    object 
 6   ApplicantIncome    381 non-null    int64  
 7   CoapplicantIncome  381 non-null    float64
 8   LoanAmount         381 non-null    int64  
 9   Loan_Amount_Term   370 non-null    float64
 10  Credit_History     351 non-null    float64
 11  Property_Area      381 non-null    object 
 12  Loan_Status        381 non-null    object 
dtypes: float64(3), int64(2), object(8)
memory usage: 38.8+ KB


In [5]:
# Count the missing values in each column.
print(data.isnull().sum())

Loan_ID               0
Gender                5
Married               0
Dependents            8
Education             0
Self_Employed        21
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     11
Credit_History       30
Property_Area         0
Loan_Status           0
dtype: int64


In [6]:
# Drop every row that contains at least one missing value.
# NOTE(review): per the row counts printed in this notebook, this shrinks the
# data from 381 to 308 rows.
data = data.dropna()

In [7]:
# Verify the result of the previous step.
print(data.isnull().sum())  # every per-column count should now be 0

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,308.0,308.0,308.0,308.0,308.0
mean,3599.126623,1278.434805,104.623377,341.181818,0.853896
std,1462.359612,2520.961308,29.382256,68.246006,0.353785
min,150.0,0.0,9.0,36.0,0.0
25%,2568.75,0.0,89.75,360.0,1.0
50%,3329.5,871.5,110.0,360.0,1.0
75%,4291.0,1953.5,128.0,360.0,1.0
max,9703.0,33837.0,150.0,480.0,1.0


In [9]:
# Categorical columns whose distinct values we want to inspect.
columns_to_check = [
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Credit_History',
    'Property_Area',
    'Loan_Status',
]

# Print a header, the distinct values, and a blank separator for each column.
for column in columns_to_check:
    unique_values = data[column].unique()
    print(f"Unique values in column '{column}':", unique_values, "\n", sep="\n")

Unique values in column 'Gender':
['Male' 'Female']


Unique values in column 'Married':
['Yes' 'No']


Unique values in column 'Education':
['Graduate' 'Not Graduate']


Unique values in column 'Self_Employed':
['No' 'Yes']


Unique values in column 'Credit_History':
[1. 0.]


Unique values in column 'Property_Area':
['Rural' 'Urban' 'Semiurban']


Unique values in column 'Loan_Status':
['N' 'Y']




#### 더미변수 가공

In [10]:
# Columns used for modelling: every column except the Loan_ID identifier.
# Loan_Status is the label to predict; the rest are candidate features.
target_col = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
    'Property_Area',
    'Loan_Status',
]

# Restrict the frame to the modelling columns and preview it.
predict_data = data.loc[:, target_col]
predict_data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,Yes,1,Graduate,No,4583,1508.0,128,360.0,1.0,Rural,N
1,Male,Yes,0,Graduate,Yes,3000,0.0,66,360.0,1.0,Urban,Y
2,Male,Yes,0,Not Graduate,No,2583,2358.0,120,360.0,1.0,Urban,Y
3,Male,No,0,Graduate,No,6000,0.0,141,360.0,1.0,Urban,Y
4,Male,Yes,0,Not Graduate,No,2333,1516.0,95,360.0,1.0,Urban,Y


In [11]:
# One-hot encode the categorical columns into dummy variables
# (the numeric columns pass through unchanged, as the preview shows).
predict_data = pd.get_dummies(predict_data)
predict_data.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,...,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_N,Loan_Status_Y
0,4583,1508.0,128,360.0,1.0,False,True,False,True,False,...,False,True,False,True,False,True,False,False,True,False
1,3000,0.0,66,360.0,1.0,False,True,False,True,True,...,False,True,False,False,True,False,False,True,False,True
2,2583,2358.0,120,360.0,1.0,False,True,False,True,True,...,False,False,True,True,False,False,False,True,False,True
3,6000,0.0,141,360.0,1.0,False,True,True,False,True,...,False,True,False,True,False,False,False,True,False,True
4,2333,1516.0,95,360.0,1.0,False,True,False,True,True,...,False,False,True,True,False,False,False,True,False,True


In [12]:
# Remove one dummy per categorical variable: the dropped level becomes the
# implicit reference category, so the remaining dummies carry all the information.
predict_data = predict_data.drop(
    columns=[
        'Gender_Male',
        'Married_No',
        'Dependents_0',
        'Education_Not Graduate',
        'Self_Employed_No',
        'Property_Area_Rural',
        'Loan_Status_N',
    ]
)
predict_data.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y
0,4583,1508.0,128,360.0,1.0,False,True,True,False,False,True,False,False,False,False
1,3000,0.0,66,360.0,1.0,False,True,False,False,False,True,True,False,True,True
2,2583,2358.0,120,360.0,1.0,False,True,False,False,False,False,False,False,True,True
3,6000,0.0,141,360.0,1.0,False,False,False,False,False,True,False,False,True,True
4,2333,1516.0,95,360.0,1.0,False,True,False,False,False,False,False,False,True,True


#### decision tree 예측 모델 구축

In [13]:
# Build a decision-tree classifier on a class-balanced sample of the data.
from sklearn.tree import DecisionTreeClassifier
import sklearn.model_selection

# Separate the rows by outcome so each class can be sampled on its own.
approved = predict_data[predict_data["Loan_Status_Y"].eq(1)]
denied = predict_data[predict_data["Loan_Status_Y"].eq(0)]

# Undersample the larger class down to the size of the smaller one so the
# training data is not imbalanced.
min_size = min(approved.shape[0], denied.shape[0])
approved = approved.sample(n=min_size, random_state=42)
denied = denied.sample(n=min_size, random_state=42)

# Stack both classes, then pull the label column out of the feature matrix.
X = pd.concat([approved, denied], ignore_index=True)
y = X.pop("Loan_Status_Y")
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, random_state=42
)

# Fit an unconstrained tree and predict the held-out rows as 0/1 integers.
dt = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
y_test_pred = dt.predict(X_test).astype(int)
print(y_test_pred)

[0 1 0 1 0 1 0 0 1 1 0 0 0 0 1 0 0 1 1 0 1 1 1 0 0 0 0 1 1 1 0 0 0 0 1 1 0
 0 0 1 1 1 1 0 0]


In [18]:
# Put the actual and predicted labels side by side.
# y_test holds booleans, so the whole table is cast to integers to display
# plain 0/1 values.
results_test = pd.DataFrame(dict(y_test=y_test, y_pred=y_test_pred)).astype(int)
results_test.head()

Unnamed: 0,y_test,y_pred
19,1,0
45,1,1
140,0,0
30,1,1
67,1,0


#### 예측 모델 평가 및 모델 튜닝

In [19]:
# Compute accuracy by hand from results_test:
# (rows where prediction equals the actual label) / (all rows).
correct = int(results_test["y_test"].eq(results_test["y_pred"]).sum())
data_count = results_test.shape[0]
score_test = correct / data_count
print(score_test)

0.7333333333333333


In [21]:
# Compute accuracy with the estimator's own score() method:
# first on the test split, then on the training split.
print(dt.score(X_test, y_test))
print(dt.score(X_train, y_train))

0.7333333333333333
1.0


In [23]:
# Model tuning: limit the tree depth.
# Rebuild the features/label from the balanced samples and split them with
# the same random seed as before.
X = pd.concat([approved, denied], ignore_index=True)
y = X.pop("Loan_Status_Y")
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, random_state=42
)

# Cap the depth at 8, then report test accuracy followed by training accuracy.
dt = DecisionTreeClassifier(max_depth=8, random_state=0).fit(X_train, y_train)
print(dt.score(X_test, y_test))
print(dt.score(X_train, y_train))

0.7111111111111111
0.9398496240601504


#### 모델에 기여하는 변수 확인

In [24]:
# Tabulate the fitted tree's feature importances next to the feature names.
# The value column keeps the name "coefficient" although it holds
# feature_importances_ values.
importance = (
    pd.Series(dt.feature_importances_, index=X.columns, name="coefficient")
    .rename_axis("feature_names")
    .reset_index()
)
importance

Unnamed: 0,feature_names,coefficient
0,ApplicantIncome,0.082485
1,CoapplicantIncome,0.065794
2,LoanAmount,0.244453
3,Loan_Amount_Term,0.082037
4,Credit_History,0.36478
5,Gender_Female,0.026671
6,Married_Yes,0.039182
7,Dependents_1,0.0
8,Dependents_2,0.021167
9,Dependents_3+,0.0


In [25]:
# Re-train the tree on a reduced, hand-picked feature set.
from sklearn.tree import DecisionTreeClassifier
import sklearn.model_selection

# 1. Features kept after inspecting the importance table.
# NOTE(review): the stated rule was to drop features with importance <= 0.05,
# yet Married_Yes (0.039 in the importance table) is kept -- confirm intent.
selected_columns = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
    'Married_Yes',
]
X_selected = X.loc[:, selected_columns]

# 2. Split with the same seed as before and fit a new tree.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X_selected, y, random_state=42
)

# NOTE(review): max_depth is 10 here, whereas the previous tuning cell used 8.
dt = DecisionTreeClassifier(max_depth=10, random_state=0).fit(X_train, y_train)

# 3. Report accuracy on the training and test splits.
train_score = dt.score(X_train, y_train)
test_score = dt.score(X_test, y_test)
print(f"Train Score: {train_score}\nTest Score: {test_score}")

Train Score: 0.9849624060150376
Test Score: 0.7777777777777778


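직전에 실행한 코드(전체 변수 사용, max_depth=8)의 결과는 \
Test Score: 0.7111111111111111 \
Train Score: 0.9398496240601504 \
이었음. 변수 선택 후 max_depth=10으로 다시 학습한 모델은 Train Score 약 0.985, Test Score 약 0.778로 두 점수가 모두 상승했음. Train Score가 Test Score보다 여전히 크게 높으므로 과적합은 남아 있음.
다만, Test Score도 함께 개선되었고 Train-Test Score 차이도 약 0.23에서 약 0.21로 커지지 않았기 때문에 과적합이 심화되지는 않았다고 판단할 수 있음. 이 정도의 Train-Test Score 차이는 어느 정도 허용될 수 있지만, 과적합을 더 줄이려면 모델의 복잡도를 줄이거나 교차 검증을 활용하는 등의 방법을 고려할 수 있음.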



#### 대출 승인 여부 예측

In [26]:
# Example values for a new applicant to run through the model.
ApplicantIncome = 5000
CoapplicantIncome = 2000
LoanAmount = 150
Loan_Amount_Term = 360
Credit_History = 1
Married_Yes = "Y"  # the next cell treats "Y" as married and anything else as not married

In [27]:
# Encode marital status as the 0/1 value used by the Married_Yes feature
# ("Y" -> 1, anything else -> 0).
married_val = 1 if Married_Yes == "Y" else 0

# Collapse credit history to a 0/1 flag; any value other than 1 counts as 0.
credit_history_val = 1 if Credit_History == 1 else 0

# Feature values for the new applicant, listed in the model's training column order.
input_data = [
    ApplicantIncome,
    CoapplicantIncome,
    LoanAmount,
    Loan_Amount_Term,
    credit_history_val,
    married_val,
]

# The model was fitted on a DataFrame, so pass a one-row DataFrame labelled
# with the model's own training column names rather than a bare list.
# This avoids relying on the suppressed "no valid feature names" warning and
# raises immediately if the number of values does not match the model's features.
input_frame = pd.DataFrame([input_data], columns=dt.feature_names_in_)

# Run the prediction.
print(dt.predict(input_frame))  # predicted class
print(dt.predict_proba(input_frame))  # class probabilities


[ True]
[[0. 1.]]
