In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np 
import pandas as pd 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


# File system management
import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

This notebook was transcribed with reference to the notebook linked below.  
https://www.kaggle.com/code/willkoehrsen/start-here-a-gentle-introduction/notebook  


Supervised: The labels are included in the training data and the goal is to train a model to learn to predict the labels from the features  
Classification: The label is a binary variable, 0 (will repay loan on time), 1 (will have difficulty repaying loan)  


Follow-up Notebooks

For those looking to keep working on this problem, I have a series of follow-up notebooks:

- Manual Feature Engineering Part One  
- Manual Feature Engineering Part Two  
- Introduction to Automated Feature Engineering  
- Advanced Automated Feature Engineering
- Feature Selection
- Intro to Model Tuning: Grid and Random Search
- Automated Model Tuning
- Model Tuning Results


In [3]:
# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Load the training data, report its dimensions and preview the first rows.
train_csv = '../input/home-credit/application_train.csv'
app_train = pd.read_csv(train_csv)
print('Training data shape: ', app_train.shape)
app_train.head()

In [5]:
# Load the test-set features, report their dimensions and preview the first rows.
# NOTE(review): this path uses the folder 'homecredit' while the training file is
# read from 'home-credit' - confirm that both dataset folders are really attached.
test_csv = '../input/homecredit/application_test.csv'
app_test = pd.read_csv(test_csv)
print('Testing data shape: ', app_test.shape)
app_test.head()

In [12]:
# Count how many rows fall into each TARGET class.
target_counts = app_train['TARGET'].value_counts()
target_counts

In [7]:
# Histogram of the TARGET label; the trailing semicolon hides the Axes repr.
app_train['TARGET'].plot(kind='hist', bins=10);

From this information, we see this is an imbalanced class problem. There are far more loans that were repaid on time than loans that were not repaid. 

### Examine Missing Values
https://suhyun-cho.github.io/kaggle/kaggle-HomeCredit-default-risk-eda-and-FeatureEngineering_beginner/

In [20]:
# mis_val = df.isnull().sum()


# We cannot inspect every missing-value count by eye when there are many columns.
# mis_val = app_train.isna().sum()


When it comes time to build our machine learning models, we will have to fill in these missing values (known as imputation).   
In later work, we will use models such as XGBoost that can handle missing values with no need for imputation.   
Another option would be to drop columns with a high percentage of missing values, although it is impossible  
to know ahead of time if these columns will be helpful to our model. 
Therefore, we will keep all of the columns for now.  

### Column Types

Let's look at the number of columns of each data type.  
int64 and float64 are numeric variables (which can be either discrete or continuous).  
object columns contain strings and are categorical features.  

In [21]:
# Tally the columns of the training frame by dtype.
dtype_counts = app_train.dtypes.value_counts()
dtype_counts

Let's now look at the number of unique entries in each of the object (categorical) columns.

In [24]:
# Number of distinct (non-null) values in each object-dtype column.
app_train.select_dtypes(include='object').nunique()

### Encoding Categorical Variables

In [15]:
# Encoding strategy used in this notebook: object columns with at most two
# distinct values are label encoded here; object columns with more categories
# are left for one-hot encoding.
le = LabelEncoder()
le_count = 0

# Iterating over a DataFrame yields its column names.
for col in app_train:
    # Only object (string) columns are candidates.
    if app_train[col].dtype != 'object':
        continue
    # Skip object columns that have more than two distinct values.
    if len(app_train[col].unique()) > 2:
        continue

    # Fit on the training column only, then apply that same mapping to both
    # train and test so the integer codes agree between the two frames.
    le.fit(app_train[col])
    app_train[col] = le.transform(app_train[col])
    app_test[col] = le.transform(app_test[col])

    # The counter is bumped once for every column that reaches this point, so
    # after the loop it holds the number of label-encoded columns.
    le_count += 1

print('%d columns were label encoded.' % le_count)

In [9]:
# One-hot encode the categorical columns that were not label encoded earlier.
# pd.get_dummies() expands every remaining object column into indicator
# columns; columns that are already numeric pass through unchanged.
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)

# Report the resulting dimensions of both frames. (The original second print
# was missing its closing parenthesis, which made the whole cell a SyntaxError.)
print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

In [None]:
# TARGET exists only in the training data but is still needed, so keep a
# reference to it before aligning and add it back afterwards.
train_labels = app_train['TARGET']

# Align the two frames on their column labels (axis=1) with an inner join:
# only the columns present in BOTH train and test are kept.
app_train, app_test = app_train.align(app_test, axis=1, join='inner')

# Put the label column back on the training frame.
app_train['TARGET'] = train_labels

print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

### Train데이터와 Test데이터 컬럼 맞춰주기
train 데이터와 test 데이터에는 동일한 feature가 있어야 한다.  
train 데이터에 있는 카테고리변수의 유니크한 값 개수와 test 데이터에 있는 카테고리 변수의 유니크한 값 개수가 다른 변수들이 있어서 one-hot-encoding을 했더니, train에는 있는데 test에 없는 컬럼들이 생겨버림.  

따라서 test 데이터에 없고 train에만 있는 컬럼을 삭제해야됨.  

우선, train 데이터에서 TARGET 컬럼을 뽑아낸다. * TARGET 컬럼은 test데이터에 없어도 train 데이터에는 반드시 있어야하기 때문에  

align() 함수의 join메소드를 inner로 적용해서 교집합으로 있는 변수만 추린다.  

In [None]:
# DAYS_BIRTH expressed in years (divided by -365): no outliers are apparent.
app_train['DAYS_BIRTH'].div(-365).describe()

In [None]:
# DAYS_EMPLOYED, by contrast, appears to contain outliers.
days_employed = app_train['DAYS_EMPLOYED']
days_employed.describe()

In [None]:
# Frequency of each distinct DAYS_EMPLOYED value.
app_train.loc[:, 'DAYS_EMPLOYED'].value_counts()

Just out of curiosity, let's subset the anomalous clients and see if they tend to have higher or lower rates of default than the rest of the clients.  
- 365243 is the only positive value of DAYS_EMPLOYED. 
- anom : 365243
- non_anom: != 365243

In [None]:
# Split the clients by whether DAYS_EMPLOYED equals the anomalous value 365243.
is_anom = app_train['DAYS_EMPLOYED'] == 365243
anom = app_train[is_anom]
non_anom = app_train[~is_anom]

# Mean of TARGET within each group, expressed as a percentage.
anom_rate = 100 * anom['TARGET'].mean()
non_anom_rate = 100 * non_anom['TARGET'].mean()

print('The anomalies default on %0.2f%% of loans' % anom_rate)
print('The non-anomalies default on %0.2f%% of loans' % non_anom_rate)
print('There are %d anomalous days of employment' % len(anom))

이상치로 보이는 고객들이 대출을 상환하지못할 확률이 5.4%로 더 낮음.  
이상치를 다루는 가장 안전한 방법은 결측치 채우듯이 채우는 방법  
이 경우 모든 이상치들이 같은값을 갖고 있으므로, 다 같은 값으로 채울것이다.  
이상값들이 중요해보이니, 머신러닝 모델에 이 이상값들을 임의로 채운것에대해 알려줄것이다.  

In [None]:
# Flag the rows whose DAYS_EMPLOYED equals the anomalous value 365243, then
# replace that value with NaN. The flag has to be computed BEFORE the
# replacement, otherwise nothing would match any more.
# NOTE(review): only app_test is handled in this cell - confirm that app_train
# gets the same flag column and replacement elsewhere, so both frames keep
# identical features.
app_test['DAYS_EMPLOYED_ANOM'] = app_test['DAYS_EMPLOYED'] == 365243

# Assign the result back rather than calling replace(..., inplace=True) on the
# selected column: the in-place form is a chained assignment that acts on an
# intermediate object and does not update app_test under pandas Copy-on-Write.
app_test['DAYS_EMPLOYED'] = app_test['DAYS_EMPLOYED'].replace({365243: np.nan})

# Summing the boolean flag column counts the True entries, i.e. the anomalies.
print('There are %d anomalies in the test data out of %d entries' % (app_test["DAYS_EMPLOYED_ANOM"].sum(), len(app_test)))

# Q: Could the flag be captured with a chained assignment such as
#        anom_test = app_test['DAYS_EMPLOYED_ANOM'] = app_test["DAYS_EMPLOYED"] == 365243
#    and the count then be reported as len(anom_test)?
# A: The chained assignment itself works, but len(anom_test) is the number of
#    rows in the boolean Series (the same as len(app_test)), not the number of
#    True flags. Use anom_test.sum() to count the anomalies.

In [None]:
# 11 evenly spaced edges from 20 to 70 (both ends included) -> 10 bins of width 5.
# Passed to pd.cut() with its default right=True, these edges produce the bins
# (20, 25], (25, 30], ...: "(" excludes the endpoint and "]" includes it, so
# (20, 25] means greater than 20 and up to and including 25.
np.linspace(start=20, stop=70, num=11)

In [None]:
"""
cut() 함수를 사용해서 5살 간격으로 나이대 그룹을 나눠보자. 
그다음, 각 나이대 별로 대출상환을 못하는 비율을 체크
"""

# Build a small frame holding the label and the client's age.
# .copy() makes it an independent frame, so adding columns below never writes
# into a slice of app_train (no SettingWithCopyWarning, no accidental aliasing).
age_data = app_train[['TARGET', 'DAYS_BIRTH']].copy()

# DAYS_BIRTH appears to be stored as a negative day count (an earlier cell
# divides it by -365 to get years). Take the absolute value so the age in years
# is positive; otherwise every value falls outside the 20-70 bins and
# YEARS_BINNED is entirely NaN.
age_data['YEARS_BIRTH'] = age_data['DAYS_BIRTH'].abs() / 365

# Bin the age data into ten 5-year intervals: (20, 25], (25, 30], ..., (65, 70].
age_data['YEARS_BINNED'] = pd.cut(age_data['YEARS_BIRTH'], bins=np.linspace(20, 70, num=11))
age_data.head(10)