In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import warnings
import matplotlib.pyplot as plt
%matplotlib inline

warnings.filterwarnings("ignore")

user_spec = pd.read_csv("user_1_dsrprepay_1.csv")

### 5. 변수선택법
### 5-1. 모듈 돌리기 전 음수 value 갖는 컬럼들 전처리

In [2]:
user_spec['work_year'] = user_spec['work_year'].round()
user_spec['work_year'] = user_spec[user_spec['work_year']<0]=0
user_spec['existing_loan_cnt'] = user_spec['existing_loan_cnt'].round()
user_spec['existing_loan_cnt'] = user_spec[user_spec['existing_loan_cnt']<0]=0
user_spec['existing_loan_amt'] = user_spec[user_spec['existing_loan_amt']<0]=0

### 5-2. 변수선택법 모듈 돌리기 위하여 다시 한 번 'is_applied' 컬럼과 merge
> 변수선택법 모듈 (카이제곱통계량 활용) 사용시 타겟값을 기준으로 분포 간 상관관계를 파악해 변수의 중요도를 매기기 때문에, is_applied 컬럼이 필요하다.

In [3]:
loan_applied = pd.read_csv("loan_result.csv", usecols=['application_id', 'is_applied','loanapply_insert_time'])

In [4]:
loan_applied.head()

Unnamed: 0,application_id,loanapply_insert_time,is_applied
0,1748340,2022-06-07 13:05:41,
1,1748340,2022-06-07 13:05:41,
2,1748340,2022-06-07 13:05:41,
3,1748340,2022-06-07 13:05:41,
4,1748340,2022-06-07 13:05:41,


In [5]:
user_spec.drop(['insert_time'],axis=1, inplace=True)
columns_int = ['user_id']
columns_float = user_spec.columns[:-1]

for i in columns_int:
    user_spec[i]= user_spec[i].astype(np.int32)
for i in columns_float:
    user_spec[i]= user_spec[i].astype(np.float32)

In [6]:
user_spec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1439446 entries, 0 to 1439445
Data columns (total 32 columns):
 #   Column                               Non-Null Count    Dtype  
---  ------                               --------------    -----  
 0   age_cat                              1439446 non-null  float32
 1   application_id                       1439446 non-null  float32
 2   desired_amount                       1439446 non-null  float32
 3   employment_type_기타                   1439446 non-null  float32
 4   employment_type_일용직                  1439446 non-null  float32
 5   employment_type_정규직                  1439446 non-null  float32
 6   existing_loan_amt                    1439446 non-null  float32
 7   existing_loan_cnt                    1439446 non-null  float32
 8   gender                               1439446 non-null  float32
 9   houseown_type_배우자                    1439446 non-null  float32
 10  houseown_type_자가                     1439446 non-null  float32
 11

### 5-3. 6월을 기준으로 train / test 컬럼을 나누고 모듈을 돌려 컬럼을 선택한다.

In [7]:
user_applied = pd.merge(user_spec, loan_applied, how='inner', on='application_id')

In [8]:
user_applied['loanapply_insert_time']=pd.to_datetime(user_applied['loanapply_insert_time'])
user_applied['month'] = user_applied['loanapply_insert_time'].dt.month
train = user_applied[user_applied['month']<6]
test = user_applied[user_applied['month']>=6]

In [9]:
train.drop(['loanapply_insert_time'],axis=1, inplace=True)
test.drop(['loanapply_insert_time'],axis=1, inplace=True)

In [10]:
from sklearn.model_selection import train_test_split
X = train[train.columns.difference(['is_applied', 'application_id'])]
y = train[['is_applied']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
print(user_applied.shape)

(7642656, 32)
(3275424, 32)
(7642656, 1)
(3275424, 1)
(14301807, 35)


In [11]:
# target(Price)와 가장 correlated 된 features 를 k개 고르기.
## f_regresison, SelectKBest 불러오기.
from sklearn.feature_selection import chi2, SelectKBest
## selctor 정의하기.
selector = SelectKBest(chi2, k=20)
## 학습데이터에 fit_transform 
X_train_selected = selector.fit_transform(X_train, y_train)
## 테스트 데이터는 transform
X_test_selected = selector.transform(X_test)
X_train_selected.shape, X_test_selected.shape

((7642656, 20), (3275424, 20))

In [12]:
all_names = X_train.columns
## selector.get_support()
selected_mask = selector.get_support()
## 선택된 특성(변수)들
selected_names = all_names[selected_mask]
## 선택되지 않은 특성(변수)들
unselected_names = all_names[~selected_mask]
print('Selected names: ', selected_names)
print('Unselected names: ', unselected_names)

Selected names:  Index(['DSRCalc_freq', 'PrepayCalc_freq', 'age_cat', 'desired_amount',
       'employment_type_기타', 'employment_type_일용직', 'employment_type_정규직',
       'houseown_type_자가', 'income_type_EARNEDINCOME2',
       'income_type_FREELANCER', 'income_type_OTHERINCOME',
       'income_type_PRIVATEBUSINESS', 'purpose_대환대출', 'purpose_생활비',
       'purpose_전월세보증금', 'purpose_주택구입', 'purpose_투자', 'score_type', 'user_id',
       'yearly_income'],
      dtype='object')
Unselected names:  Index(['existing_loan_amt', 'existing_loan_cnt', 'gender', 'houseown_type_배우자',
       'houseown_type_전월세', 'income_type_PRACTITIONER', 'month',
       'personal_rehabilitation_complete_yn', 'personal_rehabilitation_yn',
       'purpose_사업자금', 'purpose_자동차구입', 'work_year'],
      dtype='object')


In [14]:
train.drop(['houseown_type_전월세','houseown_type_배우자','houseown_type_자가', 'personal_rehabilitation_complete_yn', 'personal_rehabilitation_complete_yn', 'month','gender'],axis=1, inplace=True)

In [15]:
test.drop(['houseown_type_전월세','houseown_type_배우자','houseown_type_자가', 'personal_rehabilitation_complete_yn', 'personal_rehabilitation_complete_yn', 'month','gender'],axis=1, inplace=True)

In [18]:
condition = (train.purpose_사업자금 == 1) | (train.purpose_자동차구입== 1)|(train.purpose_전월세보증금 == 1)|(train.purpose_주택구입 == 1)|(train.purpose_투자 == 1)
train.loc[condition, 'purpose_ETC']=1
train.drop(['purpose_사업자금', 'purpose_자동차구입', 'purpose_전월세보증금', 'purpose_주택구입', 'purpose_투자'],axis=1, inplace=True)

AttributeError: 'DataFrame' object has no attribute 'purpose_사업자금'

In [20]:
condition = (test.purpose_사업자금 == 1) | (test.purpose_자동차구입== 1)|(test.purpose_전월세보증금 == 1)|(test.purpose_주택구입 == 1)|(test.purpose_투자 == 1)
test.loc[condition, 'purpose_ETC']=1
test.drop(['purpose_사업자금', 'purpose_자동차구입', 'purpose_전월세보증금', 'purpose_주택구입', 'purpose_투자'],axis=1, inplace=True)

In [21]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10918080 entries, 1726 to 14301806
Data columns (total 24 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   age_cat                      float32
 1   application_id               float32
 2   desired_amount               float32
 3   employment_type_기타           float32
 4   employment_type_일용직          float32
 5   employment_type_정규직          float32
 6   existing_loan_amt            float32
 7   existing_loan_cnt            float32
 8   income_type_EARNEDINCOME2    float32
 9   income_type_FREELANCER       float32
 10  income_type_OTHERINCOME      float32
 11  income_type_PRACTITIONER     float32
 12  income_type_PRIVATEBUSINESS  float32
 13  personal_rehabilitation_yn   float32
 14  purpose_대환대출                 float32
 15  purpose_생활비                  float32
 16  score_type                   float32
 17  work_year                    float32
 18  yearly_income                float32


In [23]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3383727 entries, 0 to 14300172
Data columns (total 24 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   age_cat                      float32
 1   application_id               float32
 2   desired_amount               float32
 3   employment_type_기타           float32
 4   employment_type_일용직          float32
 5   employment_type_정규직          float32
 6   existing_loan_amt            float32
 7   existing_loan_cnt            float32
 8   income_type_EARNEDINCOME2    float32
 9   income_type_FREELANCER       float32
 10  income_type_OTHERINCOME      float32
 11  income_type_PRACTITIONER     float32
 12  income_type_PRIVATEBUSINESS  float32
 13  personal_rehabilitation_yn   float32
 14  purpose_대환대출                 float32
 15  purpose_생활비                  float32
 16  score_type                   float32
 17  work_year                    float32
 18  yearly_income                float32
 19 

In [25]:
train.to_csv("train_수정_2.csv",index=False)
test.to_csv("train_수정_2.csv",index=False)