In [1]:
import warnings
warnings.filterwarnings('ignore')
import glob
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
import random
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('seaborn')

In [2]:
# 데이터 불러오기
train = pd.read_csv('./train.csv')
train = train.drop(['index'], axis=1)

test = pd.read_csv('./test.csv')
test = test.drop(['index'], axis=1)

submit = pd.read_csv('./sample_submission.csv')

In [3]:
# 결측치 확인
print('훈련데이터 결측치 합 \n', train.isnull().sum())
print('테스트데이터 결측치 합 \n', test.isnull().sum())

훈련데이터 결측치 합 
 gender              0
car                 0
reality             0
child_num           0
income_total        0
income_type         0
edu_type            0
family_type         0
house_type          0
DAYS_BIRTH          0
DAYS_EMPLOYED       0
FLAG_MOBIL          0
work_phone          0
phone               0
email               0
occyp_type       8171
family_size         0
begin_month         0
credit              0
dtype: int64
테스트데이터 결측치 합 
 gender              0
car                 0
reality             0
child_num           0
income_total        0
income_type         0
edu_type            0
family_type         0
house_type          0
DAYS_BIRTH          0
DAYS_EMPLOYED       0
FLAG_MOBIL          0
work_phone          0
phone               0
email               0
occyp_type       3152
family_size         0
begin_month         0
dtype: int64


In [4]:
def days_to_age(x):
    return (x*-1)/365

# train
train['DAYS_BIRTH'] = train['DAYS_BIRTH'].apply(days_to_age)
train['DAYS_BIRTH'] = train['DAYS_BIRTH'].astype({'DAYS_BIRTH':'int'})
display(train['DAYS_BIRTH'])

# test
test['DAYS_BIRTH'] = test['DAYS_BIRTH'].apply(days_to_age)
test['DAYS_BIRTH'] = test['DAYS_BIRTH'].astype({'DAYS_BIRTH':'int'})
display(test['DAYS_BIRTH'])

0        38
1        31
2        52
3        41
4        41
         ..
26452    33
26453    41
26454    27
26455    27
26456    53
Name: DAYS_BIRTH, Length: 26457, dtype: int32

0       60
1       51
2       43
3       52
4       48
        ..
9995    50
9996    29
9997    57
9998    45
9999    25
Name: DAYS_BIRTH, Length: 10000, dtype: int32

In [5]:
def days_to_year(x):
    if x==365243 :
        return 0
    return (x*-1)/365+1

# train
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].apply(days_to_year)
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].astype({'DAYS_EMPLOYED':'int'})
display(train['DAYS_EMPLOYED'])

# test
test['DAYS_EMPLOYED'] = test['DAYS_EMPLOYED'].apply(days_to_year)
test['DAYS_EMPLOYED'] = test['DAYS_EMPLOYED'].astype({'DAYS_EMPLOYED':'int'})
display(test['DAYS_EMPLOYED'])

0        13
1         5
2        13
3         6
4         6
         ..
26452     6
26453     7
26454     6
26455     1
26456     3
Name: DAYS_EMPLOYED, Length: 26457, dtype: int32

0        0
1       24
2        1
3        7
4       26
        ..
9995    15
9996     4
9997    39
9998     3
9999     1
Name: DAYS_EMPLOYED, Length: 10000, dtype: int32

In [6]:
def minus(x):
    return x * -1

# train
train['begin_month'] = train['begin_month'].apply(minus)
display(train['begin_month'])

# test
test['begin_month'] = test['begin_month'].apply(minus)
display(test['begin_month'])

0         6.0
1         5.0
2        22.0
3        37.0
4        26.0
         ... 
26452     2.0
26453    47.0
26454    25.0
26455    59.0
26456     9.0
Name: begin_month, Length: 26457, dtype: float64

0       60.0
1       36.0
2       40.0
3       41.0
4        8.0
        ... 
9995    19.0
9996    34.0
9997    55.0
9998    33.0
9999    11.0
Name: begin_month, Length: 10000, dtype: float64

In [7]:
# income_type이 pension이면서 occyp_type이 Null인 
#train.drop('credit', axis = 1, inplace = True)
#test.drop('credit', axis = 1, inplace = True)

cond = (train['income_type'] == 'Pensioner')
train['occyp_type'] = train['occyp_type'].fillna(cond.map({True:'Retired', False: 'NaN'}))
train[(train['occyp_type'] == 'Retired')].count()

gender           4440
car              4440
reality          4440
child_num        4440
income_total     4440
income_type      4440
edu_type         4440
family_type      4440
house_type       4440
DAYS_BIRTH       4440
DAYS_EMPLOYED    4440
FLAG_MOBIL       4440
work_phone       4440
phone            4440
email            4440
occyp_type       4440
family_size      4440
begin_month      4440
credit           4440
dtype: int64

In [8]:
# occyp_type이 Null인 것과 아닌것을 분류하는 작업

test = train[train['occyp_type']=='NaN']
train = train.drop(index = test.index)

test = test.drop('credit', axis = 1)
test = test.drop('occyp_type', axis = 1)
train = train.drop('credit', axis = 1)

train = train.reset_index()
test = test.reset_index()
display(train)
display(test)

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month
0,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,31,5,1,0,0,1,Laborers,3.0,5.0
1,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,52,13,1,0,1,0,Managers,2.0,22.0
2,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,41,6,1,0,1,0,Sales staff,2.0,37.0
3,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,41,6,1,0,0,0,Managers,2.0,26.0
4,5,F,N,Y,2,270000.0,Working,Secondary / secondary special,Married,House / apartment,36,14,1,0,0,1,High skill tech staff,4.0,18.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22721,26451,F,N,Y,0,202500.0,Working,Higher education,Married,House / apartment,35,3,1,1,1,0,Accountants,2.0,44.0
22722,26452,F,N,N,2,225000.0,State servant,Secondary / secondary special,Married,House / apartment,33,6,1,0,0,0,Core staff,4.0,2.0
22723,26454,F,Y,N,0,292500.0,Working,Secondary / secondary special,Civil marriage,With parents,27,6,1,0,0,0,Core staff,2.0,25.0
22724,26455,M,N,Y,0,171000.0,Working,Incomplete higher,Single / not married,House / apartment,27,1,1,0,0,0,Laborers,1.0,59.0


Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,38,13,1,0,0,0,2.0,6.0
1,8,M,Y,Y,1,180000.0,Commercial associate,Higher education,Married,House / apartment,41,5,1,0,0,1,3.0,38.0
2,19,F,N,Y,0,180000.0,Working,Secondary / secondary special,Married,House / apartment,37,17,1,0,0,0,2.0,7.0
3,20,M,N,N,0,180000.0,Working,Secondary / secondary special,Married,House / apartment,54,5,1,0,1,0,2.0,35.0
4,23,M,Y,N,0,225000.0,Working,Secondary / secondary special,Married,Municipal apartment,39,2,1,1,1,1,2.0,18.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3726,26433,F,Y,N,0,121500.0,State servant,Higher education,Married,House / apartment,54,14,1,0,1,0,2.0,31.0
3727,26435,F,N,N,1,126000.0,Working,Secondary / secondary special,Widow,House / apartment,53,2,1,0,0,0,2.0,40.0
3728,26437,M,N,Y,0,180000.0,Working,Secondary / secondary special,Married,House / apartment,35,10,1,0,0,0,2.0,13.0
3729,26449,F,N,N,0,90000.0,Working,Secondary / secondary special,Married,House / apartment,28,7,1,1,1,0,2.0,2.0


In [9]:
# object 객체를 가지는 변수(열) 확인
object_col = []
for col in train.columns:
    if train[col].dtype == 'object' and col!='occyp_type': # 변수가 18개인 occyp_type만 제외
        object_col.append(col)
display(object_col)

['gender',
 'car',
 'reality',
 'income_type',
 'edu_type',
 'family_type',
 'house_type']

In [10]:
# occyp_type을 제외한 모든 object객체 열을 onehot encoding

enc = OneHotEncoder()
enc.fit(train.loc[:,object_col])

train_onehot_df = pd.DataFrame(enc.transform(train.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
train.drop(object_col, axis=1, inplace=True)
train = pd.concat([train, train_onehot_df], axis=1)

In [11]:
# occyp_type을 label encoding

label_encoder=preprocessing.LabelEncoder()
train['occyp_type']=label_encoder.fit_transform(train['occyp_type'])

In [12]:
############################ test ###############################

In [13]:
# occyp_type을 제외한 모든 object객체 열을  onehot encoding
enc = OneHotEncoder()
enc.fit(test.loc[:,object_col])

test_onehot_df = pd.DataFrame(enc.transform(test.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
test.drop(object_col, axis=1, inplace=True)
test = pd.concat([test, test_onehot_df], axis=1)

In [14]:
display(train)
display(test)

Unnamed: 0,index,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,...,family_type_Married,family_type_Separated,family_type_Single / not married,family_type_Widow,house_type_Co-op apartment,house_type_House / apartment,house_type_Municipal apartment,house_type_Office apartment,house_type_Rented apartment,house_type_With parents
0,1,1,247500.0,31,5,1,0,0,1,8,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2,0,450000.0,52,13,1,0,1,0,10,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,3,0,202500.0,41,6,1,0,1,0,15,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,4,0,157500.0,41,6,1,0,0,0,10,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,5,2,270000.0,36,14,1,0,0,1,6,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22721,26451,0,202500.0,35,3,1,1,1,0,0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
22722,26452,2,225000.0,33,6,1,0,0,0,3,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
22723,26454,0,292500.0,27,6,1,0,0,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
22724,26455,0,171000.0,27,1,1,0,0,0,8,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


Unnamed: 0,index,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,...,family_type_Married,family_type_Separated,family_type_Single / not married,family_type_Widow,house_type_Co-op apartment,house_type_House / apartment,house_type_Municipal apartment,house_type_Office apartment,house_type_Rented apartment,house_type_With parents
0,0,0,202500.0,38,13,1,0,0,0,2.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,8,1,180000.0,41,5,1,0,0,1,3.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,19,0,180000.0,37,17,1,0,0,0,2.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,20,0,180000.0,54,5,1,0,1,0,2.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,23,0,225000.0,39,2,1,1,1,1,2.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3726,26433,0,121500.0,54,14,1,0,1,0,2.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3727,26435,1,126000.0,53,2,1,0,0,0,2.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3728,26437,0,180000.0,35,10,1,0,0,0,2.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3729,26449,0,90000.0,28,7,1,1,1,0,2.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [15]:
from sklearn.preprocessing import MinMaxScaler

MinMaxScaler_data = train.copy()

minMaxScaler = MinMaxScaler()
print(minMaxScaler.fit(MinMaxScaler_data))
train_data_minMaxScaled = minMaxScaler.transform(MinMaxScaler_data)

# MinMaxScaler_data - MinMaxScaler적용한 데이터 프레임
index = 0
for col in MinMaxScaler_data.columns :
    if col != 'occyp_type':
        MinMaxScaler_data[col] = train_data_minMaxScaled[:,index]
        index = index + 1
display(MinMaxScaler_data)

MinMaxScaler()


Unnamed: 0,index,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,...,family_type_Married,family_type_Separated,family_type_Single / not married,family_type_Widow,house_type_Co-op apartment,house_type_House / apartment,house_type_Municipal apartment,house_type_Office apartment,house_type_Rented apartment,house_type_With parents
0,0.000000,0.052632,0.142442,0.212766,0.113636,0.0,0.0,0.0,1.0,8,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.000038,0.000000,0.273256,0.659574,0.295455,0.0,0.0,1.0,0.0,10,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.000076,0.000000,0.113372,0.425532,0.136364,0.0,0.0,1.0,0.0,15,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.000113,0.000000,0.084302,0.425532,0.136364,0.0,0.0,0.0,0.0,10,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.000151,0.105263,0.156977,0.319149,0.318182,0.0,0.0,0.0,1.0,6,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22721,0.999811,0.000000,0.113372,0.297872,0.068182,0.0,1.0,1.0,0.0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
22722,0.999849,0.105263,0.127907,0.255319,0.136364,0.0,0.0,0.0,0.0,3,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
22723,0.999924,0.000000,0.171512,0.127660,0.136364,0.0,0.0,0.0,0.0,3,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22724,0.999962,0.000000,0.093023,0.127660,0.022727,0.0,0.0,0.0,0.0,8,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [16]:
from sklearn.model_selection import train_test_split

# 설명변수 X, 예측변수 y 할당
X = train[train.columns.difference(['occyp_type'])]
y = train['occyp_type']
print(X)
print(y)
# 설명변수 정규화
from sklearn import preprocessing
X = preprocessing.MinMaxScaler().fit(X).transform(X)

x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=10)

       DAYS_BIRTH  DAYS_EMPLOYED  FLAG_MOBIL  begin_month  car_N  car_Y  \
0              31              5           1          5.0    1.0    0.0   
1              52             13           1         22.0    0.0    1.0   
2              41              6           1         37.0    1.0    0.0   
3              41              6           1         26.0    0.0    1.0   
4              36             14           1         18.0    1.0    0.0   
...           ...            ...         ...          ...    ...    ...   
22721          35              3           1         44.0    1.0    0.0   
22722          33              6           1          2.0    1.0    0.0   
22723          27              6           1         25.0    0.0    1.0   
22724          27              1           1         59.0    1.0    0.0   
22725          53              3           1          9.0    1.0    0.0   

       child_num  edu_type_Academic degree  edu_type_Higher education  \
0              1          

In [17]:
from sklearn import svm

# 모형 객체 생성 (kernel='rbf' 적용)
svm_model = svm.SVC(kernel='rbf')

# train data를 가지고 모형 학습
svm_model.fit(x_train, y_train)   

# test data를 가지고 y_hat을 예측 (분류) 
y_hat = svm_model.predict(x_valid)

In [35]:
#print(y_hat[0:10])
#print(y_valid[0:10])



count = 0
for i in range(len(y_hat)):
    if y_hat[i] == list(y_valid)[i]:
        count += 1
        
print(count)
len(y_hat)

round(count/len(y_hat),2) 

3578


0.52

In [19]:
# 모형 성능 평가 - Confusion Matrix 계산
from sklearn import metrics 
svm_matrix = metrics.confusion_matrix(y_valid, y_hat)  
print(svm_matrix)
print('\n')

# 모형 성능 평가 - 평가지표 계산
svm_report = metrics.classification_report(y_valid, y_hat)            
print(svm_report)

[[  56    1    0   83    0    0    0    0   23    0   40    6    0    0
     1   63    0    0    0]
 [   3   14    0    7    0    0    0    0   41    0    2    2    0    0
     0   51    0    0    0]
 [   0    1    0   10    0    0    0    0   40    0    2    1    0    0
     0   70    0    0    0]
 [  10    1    0  402   15    0    0    0  146    0   48   16    0    0
     1  159    0    0    0]
 [   0    0    0   12   91    0    0    0  292    0   69    2    0    0
     1    9    0    0    0]
 [   1    0    0    5    0    3    0    0    4    0    1    0    0    0
     0    4    0    0    0]
 [  11    0    0   78    4    0   22    0   92    0   33    9    0    0
     0   91    0    0    0]
 [   0    0    0    5    0    0    0    0    1    0    4    0    0    0
     0    2    0    0    0]
 [   9    1    0   71   46    0    0    0  882    0   81    5    0    0
     1  277    0    0    0]
 [   0    0    0    0    2    0    0    0   36    0    0    0    0    0
     0    3    0    0    0]
