In [None]:
import warnings
warnings.filterwarnings('ignore')
import glob
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
import random
import seaborn as sns
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and later
# releases dropped the old names; use whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [None]:
# Load the data: read each CSV and discard its 'index' column in one step.
train = pd.read_csv('./train.csv').drop(columns=['index'])
test = pd.read_csv('./test.csv').drop(columns=['index'])

# The sample submission file is loaded unchanged.
submit = pd.read_csv('./sample_submission.csv')

In [None]:
# Check missing values: per-column null counts for the train and test frames.
train_null_counts = train.isnull().sum()
test_null_counts = test.isnull().sum()
print('훈련데이터 결측치 합 \n', train_null_counts)
print('테스트데이터 결측치 합 \n', test_null_counts)

In [None]:
def days_to_age(x):
    """Convert a day offset into years by flipping its sign and dividing by 365.

    Args:
        x: Number of days. Presumably a negative count of days before the
            reference date -- TODO confirm against the data dictionary.

    Returns:
        ``-x / 365`` as a float; no rounding is applied here.
    """
    negated_days = -x
    return negated_days / 365

# Replace DAYS_BIRTH with an integer number of years (the cast to int discards
# the fractional part). The train frame is processed first, then the test frame.
for frame in (train, test):
    frame['DAYS_BIRTH'] = frame['DAYS_BIRTH'].apply(days_to_age).astype('int')
    display(frame['DAYS_BIRTH'])

In [None]:
def days_to_year(x):
    """Convert a day offset into a one-based year count.

    Args:
        x: Number of days. The value 365243 is handled as a special marker
            (presumably "not employed" -- TODO confirm against the data
            dictionary).

    Returns:
        0 when ``x`` equals 365243, otherwise ``-x / 365 + 1`` as a float.
    """
    if x != 365243:
        return -x / 365 + 1
    return 0

# Replace DAYS_EMPLOYED with an integer year count (the cast to int discards
# the fractional part). The train frame is processed first, then the test frame.
for frame in (train, test):
    frame['DAYS_EMPLOYED'] = frame['DAYS_EMPLOYED'].apply(days_to_year).astype('int')
    display(frame['DAYS_EMPLOYED'])

In [None]:
def minus(x):
    """Return ``x`` with its sign flipped."""
    return -x

# Flip the sign of begin_month in both frames (train first, then test) and
# show each converted column.
for frame in (train, test):
    frame['begin_month'] = frame['begin_month'].apply(minus)
    display(frame['begin_month'])

In [None]:
# Fill missing occyp_type values: rows whose income_type is 'Pensioner' get
# 'Retired'; every other missing row gets the placeholder string 'NaN'
# (an ordinary string, not a null value).
cond = train['income_type'].eq('Pensioner')
fill_labels = cond.map({True: 'Retired', False: 'NaN'})
train['occyp_type'] = train['occyp_type'].fillna(fill_labels)

# Per-column non-null counts of the rows labelled 'Retired'.
train.loc[train['occyp_type'] == 'Retired'].count()

In [None]:
# Separate the rows whose occupation is unknown from those where it is known.
# Rows carrying the 'NaN' placeholder string become the prediction set (`test`);
# the remaining rows stay in `train`.
# NOTE: this rebinds `test`, replacing the frame that was loaded from test.csv.
test = train[train['occyp_type'] == 'NaN']
train = train.drop(index=test.index)

# 'credit' is removed from both frames; 'occyp_type' holds only the placeholder
# in the prediction set, so it is removed there as well.
test = test.drop(columns=['credit', 'occyp_type'])
train = train.drop(columns=['credit'])

# Keep the original row labels of the prediction set so that results can still
# be mapped back to their source rows after the index is renumbered.
test_original_index = test.index

# drop=True: a plain reset_index() would add the old row labels as a new
# 'index' column, and that column would then be picked up as a model feature
# by any later code that selects "every column except the target".
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
display(train)
display(test)

In [None]:
# Collect the names of the object-dtype columns. 'occyp_type' is the only one
# left out (the original note says it has 18 distinct values).
object_col = [
    col for col in train.columns
    if col != 'occyp_type' and train[col].dtype == 'object'
]
display(object_col)

In [None]:
# One-hot encode every object column except occyp_type.

enc = OneHotEncoder()
enc.fit(train.loc[:, object_col])

# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases; use whichever one this installation offers.
if hasattr(enc, 'get_feature_names_out'):
    train_onehot_columns = enc.get_feature_names_out(object_col)
else:
    train_onehot_columns = enc.get_feature_names(object_col)

# Reuse train's own index so the column-wise concat lines rows up correctly
# even when that index is not a plain 0..n-1 range.
train_onehot_df = pd.DataFrame(enc.transform(train.loc[:, object_col]).toarray(),
                               columns=train_onehot_columns,
                               index=train.index)

# Swap the raw object columns for their one-hot columns.
train = pd.concat([train.drop(columns=object_col), train_onehot_df], axis=1)

In [None]:
# Label-encode occyp_type: fit the encoder on the column, then replace the
# column with the encoded labels. The fitted encoder stays in `label_encoder`.
label_encoder = preprocessing.LabelEncoder().fit(train['occyp_type'])
train['occyp_type'] = label_encoder.transform(train['occyp_type'])

In [None]:
############################ test ###############################

In [None]:
# One-hot encode every object column except occyp_type.
#
# The encoder is pinned to the category lists learned from the training rows
# (`enc` still holds the train-fitted encoder at this point). Fitting a fresh
# encoder on `test` alone would create columns only for the categories that
# happen to occur in `test`, so the two frames could end up with different
# feature columns.
train_categories = enc.categories_

# handle_unknown='ignore': a value that never occurred in train is encoded as
# all zeros for that feature instead of raising an error.
enc = OneHotEncoder(categories=train_categories, handle_unknown='ignore')
enc.fit(test.loc[:, object_col])

# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases; use whichever one this installation offers.
if hasattr(enc, 'get_feature_names_out'):
    test_onehot_columns = enc.get_feature_names_out(object_col)
else:
    test_onehot_columns = enc.get_feature_names(object_col)

# Reuse test's own index so the column-wise concat lines rows up correctly
# even when that index is not a plain 0..n-1 range.
test_onehot_df = pd.DataFrame(enc.transform(test.loc[:, object_col]).toarray(),
                              columns=test_onehot_columns,
                              index=test.index)

# Swap the raw object columns for their one-hot columns.
test = pd.concat([test.drop(columns=object_col), test_onehot_df], axis=1)

In [None]:
# Show both encoded frames, train first and then test.
display(train, test)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Work on a copy so `train` itself stays unscaled.
MinMaxScaler_data = train.copy()

minMaxScaler = MinMaxScaler()
print(minMaxScaler.fit(MinMaxScaler_data))
train_data_minMaxScaled = minMaxScaler.transform(MinMaxScaler_data)

# MinMaxScaler_data: the frame with min-max scaling applied to every column
# except occyp_type, which keeps its original values.
# The scaled array has one column per frame column, occyp_type included, so the
# array position must advance for every column. A counter that only moves on
# non-skipped columns would shift every column after occyp_type by one.
for position, col in enumerate(MinMaxScaler_data.columns):
    if col != 'occyp_type':
        MinMaxScaler_data[col] = train_data_minMaxScaled[:, position]
display(MinMaxScaler_data)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Explanatory variables X (every column except the target) and target y.
X = train[train.columns.difference(['occyp_type'])]
y = train['occyp_type']
print(X)
print(y)

# Split before scaling: hold out 30% of the rows for validation. The split
# depends only on the row count and random_state, not on the feature values.
x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=10)

# Normalise the explanatory variables. The scaler is fitted on the training
# split only, so the validation rows do not influence the min/max statistics;
# both splits are then transformed with those same statistics.
feature_scaler = preprocessing.MinMaxScaler().fit(x_train)
x_train = feature_scaler.transform(x_train)
x_valid = feature_scaler.transform(x_valid)

# X is left as a scaled array, as before, now using the train-only statistics.
X = feature_scaler.transform(X)

In [None]:
from sklearn import svm

# Build the classifier with kernel='rbf' and train it on the training split
# (fit returns the estimator itself, so construction and fitting are chained).
svm_model = svm.SVC(kernel='rbf').fit(x_train, y_train)

# Predicted labels for the validation split.
y_hat = svm_model.predict(x_valid)

In [None]:
# Number of validation rows whose predicted label equals the true label.
# The two arrays are compared position by position in one vectorised step, so
# y_valid is not converted to a list on every loop iteration.
count = int(np.sum(y_hat == np.asarray(y_valid)))

print(count)

# Validation accuracy rounded to two decimals (shown as the cell output).
round(count / len(y_hat), 2)

In [None]:
# Model evaluation on the validation split: confusion matrix first, then the
# classification report with the evaluation metrics.
from sklearn import metrics

svm_matrix = metrics.confusion_matrix(y_true=y_valid, y_pred=y_hat)
svm_report = metrics.classification_report(y_true=y_valid, y_pred=y_hat)

# Print the matrix, a blank separator, and the report, in that order.
for section in (svm_matrix, '\n', svm_report):
    print(section)