In [3]:
# data import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display   

%matplotlib inline
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

adult = pd.read_csv(
    URL,
    
    # specify the file encoding
    encoding="utf-8",
    
    # specify the separator in the data
    sep=",",      # comma separated values
    
    # ignore spaces after the separator
    skipinitialspace=True,
    index_col=None,
    
    # use manual headers
    header=None,
    names=[
        "age", "workclass", "fnlwgt", "education", 
        "education-num", "marital-status", "occupation",
        "relationship", "race", "sex", "capital-gain", 
        "capital-loss", "hours-per-week", "native-country",
        "wage"
    ]
)

In [4]:
# categorical -> one hot encoding
# convert each categorical feature using one-hot encoding

obj_df = adult.select_dtypes(include=["object"]).drop("wage", axis=1)
int_df = adult.select_dtypes(include=["int64"])

onehot_obj = pd.get_dummies(obj_df, drop_first=True)# dummy trap(one variable can be induced)
design_adult = pd.concat([onehot_obj,int_df], axis=1)

display(design_adult)


Unnamed: 0,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_11th,education_12th,...,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,39,77516,13,2174,0,40
1,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,50,83311,13,0,0,13
2,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,38,215646,9,0,0,40
3,0,0,0,1,0,0,0,0,1,0,...,0,1,0,0,53,234721,7,0,0,40
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,28,338409,13,0,0,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,27,257302,12,0,0,38
32557,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,40,154374,9,0,0,40
32558,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,58,151910,9,0,0,40
32559,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,22,201490,9,0,0,20


In [5]:
# 속성(변수) 선택
X=design_adult
y=adult['wage']
y

0        <=50K
1        <=50K
2        <=50K
3        <=50K
4        <=50K
         ...  
32556    <=50K
32557     >50K
32558    <=50K
32559    <=50K
32560     >50K
Name: wage, Length: 32561, dtype: object

In [6]:
# # 설명 변수 데이터를 정규화(normalization)
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(X).transform(X)

In [7]:
# train data 와 test data로 구분(7:3 비율)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10) 

print('train data 개수: ', X_train.shape)
print('test data 개수: ', X_test.shape)


train data 개수:  (22792, 100)
test data 개수:  (9769, 100)


In [8]:
# sklearn 라이브러리에서 SVM 분류 모형 가져오기
from sklearn import svm

# 모형 객체 생성 (kernel='rbf' 적용)
svm_model = svm.SVC(kernel='rbf')

# train data를 가지고 모형 학습
svm_model.fit(X_train, y_train)   

# test data를 가지고 y_hat을 예측 (분류) 
y_hat = svm_model.predict(X_test)

print(y_hat[0:10])
print(y_test.values[0:10])
print('\n')

['<=50K' '<=50K' '>50K' '>50K' '>50K' '<=50K' '<=50K' '>50K' '<=50K'
 '<=50K']
['<=50K' '<=50K' '>50K' '<=50K' '>50K' '<=50K' '<=50K' '>50K' '<=50K'
 '<=50K']




In [9]:
# 모형 성능 평가 - Confusion Matrix 계산
from sklearn import metrics 
svm_matrix = metrics.confusion_matrix(y_test, y_hat)  
print(svm_matrix)
print('\n')


print('train_score: ', svm_model.score(X_train, y_train))
print('Test_score: ', svm_model.score(X_test, y_test))

# 모형 성능 평가 - 평가지표 계산
svm_report = metrics.classification_report(y_test, y_hat)            
print(svm_report)

[[6985  438]
 [1040 1306]]


train_score:  0.8645577395577395
Test_score:  0.8487050875217524
              precision    recall  f1-score   support

       <=50K       0.87      0.94      0.90      7423
        >50K       0.75      0.56      0.64      2346

    accuracy                           0.85      9769
   macro avg       0.81      0.75      0.77      9769
weighted avg       0.84      0.85      0.84      9769

