## import

In [1]:
import pandas as pd
import random
import os
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn import tree # 의사결정트리
from sklearn import preprocessing

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

#sklearn 모델의 동일한 결과 출력을 위해 선언합니다.
np.random.seed(42)

## load data

In [2]:
train = pd.read_csv('./train.csv') #데이터 불러오기
y_label = train['class']

train_df = pd.DataFrame(train)
train_df['class'] = y_label

train_df.head(4)

Unnamed: 0,id,father,mother,gender,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,...,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,class
0,TRAIN_000,0,0,0,2,G G,A G,A A,G A,C A,...,A A,G G,A A,G G,A G,A A,A A,A A,A A,B
1,TRAIN_001,0,0,0,2,A G,A G,C A,A A,A A,...,A A,G A,A A,A G,A A,G A,G G,A A,A A,C
2,TRAIN_002,0,0,0,2,G G,G G,A A,G A,C C,...,A A,G A,G A,A G,A A,A A,A A,A A,A A,B
3,TRAIN_003,0,0,0,1,A A,G G,A A,G A,A A,...,G G,A A,G G,A G,G G,G G,G G,A A,G G,A


In [3]:
# 필요없는 식별자 제거

train_df.drop(['id','father','mother','gender'], axis=1, inplace = True)

In [4]:
print(train_df['class'].value_counts())

B    114
C     79
A     69
Name: class, dtype: int64


### 훈련용에서 훈련 / 테스트 나누기

In [5]:
# 훈련용 데이터와 테스트 데이터 8:2 비율로 나누기
X_train, X_valid, y_train, y_valid = train_test_split(train_df, y_label, test_size = 0.2, random_state = 42)

print(X_train.shape, X_valid.shape)

(209, 17) (53, 17)


## 라벨 - 인코딩

In [6]:
X_train = train_df[['trait', 'SNP_01', 'SNP_02', 'SNP_03', 'SNP_04', 'SNP_05', 'SNP_06', 'SNP_07', 'SNP_08', 'SNP_09',
                    'SNP_10', 'SNP_11', 'SNP_12', 'SNP_13', 'SNP_14', 'SNP_15',]]
y_train = train_df[['class']]

In [7]:
X_valid = train_df[['trait', 'SNP_01', 'SNP_02', 'SNP_03', 'SNP_04', 'SNP_05', 'SNP_06', 'SNP_07', 'SNP_08', 'SNP_09',
                    'SNP_10', 'SNP_11', 'SNP_12', 'SNP_13', 'SNP_14', 'SNP_15',]]
y_valid = train_df[['class']]

In [8]:
class_le = preprocessing.LabelEncoder()
snp_le = preprocessing.LabelEncoder()
snp_col = [f'SNP_{str(x).zfill(2)}' for x in range(1,16)]

In [9]:
snp_data = []
for col in snp_col:
    snp_data += list(X_valid[col].values)

In [10]:
y_train = class_le.fit_transform(y_train)
snp_le.fit(snp_data)

LabelEncoder()

In [11]:
for col in X_train.columns:
    if col in snp_col:
        X_train[col] = snp_le.transform(X_train[col])
        X_valid[col] = snp_le.transform(X_valid[col])

## Model Fit

In [12]:
#학습데이터로 모델 학습
model = tree.DecisionTreeClassifier(random_state = 42)
model.fit(X_train, y_train)

DecisionTreeClassifier(random_state=42)

In [13]:
preds = model.predict(X_valid)
print('done')

done


In [14]:
#submit = pd.read_csv('./sample_submission.csv')

In [15]:
#submit['class'] = class_le.inverse_transform(preds)

In [16]:
#submit.to_csv('./answer/submit_dtree.csv', index=False)

In [17]:
preds

array([1, 2, 1, 0, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, 2, 1, 1,
       0, 1, 2, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 2, 1, 0, 1, 1, 1, 1, 1, 0,
       2, 1, 1, 0, 2, 0, 0, 2, 0, 2, 1, 1, 0, 0, 2, 2, 1, 0, 2, 2, 0, 2,
       2, 0, 1, 2, 2, 2, 0, 1, 1, 1, 2, 1, 2, 0, 1, 1, 2, 0, 2, 0, 2, 2,
       2, 1, 1, 1, 2, 2, 1, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 2,
       1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 1, 1, 2, 0, 0, 1, 1, 0, 2, 2, 2, 0,
       1, 2, 0, 1, 2, 0, 2, 1, 1, 2, 2, 1, 1, 1, 2, 2, 0, 1, 0, 2, 2, 2,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 2, 1, 1, 1, 0, 0, 2, 2, 0, 0, 2, 2, 2,
       2, 0, 2, 0, 0, 2, 0, 2, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1, 0, 1, 1, 2,
       1, 2, 0, 1, 0, 1, 0, 2, 2, 1, 2, 1, 2, 1, 0, 0, 1, 2, 0, 0, 2, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 2, 2, 1, 2, 1, 2, 1, 0, 2, 0, 0, 2,
       2, 1, 1, 1, 0, 1, 0, 1, 0, 2, 1, 0, 1, 1, 1, 1, 2, 0, 0, 1])

In [18]:
accuracy_score(y_train, preds) 

1.0