## Import

In [1]:
import pandas as pd
import numpy as np
import random
import os

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split   # 분리할때 사용하는 모듈
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

In [2]:
class CFG:
    """Notebook-wide configuration constants."""

    # Seed value handed to seed_everything() to fix the random number generators.
    SEED: int = 42

In [3]:
def seed_everything(seed):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global RNG, and
    exports ``PYTHONHASHSEED`` into the process environment.

    Parameters
    ----------
    seed : int
        Value passed to ``random.seed`` and ``np.random.seed``; its string
        form is stored in ``os.environ['PYTHONHASHSEED']``.

    Notes
    -----
    ``PYTHONHASHSEED`` is read by the interpreter at start-up, so setting it
    here influences child processes only, not the hash randomization of the
    kernel that is already running.
    """
    random.seed(seed)
    os.environ.update({'PYTHONHASHSEED': str(seed)})
    np.random.seed(seed)
seed_everything(CFG.SEED) # Fix the seed

## Data Load 데이터 불러오기

In [4]:
# Read the training and test tables from the 'open' folder.
train = pd.read_csv('open/train.csv')
test = pd.read_csv('open/test.csv')

# For each table (train first, then test): print its shape, then show its first rows.
for frame in (train, test):
    print(frame.shape)
    display(frame.head())

(262, 21)


Unnamed: 0,id,father,mother,gender,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,...,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,class
0,TRAIN_000,0,0,0,2,G G,A G,A A,G A,C A,...,A A,G G,A A,G G,A G,A A,A A,A A,A A,B
1,TRAIN_001,0,0,0,2,A G,A G,C A,A A,A A,...,A A,G A,A A,A G,A A,G A,G G,A A,A A,C
2,TRAIN_002,0,0,0,2,G G,G G,A A,G A,C C,...,A A,G A,G A,A G,A A,A A,A A,A A,A A,B
3,TRAIN_003,0,0,0,1,A A,G G,A A,G A,A A,...,G G,A A,G G,A G,G G,G G,G G,A A,G G,A
4,TRAIN_004,0,0,0,2,G G,G G,C C,A A,C C,...,A A,A A,A A,G G,A A,A A,A G,A A,G A,C


(175, 20)


Unnamed: 0,id,father,mother,gender,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15
0,TEST_000,0,0,0,1,A G,G G,A A,G A,A A,A G,G G,G A,G A,A G,A G,G A,G G,C A,G A
1,TEST_001,0,0,0,2,G G,A G,C C,G G,C C,A A,A A,A A,A A,G G,A G,A A,A A,A A,A A
2,TEST_002,0,0,0,2,G G,A G,A A,A A,C A,A G,A A,A A,A A,A G,A A,G A,G G,A A,G G
3,TEST_003,0,0,0,2,G G,A G,C A,A A,C C,A A,A A,A A,A A,G G,A A,G A,A G,A A,A A
4,TEST_004,0,0,0,1,A A,G G,A A,G G,A A,G G,G G,A A,G G,A G,G G,G A,G G,A A,G G


In [5]:

# Summary statistics of the numeric columns: training table first, then test table.
display(train.describe(), test.describe())

Unnamed: 0,father,mother,gender,trait
count,262.0,262.0,262.0,262.0
mean,0.0,0.0,0.0,1.736641
std,0.0,0.0,0.0,0.441298
min,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,2.0
75%,0.0,0.0,0.0,2.0
max,0.0,0.0,0.0,2.0


Unnamed: 0,father,mother,gender,trait
count,175.0,175.0,175.0,175.0
mean,0.0,0.0,0.0,1.708571
std,0.0,0.0,0.0,0.455724
min,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,2.0
75%,0.0,0.0,0.0,2.0
max,0.0,0.0,0.0,2.0


In [6]:
def get_x_y(df):
    """Split a raw table into model inputs and, when available, labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an 'id' column and, optionally, a 'class' column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series) or pandas.DataFrame
        If ``df`` has a 'class' column: ``(features, labels)`` where
        ``features`` is ``df`` without 'id' and 'class' and ``labels`` is
        ``df['class']``. Otherwise only the feature frame (``df`` without
        'id') is returned.
    """
    has_labels = 'class' in df.columns
    if not has_labels:
        # Unlabelled data: only the identifier column is removed.
        return df.drop(columns=['id'])

    features = df.drop(columns=['id', 'class'])
    return features, df['class']

In [7]:
# The test table has no 'class' column, so only a feature frame comes back.
test_x = get_x_y(test)
# The training table is split into its feature frame and its 'class' labels.
train_x, train_y = get_x_y(train)
train_y  # cell output: the class labels

0      B
1      C
2      B
3      A
4      C
      ..
257    B
258    C
259    A
260    A
261    B
Name: class, Length: 262, dtype: object

## Data Pre-processing 데이터 전처리
### Label-Encoding 레이블 인코딩

In [8]:
# Encoder for the target: fitted on the class labels and later used to decode predictions.
class_le = preprocessing.LabelEncoder()
# Single encoder shared by all SNP columns: fitted on the pooled genotype values.
snp_le = preprocessing.LabelEncoder()
# Names of the 15 genotype columns: 'SNP_01' through 'SNP_15'.
snp_col = [f'SNP_{idx:02d}' for idx in range(1, 16)]

In [9]:
# Pool every genotype string found in the SNP columns of the training data;
# this pooled list is the vocabulary the shared SNP label encoder is fitted on.
#
# The values are read from the raw `train` frame rather than from `train_x`:
# the SNP columns of `train_x` are overwritten with integer codes further down
# the notebook, so re-running this cell after that point would otherwise
# collect already-encoded integers instead of genotype strings.
snp_data = [genotype for col in snp_col for genotype in train[col].values]

In [10]:
# Encode the class labels as integers.
# The encoder is fitted on the raw `train['class']` column instead of on
# `train_y` itself, which keeps this cell idempotent: feeding `train_y` back
# into fit_transform would, on a second execution, refit the encoder on
# already-encoded integers, and class_le.inverse_transform would then return
# integers rather than the original class names.
train_y = class_le.fit_transform(train['class'])

# Fit the shared SNP encoder on the pooled genotype vocabulary.
# (The fitted encoder is the cell's displayed output.)
snp_le.fit(snp_data)

LabelEncoder()

In [11]:
# Replace every SNP genotype string with its integer code.
# The strings are taken from the raw `train` / `test` frames and written into
# `train_x` / `test_x`, so the cell can be executed repeatedly: transforming the
# columns of `train_x` / `test_x` in place would fail on a second run because
# the already-encoded integers are not part of the encoder's fitted vocabulary.
for col in snp_col:
    train_x[col] = snp_le.transform(train[col])
    test_x[col] = snp_le.transform(test[col])

## Model Fit

In [12]:
# Train clf on the training data; the trailing expression displays the fitted estimator.
clf = GaussianNB().fit(train_x, train_y)
clf

GaussianNB()

## Inference

In [13]:
# Run the fitted clf on the test data.
preds = clf.predict(test_x) # predicted (integer-encoded) class for each test row
print(preds)

[0 1 2 1 0 1 2 1 0 0 2 1 1 0 1 1 0 1 1 2 1 1 1 0 1 1 1 0 0 1 0 0 1 2 0 1 2
 1 1 2 0 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 2 0 1 2 0 1 2 2 1 0 1 0 0 1 1 1 0 0
 2 1 2 1 1 1 2 1 0 1 1 1 1 1 1 0 1 1 1 1 1 2 0 1 0 2 0 1 1 1 0 0 2 1 0 1 2
 1 1 1 1 0 0 2 1 1 0 1 1 2 2 1 2 1 0 1 0 0 1 1 1 2 0 0 1 0 0 0 2 1 1 1 0 1
 1 0 0 1 0 1 1 0 0 1 1 0 1 2 1 1 0 0 2 1 1 0 1 2 1 1 1]


## Submission

In [14]:
# Load the sample submission file; its 'class' column is filled with the predictions below.
submit = pd.read_csv('open/sample_submission.csv')

In [15]:
# Decode the integer predictions back to the original class labels and store them in 'class'.
# NOTE(review): the assignment is positional — it assumes the rows of
# sample_submission.csv are in the same order as the rows of test.csv; confirm.
submit['class'] = class_le.inverse_transform(preds)

In [16]:
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists first; exist_ok=True keeps re-runs from failing.
os.makedirs('./answer', exist_ok=True)
# Write the submission without the DataFrame index column.
submit.to_csv('./answer/submit.csv', index=False)