## 임포트

In [1]:
import pandas as pd
import random
import os
import numpy as np

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB   #가우시안 나이브 베이즈 참조 자료형
from sklearn import metrics
from sklearn.metrics import accuracy_score

In [2]:
class CFG:
    """Notebook-wide configuration constants."""

    # Seed used for the RNG seeding helper and the estimator's random_state.
    SEED: int = 42

In [3]:
def seed_everything(seed):
    """Seed the random number sources this notebook relies on.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``; its
            string form is also stored in the ``PYTHONHASHSEED`` environment
            variable.

    Returns:
        None.
    """
    # NOTE(review): PYTHONHASHSEED set at runtime presumably only affects
    # child processes, not the already-running interpreter -- confirm.
    random.seed(seed)
    os.environ.update({'PYTHONHASHSEED': str(seed)})
    np.random.seed(seed)
seed_everything(CFG.SEED) # Fix the seed so runs are repeatable

## 데이터 불러오기

In [7]:
# Load the input CSV files from the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# NOTE(review): snp_info and sample are not referenced again in the cells
# shown here; the submission file is re-read later under another name.
snp_info = pd.read_csv('snp_info.csv')
sample = pd.read_csv('sample_submission.csv')

In [29]:
# One sub-frame per class label, for exploratory inspection only.
train_A = train.loc[train['class'] == 'A']
train_B = train.loc[train['class'] == 'B']
train_C = train.loc[train['class'] == 'C']

# Frequency of each `trait` value among the class-A rows.
train_A['trait'].value_counts()

1    69
Name: trait, dtype: int64

In [30]:
# Frequency of each `trait` value among the class-B rows.
train_B['trait'].value_counts()

2    114
Name: trait, dtype: int64

In [31]:
# Frequency of each `trait` value among the class-C rows.
train_C['trait'].value_counts()

2    79
Name: trait, dtype: int64

In [5]:
def get_x_y(df):
    """Separate a raw frame into model features and, when present, labels.

    Args:
        df: DataFrame holding an ``id`` column and, for labelled data, a
            ``class`` column.

    Returns:
        ``(features, labels)`` when ``df`` has a ``class`` column: ``features``
        is ``df`` without ``id`` and ``class``, ``labels`` is the ``class``
        column. Otherwise only ``df`` without ``id`` is returned.
    """
    has_labels = 'class' in df.columns
    if not has_labels:
        # Unlabelled data (e.g. the test set): only the identifier is removed.
        return df.drop(columns=['id'])

    features = df.drop(columns=['id', 'class'])
    labels = df['class']
    return features, labels

In [6]:
# Training data carries a `class` column, so both features and labels come back.
train_x, train_y = get_x_y(train)
# Test data is used without labels: only the feature frame is returned.
test_x = get_x_y(test)

## Data Pre-processing
### Label-Encoding

In [7]:
# Two independent encoders: one for the target labels, one for SNP values.
class_le = preprocessing.LabelEncoder()
snp_le = preprocessing.LabelEncoder()
# Names of the SNP feature columns: SNP_01 through SNP_15 (zero-padded).
snp_col = [f'SNP_{idx:02d}' for idx in range(1, 16)]

In [8]:
# Pool the values of every SNP column of the training features into one flat
# list (column by column, in snp_col order) so a single encoder can be fitted.
snp_data = []
for col in snp_col:
    snp_data.extend(train_x[col].values)

In [9]:
# Fit the class encoder and replace train_y with its encoded array.
train_y = class_le.fit_transform(train_y)
# Fit the SNP encoder on the values pooled from all training SNP columns.
snp_le.fit(snp_data)

LabelEncoder()

In [10]:
# Overwrite every SNP column, in both the training and test features, with the
# codes produced by the shared SNP encoder; other columns are left untouched.
# NOTE(review): snp_le was fitted on training values only -- presumably a value
# seen only in the test set would make transform fail; confirm.
for col in train_x.columns:
    if col not in snp_col:
        continue
    train_x[col] = snp_le.transform(train_x[col])
    test_x[col] = snp_le.transform(test_x[col])

In [11]:
# Display the training features after SNP encoding.
train_x

Unnamed: 0,father,mother,gender,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15
0,0,0,0,2,5,1,0,4,2,0,0,5,0,5,1,0,0,0,0
1,0,0,0,2,1,1,2,0,0,1,0,4,0,1,0,4,5,0,0
2,0,0,0,2,5,5,0,4,3,5,0,4,4,1,0,0,0,0,0
3,0,0,0,1,0,5,0,4,0,5,5,0,5,1,5,5,5,0,5
4,0,0,0,2,5,5,3,0,3,0,0,0,0,5,0,0,1,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,0,0,0,2,1,1,0,4,3,1,0,4,0,5,1,4,0,0,0
258,0,0,0,2,5,0,2,0,0,1,4,4,0,1,1,0,1,0,4
259,0,0,0,1,1,5,0,4,0,1,5,4,4,0,5,5,5,2,5
260,0,0,0,1,0,5,0,4,0,5,5,0,4,1,1,4,5,2,5


In [18]:
# Display the encoded training labels.
train_y

array([1, 2, 1, 0, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, 2, 1, 1,
       0, 1, 2, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 2, 1, 0, 1, 1, 1, 1, 1, 0,
       2, 1, 1, 0, 2, 0, 0, 2, 0, 2, 1, 1, 0, 0, 2, 2, 1, 0, 2, 2, 0, 2,
       2, 0, 1, 2, 2, 2, 0, 1, 1, 1, 2, 1, 2, 0, 1, 1, 2, 0, 2, 0, 2, 2,
       2, 1, 1, 1, 2, 2, 1, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 2,
       1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 1, 1, 2, 0, 0, 1, 1, 0, 2, 2, 2, 0,
       1, 2, 0, 1, 2, 0, 2, 1, 1, 2, 2, 1, 1, 1, 2, 2, 0, 1, 0, 2, 2, 2,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 2, 1, 1, 1, 0, 0, 2, 2, 0, 0, 2, 2, 2,
       2, 0, 2, 0, 0, 2, 0, 2, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1, 0, 1, 1, 2,
       1, 2, 0, 1, 0, 1, 0, 2, 2, 1, 2, 1, 2, 1, 0, 0, 1, 2, 0, 0, 2, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 2, 2, 1, 2, 1, 2, 1, 0, 2, 0, 0, 2,
       2, 1, 1, 1, 0, 1, 0, 1, 0, 2, 1, 0, 1, 1, 1, 1, 2, 0, 0, 1])

## Model Fit

In [None]:
# Random forest with its random_state pinned to the configured seed; all other
# constructor arguments are left at their defaults.
clf = RandomForestClassifier(random_state=CFG.SEED)
clf.fit(train_x, train_y)

## Inference

In [None]:
# Predict for the test features; the model was trained on encoded labels, so
# these predictions are in the encoded label space too.
preds = clf.predict(test_x)
print(preds)

In [None]:
# Load the sample submission file; its `class` column is filled in afterwards.
submit = pd.read_csv('./sample_submission.csv')

In [None]:
# Convert the encoded predictions back to the original class labels.
submit['class'] = class_le.inverse_transform(preds)

In [None]:
# Create the output folder first: DataFrame.to_csv does not create missing
# parent directories, so writing would fail if ./answer does not exist yet.
os.makedirs('./answer', exist_ok=True)
# Write the submission without the DataFrame index column.
submit.to_csv('./answer/submit.csv', index=False)

In [None]:
# Train a second model on the training data.
# NOTE(review): unlike `clf`, no random_state is passed here, so the result
# presumably depends on the global NumPy RNG state at fit time -- confirm
# whether that is intended.
model = RandomForestClassifier()
model.fit(train_x, train_y)

# Predict on the test data with the second model.
predicted = model.predict(test_x)

In [None]:
# Display the second model's predictions (still in the encoded label space).
predicted

In [None]:
# Overwrite the `class` column with the second model's decoded predictions.
submit['class'] = class_le.inverse_transform(predicted)

In [None]:
# Create the output folder first: DataFrame.to_csv does not create missing
# parent directories, so writing would fail if ./answer does not exist yet.
os.makedirs('./answer', exist_ok=True)
# Write the second submission without the DataFrame index column.
submit.to_csv('./answer/submit_rf.csv', index=False)