## Import

In [75]:
import pandas as pd
import random
import os
import numpy as np
from xgboost import XGBClassifier
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

In [76]:
class CFG:
    """Notebook-wide configuration constants."""
    # Seed handed to seed_everything() and to the classifier's random_state.
    SEED = 42

In [77]:
def seed_everything(seed):
    """Seed the random sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and
    writes the seed into the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Seed value; its ``str()`` form is stored in ``PYTHONHASHSEED``.
    """
    seed_text = str(seed)
    random.seed(seed)
    # NOTE(review): hash randomization is presumably fixed at interpreter
    # start-up, so this likely only affects child processes -- confirm.
    os.environ.update(PYTHONHASHSEED=seed_text)
    np.random.seed(seed)
seed_everything(CFG.SEED) # Fix the random seeds for the whole notebook

## Data Load

In [78]:
# Read the training and test tables from CSV files in the working directory.
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [79]:
# Remove the 'father', 'mother' and 'gender' columns from both splits.
dropped_cols = ['father', 'mother', 'gender']
train = train.drop(columns=dropped_cols)
test = test.drop(columns=dropped_cols)

In [80]:
def get_x_y(df):
    """Separate a frame into feature columns and, when present, the target.

    Args:
        df: DataFrame containing an 'id' column and optionally a 'class'
            column. It is not modified.

    Returns:
        ``(features, target)`` when ``df`` has a 'class' column, where
        ``features`` is ``df`` without 'id' and 'class' and ``target`` is
        ``df['class']``; otherwise just ``df`` without 'id'.

    Raises:
        KeyError: If ``df`` has no 'id' column.
    """
    features = df.drop(columns=['id'])
    # Unlabelled frames (no 'class' column) yield features only.
    if 'class' not in df.columns:
        return features
    return features.drop(columns=['class']), df['class']

In [81]:
# Unpacking two values relies on `train` containing a 'class' column.
train_x, train_y = get_x_y(train)
# A single value is expected here, i.e. `test` is assumed to have no 'class' column.
test_x = get_x_y(test)

## Data Pre-processing
### Label-Encoding

In [82]:
# Encoder for the target class labels.
class_le = preprocessing.LabelEncoder()
# One encoder shared by all SNP columns, so a value gets the same code in every column.
snp_le = preprocessing.LabelEncoder()
# Zero-padded SNP column names: 'SNP_01' through 'SNP_15'.
snp_col = [f'SNP_{idx:02d}' for idx in range(1, 16)]

In [83]:
# Pool the values of every SNP column of the training features into one flat
# list (column by column) so a single encoder can be fitted on all of them.
snp_data = [value for name in snp_col for value in train_x[name].values]

In [84]:
# Fit the class encoder on the raw 'class' column of `train` (the same values
# train_y was extracted from) rather than on train_y itself. Re-fitting on
# train_y would, if this cell were run twice, learn the already-encoded
# integers as "classes" and make class_le.inverse_transform return integers
# instead of the original labels.
train_y = class_le.fit_transform(train['class'])
# Learn one integer code per distinct SNP value pooled from the training data.
snp_le.fit(snp_data)

In [85]:
# Replace the SNP values with the shared encoder's integer codes in both splits.
# NOTE(review): snp_le was fitted on training values only; transform is
# expected to raise if test contains a value never seen in train -- confirm.
encode_targets = [name for name in train_x.columns if name in snp_col]
for name in encode_targets:
    train_x[name] = snp_le.transform(train_x[name])
    test_x[name] = snp_le.transform(test_x[name])

In [86]:

# One-hot encode every feature column. dtype=int pins the indicator columns
# to integer 0/1 so the columns added by the alignment step below share it.
train_x = pd.get_dummies(train_x, columns=train_x.columns, dtype=int)
test_x = pd.get_dummies(test_x, columns=test_x.columns, dtype=int)

# get_dummies only creates indicators for values present in the frame it is
# given, so train and test can end up with different columns when a value
# occurs in only one split for some feature. Align test to the training
# layout: indicators missing from test are added as all-zero columns, and
# indicators unknown to the training data are dropped.
test_x = test_x.reindex(columns=train_x.columns, fill_value=0)

In [87]:
# Inspect the one-hot encoded training features.
train_x

Unnamed: 0,trait_1,trait_2,SNP_01_0,SNP_01_1,SNP_01_5,SNP_02_0,SNP_02_1,SNP_02_5,SNP_03_0,SNP_03_2,...,SNP_12_5,SNP_13_0,SNP_13_1,SNP_13_5,SNP_14_0,SNP_14_2,SNP_14_3,SNP_15_0,SNP_15_4,SNP_15_5
0,0,1,0,0,1,0,1,0,1,0,...,0,1,0,0,1,0,0,1,0,0
1,0,1,0,1,0,0,1,0,0,1,...,0,0,0,1,1,0,0,1,0,0
2,0,1,0,0,1,0,0,1,1,0,...,0,1,0,0,1,0,0,1,0,0
3,1,0,1,0,0,0,0,1,1,0,...,1,0,0,1,1,0,0,0,0,1
4,0,1,0,0,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,0,1,0,1,0,0,1,0,1,0,...,0,1,0,0,1,0,0,1,0,0
258,0,1,0,0,1,1,0,0,0,1,...,0,0,1,0,1,0,0,0,1,0
259,1,0,0,1,0,0,0,1,1,0,...,1,0,0,1,0,1,0,0,0,1
260,1,0,1,0,0,0,0,1,1,0,...,0,0,0,1,0,1,0,0,0,1


In [88]:
# Inspect the one-hot encoded test features.
test_x

Unnamed: 0,trait_1,trait_2,SNP_01_0,SNP_01_1,SNP_01_5,SNP_02_0,SNP_02_1,SNP_02_5,SNP_03_0,SNP_03_2,...,SNP_12_5,SNP_13_0,SNP_13_1,SNP_13_5,SNP_14_0,SNP_14_2,SNP_14_3,SNP_15_0,SNP_15_4,SNP_15_5
0,1,0,0,1,0,0,0,1,1,0,...,0,0,0,1,0,1,0,0,1,0
1,0,1,0,0,1,0,1,0,0,0,...,0,1,0,0,1,0,0,1,0,0
2,0,1,0,0,1,0,1,0,1,0,...,0,0,0,1,1,0,0,0,0,1
3,0,1,0,0,1,0,1,0,0,1,...,0,0,1,0,1,0,0,1,0,0
4,1,0,1,0,0,0,0,1,1,0,...,0,0,0,1,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,0,1,0,1,0,0,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
171,0,1,0,0,1,1,0,0,1,0,...,0,0,1,0,1,0,0,0,1,0
172,0,1,0,0,1,1,0,0,1,0,...,0,0,1,0,1,0,0,0,0,1
173,0,1,0,1,0,0,0,1,0,1,...,0,1,0,0,1,0,0,1,0,0


## Model Fit

In [89]:
# XGBoost classifier with default hyperparameters; only random_state is set.
clf = XGBClassifier(random_state=CFG.SEED)
# Train on the encoded training features and the integer-encoded class labels.
clf.fit(train_x, train_y)

## Inference

In [92]:
# Predict an encoded class code for every row of the test features.
preds = clf.predict(test_x)
print(preds)

[0 1 2 1 0 1 2 1 0 0 2 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 0 0 1 0 0 1 2 0 1 2
 1 1 2 0 1 2 1 1 1 1 2 1 2 0 1 0 1 1 1 2 0 1 2 0 1 2 2 2 0 1 0 0 1 1 1 0 0
 2 1 2 1 1 1 2 1 0 1 1 1 1 1 2 0 1 1 2 1 1 2 0 1 0 2 0 1 1 2 0 0 2 1 0 1 2
 1 1 1 1 0 0 2 1 1 0 1 1 2 2 1 2 1 0 1 0 0 1 1 1 2 0 0 1 0 0 0 2 1 1 1 0 1
 2 0 0 1 0 1 1 0 0 1 2 0 1 2 2 1 0 0 2 1 1 0 1 2 2 1 1]


## Submission

In [93]:
# Load the submission template; its 'class' column is filled in below.
submit = pd.read_csv('./sample_submission.csv')

In [94]:
# Convert the predicted integer codes back to the original class labels.
# NOTE(review): assumes the template rows are in the same order as test.csv -- confirm.
submit['class'] = class_le.inverse_transform(preds)

In [95]:
# Write the submission file without the DataFrame index column.
submit.to_csv('./submit.csv', index=False)