In [1]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import warnings
import numpy as np
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Read each CSV under ../train into its own DataFrame, in sorted filename order.
# NOTE(review): later cells pair these frames positionally with a list of years,
# so the sorted order presumably matches 2017..2020 -- confirm against the files.
train_paths = sorted(glob('../train/*.csv'))
know_train = [pd.read_csv(csv_path) for csv_path in train_paths]

In [2]:
# Replace every cell whose value is exactly a single space with the string '0'.
#
# The replacement is done on the whole frame and the list is rebound, rather
# than calling `df[col].replace(..., inplace=True)`: an in-place call on a
# column selection is chained assignment, which pandas warns about and which
# does not modify `df` once Copy-on-Write is enabled. Rebinding also keeps the
# cell idempotent on re-run.
know_train = [df.replace(' ', '0') for df in know_train]

In [3]:
from sklearn.preprocessing import LabelEncoder
years = ['2017', '2018', '2019', '2020']

# year -> {column name -> fitted LabelEncoder} for every column that had to be
# label-encoded, so the fitted encoders stay available after this cell.
year_encoder = {}

# Frames are paired with years by position in `know_train`.
for year, df in zip(years, know_train):
    print(year)
    encoders = {}

    for col in df.columns:
        # The ID column is left untouched.
        if col == 'ID':
            continue

        try:
            # Columns whose every value converts with int() become integer columns.
            df[col] = df[col].map(int)
        except (ValueError, TypeError, OverflowError):
            # int() rejected at least one value (non-numeric text, None/NaN,
            # infinity): fall back to label-encoding the string form.
            # Only these conversion errors are caught, so interrupts and
            # unrelated bugs are no longer silently swallowed.
            encoder = LabelEncoder()
            df[col] = encoder.fit_transform(df[col].map(str))
            encoders[col] = encoder

    year_encoder[year] = encoders

2017
2018
2019
2020


In [4]:
# For each year, split the frame into features and target by column position:
# 'X' is every column except the first (the ID, per the original note) and the
# last; 'y' is the last column.
train_data = {
    year: {
        'X': frame.iloc[:, 1:-1],  # exclude ID (first column) and the final column
        'y': frame.iloc[:, -1],
    }
    for year, frame in zip(years, know_train)
}

In [5]:
from sklearn.preprocessing import StandardScaler
# Standardize each year's feature matrix with its own StandardScaler.
#
# Every fitted scaler is kept in `year_scaler` (year -> scaler). Previously one
# scaler object was refit on each year in turn, so the statistics fitted for
# all but the last year were lost. `minmax` still ends up bound to the scaler
# fitted on the last year, as before (note: despite the name it is a
# StandardScaler, not a min-max scaler).
#
# NOTE(review): the scaler is fitted on all rows before the train/test split in
# the next cell, so hold-out rows influence the scaling statistics -- consider
# fitting on the training portion only.
year_scaler = {}
for year in years:
    minmax = StandardScaler()
    scaled = minmax.fit_transform(train_data[year]['X'])
    # Wrapping the array in a fresh DataFrame gives default integer column
    # labels and a default RangeIndex (original column names are not kept).
    train_data[year]['X'] = pd.DataFrame(scaled)
    year_scaler[year] = minmax

In [6]:
from sklearn.model_selection import train_test_split

# Per-year hold-out split: 25% of the rows go to the test portion, with a fixed
# random_state so the split is reproducible.
split_data = {}
for year in tqdm(years):
    X_train, X_test, y_train, y_test = train_test_split(
        train_data[year]['X'],
        train_data[year]['y'],
        test_size=0.25,
        random_state=42,
    )
    split_data[year] = dict(
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
    )

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 88.59it/s]


In [7]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [8]:

# Fit one XGBoost classifier per year on that year's training portion and keep
# the fitted models keyed by year.
xg_models = {}

for year in tqdm(years):
    fold = split_data[year]
    model = XGBClassifier(eval_metric='mlogloss', random_state=1)
    model.fit(fold['X_train'], fold['y_train'])
    xg_models[year] = model

100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [14:06<00:00, 211.67s/it]


In [9]:
from sklearn.metrics import f1_score
# Score each year's XGBoost model on its held-out portion with micro-averaged
# F1, then display the mean of the per-year scores.
xg_f1 = []
for year in tqdm(years):
    fold = split_data[year]
    y_pred = xg_models[year].predict(fold['X_test'])
    score = f1_score(fold['y_test'], y_pred, average='micro')
    xg_f1.append(score)
np.mean(xg_f1)

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  3.44it/s]


0.5355550670591046

In [None]:


# Fit one LightGBM classifier per year on that year's training portion and keep
# the fitted models keyed by year.
lg_models = {}

for year in tqdm(years):
    # 'mlogloss' is XGBoost's metric name and is not a LightGBM objective;
    # LightGBM's multi-class objective is called 'multiclass'.
    model = LGBMClassifier(random_state=1, objective='multiclass')
    model.fit(split_data[year]['X_train'], split_data[year]['y_train'])
    lg_models[year] = model

In [None]:
from sklearn.metrics import f1_score
# Score each year's LightGBM model on its held-out portion with micro-averaged
# F1, then display the mean of the per-year scores.
lg_f1 = []
for year in tqdm(years):
    fold = split_data[year]
    y_pred = lg_models[year].predict(fold['X_test'])
    score = f1_score(fold['y_test'], y_pred, average='micro')
    lg_f1.append(score)
np.mean(lg_f1)