In [1]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import warnings
import numpy as np
warnings.filterwarnings(action='ignore') 
know_train = [pd.read_csv(path) for path in sorted(glob('../train/*.csv'))]
from matplotlib import pyplot as plt

In [2]:
for df in know_train:
    for col in df.columns:
        df[col].replace(' ', '0', inplace=True)

In [3]:
from sklearn.preprocessing import LabelEncoder
years = ['2017', '2018', '2019', '2020']

year_encoder = {}

for year, df in zip(years, know_train):
    print(year)
    encoders = {}
    
    for col in df.columns:
        if col == 'ID':
            continue
        
        try:
            df[col] = df[col].map(int)
        except:
            encoder = LabelEncoder()
            df[col] = df[col].map(str)
            df[col] = encoder.fit_transform(df[col])
            encoders[col] = encoder
            
            
    year_encoder[year] = encoders

2017
2018
2019
2020


In [154]:
data = {}
for year, df in zip(years, know_train):
    data[year] = df.drop('idx',axis=1)

In [140]:
bining = {'2017' : ['bq37','bq41_1','bq41_2','bq41_3'] ,
         '2018' : ['bq36','bq41_1','bq41_2','bq41_3'],
         '2019' : ['bq26','bq31_1','bq31_2','bq31_3'],
         '2020' : ['bq25','bq30_1','bq30_2','bq30_3']}
#연령 bining
# 19세 이상 35세이하 '청년'
# 36세이상 65세 미만 '중년'
# 65세 이상 '노년'
for year in years:
    if year == '2017':
        age = 'bq37'
    elif year == '2018':
        age = 'bq36'
    elif year == '2019':
        age = 'bq26'
    else:
        age = 'bq25'
    data[year].loc[(data[year][age] <=35),'young'] =1
    data[year].loc[(data[year][age] > 35),'middle'] =1
    data[year].loc[(data[year][age] >=65),'old'] =1
    data[year].fillna(0,inplace=True)
    #data[year].drop(age,axis=1,inplace=True)

In [141]:
for year in years:
    if year == '2017':
        first = 'bq41_1' 
        second = 'bq41_2'
        thrid = 'bq41_3'
    elif year == '2018':
        first = 'bq41_1'
        second = 'bq41_2'
        thrid = 'bq41_3'
    elif year == '2019':
        first = 'bq31_1'
        second = 'bq31_2'
        thrid = 'bq31_3'
    else:
        first = 'bq30_1'
        second = 'bq30_2'
        thrid = 'bq30_3'
    Q_3 = np.quantile(data[year][first],.75)
    data[year].loc[(data[year][first] <= Q_3 ) , '1LS'] = 1
    data[year].loc[data[year][first] > Q_3 , '1HS'] = 1
    data[year].loc[data[year][first] == 0  , '1NA'] = 1
    
    Q_3 = np.quantile(data[year][second],.75)
    data[year].loc[(data[year][second] <= Q_3 ) , '2LS'] = 1
    data[year].loc[data[year][second] > Q_3 , '2HS'] = 1
    data[year].loc[data[year][second] == 0  , '2NA'] = 1
    
    Q_3 = np.quantile(data[year][thrid],.75)
    data[year].loc[(data[year][thrid] <= Q_3 ) , '3LS'] = 1
    data[year].loc[data[year][thrid] > Q_3 , '3HS'] = 1
    data[year].loc[data[year][thrid] == 0  , '3NA'] = 1
    
   
    
    data[year].fillna(0,inplace=True)
    #data[year].drop([first,second,thrid],axis=1,inplace=True)

In [155]:
train_data = {}
for year in years:
    train_data[year] = {'X': data[year].drop('knowcode',axis=1), # ID제외
                        'y': data[year]['knowcode']} 

In [158]:
from sklearn.preprocessing import StandardScaler
minmax = StandardScaler()
X = {}
y ={}
for year in years:
    train_data[year]['X'] = minmax.fit_transform(train_data[year]['X'])
    train_data[year]['X'] = pd.DataFrame(train_data[year]['X'])
    X[year] = train_data[year]['X'] 
    y[year] = train_data[year]['y']

In [161]:
from imblearn.combine import *
X_samp = {}
y_samp = {}
for year in years:
    X_samp[year],y_samp[year] = SMOTEENN(random_state=42).fit_resample(X[year],y[year])

KeyboardInterrupt: 

In [143]:
from sklearn.model_selection import train_test_split

split_data = {}
for year in tqdm(years):
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25, random_state=42)
    split_data[year] = {'X_train' : X_train,
                       'y_train' : y_train,
                       'X_test' : X_test,
                       'y_test' : y_test}

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 68.74it/s]


In [144]:
from sklearn.ensemble import RandomForestClassifier
rf_models = {}

for year in tqdm(years):
    model = RandomForestClassifier(n_estimators=100, random_state=123456, n_jobs=8)
    model.fit(split_data[year]['X_train'], split_data[year]['y_train'])
    rf_models[year] = model
    

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:10<00:00,  2.61s/it]


In [145]:
from sklearn.metrics import f1_score,accuracy_score
rf_f1 =[]
rf_ac = []
for year in tqdm(years):
    y_pred = rf_models[year].predict(split_data[year]['X_test'])
    rf_f1.append(f1_score(split_data[year]['y_test'],y_pred,average ='micro'))
    rf_ac.append(accuracy_score(split_data[year]['y_test'],y_pred))
np.mean(rf_f1)

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.33it/s]


0.5155072992376142

In [162]:
rf_ac

[0.4869308600337268,
 0.5582010582010583,
 0.5063113604488079,
 0.5105859182668636]