In [163]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import warnings
import numpy as np
warnings.filterwarnings(action='ignore') 
know_train = [pd.read_csv(path) for path in sorted(glob('../train/*.csv'))]
from matplotlib import pyplot as plt

In [172]:
for df in know_train:
    for col in df.columns:
        df[col].replace(' ', '0', inplace=True)

In [173]:
from sklearn.preprocessing import LabelEncoder
years = ['2017', '2018', '2019', '2020']

year_encoder = {}

for year, df in zip(years, know_train):
    print(year)
    encoders = {}
    
    for col in df.columns:
        if col == 'ID':
            continue
        
        try:
            df[col] = df[col].map(int)
        except:
            encoder = LabelEncoder()
            df[col] = df[col].map(str)
            df[col] = encoder.fit_transform(df[col])
            encoders[col] = encoder
            
            
    year_encoder[year] = encoders

2017
2018
2019
2020


In [174]:
train_data = {}
for year, df in zip(years, know_train):
    train_data[year] = {'X': df.iloc[:, 1:-1], # ID제외
                        'y': df.iloc[:, -1]} 

In [202]:
data = {}
for year, df in zip(years, know_train):
    data[year] = df.drop('idx',axis=1)

In [115]:
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()
for year in years:
    train_data[year]['X'] = minmax.fit_transform(train_data[year]['X'])
    train_data[year]['X'] = pd.DataFrame(train_data[year]['X'])

In [116]:
from sklearn.model_selection import train_test_split

split_data = {}
for year in tqdm(years):
    X_train,X_test,y_train,y_test = train_test_split(train_data[year]['X'],train_data[year]['y'],test_size=0.25, random_state=42)
    split_data[year] = {'X_train' : X_train,
                       'y_train' : y_train,
                       'X_test' : X_test,
                       'y_test' : y_test}

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 61.34it/s]


In [117]:
from sklearn.ensemble import RandomForestClassifier
rf_models = {}

for year in tqdm(years):
    model = RandomForestClassifier(n_estimators=100, random_state=123456, n_jobs=8)
    model.fit(split_data[year]['X_train'], split_data[year]['y_train'])
    rf_models[year] = model

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:18<00:00,  4.50s/it]


In [118]:
from sklearn.metrics import f1_score
rf_f1 =[]
for year in tqdm(years):
    y_pred = rf_models[year].predict(split_data[year]['X_test'])
    rf_f1.append(f1_score(split_data[year]['y_test'],y_pred,average ='micro'))
np.mean(rf_f1)

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.60s/it]


0.5180098960447035

In [155]:
train_data['2017']['X']

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,139,140,141,142,143,144,145,146,147,148
0,0.50,0.428571,0.50,0.428571,0.50,0.428571,0.75,0.571429,0.50,0.571429,...,0.0,0.478261,0.2,0.497934,0.0,0.0,0.5,0.057143,0.0,0.036667
1,0.75,0.714286,0.75,0.714286,0.50,0.571429,0.50,0.571429,0.50,0.571429,...,0.0,0.275362,0.6,0.022727,0.0,0.0,0.5,0.000000,0.0,0.040000
2,0.50,0.571429,0.50,0.571429,0.50,0.571429,1.00,0.857143,0.75,0.714286,...,0.0,0.449275,0.6,0.022727,0.0,0.0,0.5,0.057143,0.0,0.040000
3,0.50,0.428571,0.50,0.428571,0.50,0.714286,0.75,0.714286,0.75,0.857143,...,0.0,0.333333,0.6,0.995868,0.0,0.0,0.5,0.100000,0.0,0.058333
4,0.75,0.714286,0.50,0.571429,0.50,0.571429,0.75,0.714286,0.50,0.571429,...,0.0,0.463768,0.6,0.022727,0.0,0.0,0.5,0.057143,0.0,0.041667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9481,0.50,0.714286,0.25,0.571429,0.50,0.428571,0.25,0.285714,0.25,0.428571,...,0.0,0.449275,0.6,0.373967,0.0,0.0,0.5,0.074286,0.0,0.030000
9482,1.00,0.714286,1.00,0.714286,1.00,0.714286,0.50,0.571429,0.75,0.714286,...,0.0,0.260870,0.6,0.593664,0.0,0.0,0.5,0.057143,0.0,0.050000
9483,0.50,0.428571,0.75,0.857143,0.50,0.428571,0.75,0.714286,0.75,0.714286,...,0.0,0.188406,0.2,0.148760,0.0,0.5,1.0,0.038571,0.0,0.025000
9484,0.50,0.714286,0.50,0.714286,0.75,0.714286,0.50,0.571429,0.50,0.714286,...,1.0,0.304348,0.6,0.264463,0.0,0.0,0.5,0.097143,0.0,0.041667


In [89]:
for i in range(len(data['2017'])):
    if data['2017'].iloc[:,i].isin([0]).sum() > 0:
        print('name : ',data['2017'].columns[i] )
        #print(train_data['2017']['X'].iloc[:,i].value_counts())
        print(data['2017'].iloc[:,i].isin([0]).sum())

name :  idx
1
name :  aq1_2
585
name :  aq2_2
861
name :  aq3_2
843
name :  aq4_2
3118
name :  aq5_2
2282
name :  aq6_2
1676
name :  aq7_2
1537
name :  aq8_2
1606
name :  aq9_2
1396
name :  aq10_2
942
name :  aq11_2
2745
name :  aq12_2
1064
name :  aq13_2
1445
name :  aq14_2
1550
name :  aq15_2
764
name :  aq16_2
1871
name :  aq17_2
3232
name :  aq18_2
4106
name :  aq19_2
1909
name :  aq20_2
4612
name :  aq21_2
4408
name :  aq22_2
4536
name :  aq23_2
4874
name :  aq24_2
1532
name :  aq25_2
1900
name :  aq26_2
1086
name :  aq27_2
1699
name :  aq28_2
551
name :  aq29_2
1881
name :  aq30_2
2545
name :  aq31_2
1747
name :  aq32_2
2654
name :  aq33_2
1566
name :  aq34_2
1103
name :  aq35_2
2667
name :  aq36_2
2232
name :  aq37_2
1968
name :  aq38_2
2164
name :  aq39_2
2487
name :  aq40_2
4985
name :  aq41_2
3586
name :  bq4_1a
1
name :  bq4_1b
8502
name :  bq4_1c
9229
name :  bq5_1
4976
name :  bq5_2
1
name :  bq12_2
1630
name :  bq12_3
1343
name :  bq12_4
879
name :  bq19_1
1
name :  bq23


IndexError: single positional indexer is out-of-bounds

In [213]:
text={ '2017' : ['bq4_1a','bq4_1b','bq4_1c','bq5_2','bq19_1','bq30','bq31','bq32','bq33'],
     '2018' : ['bq4_1a','bq4_1b','bq4_1c','bq5_1','bq28_1','bq29','bq30','bq31','bq32','bq33',
              'bq37_1'],
     '2019' : ['bq4_1a','bq4_1b','bq4_1c','bq5_2','bq18_10','bq20_1','bq22','bq23','bq24','bq27_1',
              ],
     '2020' : ['bq4_1a','bq4_1b','bq4_1c','bq5_2','bq18_10','bq20_1',]}

In [127]:
text={ '2017' : ['bq19_1','bq30','bq31','bq32','bq33'],
     '2018' : ['bq5_1','bq28_1','bq29','bq30','bq31','bq32','bq33',
              'bq37_1'],
     '2019' : ['bq18_10','bq20_1','bq22','bq23','bq24','bq27_1',
              ],
     '2020' : ['bq18_10','bq20_1',]}

In [262]:
data = {}
for year, df in zip(years, know_train):
    data[year] = df.drop('idx',axis=1)

In [236]:
train_data = {}
for year in years:
    train_data[year] = {'X': data[year].iloc[:, 1:-1], # ID제외
                        'y': data[year].iloc[:, -1]} 

In [241]:
from sklearn.preprocessing import StandardScaler
minmax = StandardScaler()
for year in years:
    train_data[year]['X'] = minmax.fit_transform(train_data[year]['X'])
    train_data[year]['X'] = pd.DataFrame(train_data[year]['X'])

In [242]:
from sklearn.model_selection import train_test_split

split_data = {}
for year in tqdm(years):
    X_train,X_test,y_train,y_test = train_test_split(train_data[year]['X'],train_data[year]['y'],test_size=0.25, random_state=42)
    split_data[year] = {'X_train' : X_train,
                       'y_train' : y_train,
                       'X_test' : X_test,
                       'y_test' : y_test}

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 65.36it/s]


In [243]:
from sklearn.ensemble import RandomForestClassifier
rf_models = {}
rf_import ={}
for year in tqdm(years):
    model = RandomForestClassifier(n_estimators=100, random_state=123456, n_jobs=8)
    model.fit(split_data[year]['X_train'], split_data[year]['y_train'])
    rf_models[year] = model
    rf_import[year] = model.feature_importances_

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:12<00:00,  3.14s/it]


In [244]:
from sklearn.metrics import f1_score
rf_f1 =[]
for year in tqdm(years):
    y_pred = rf_models[year].predict(split_data[year]['X_test'])
    rf_f1.append(f1_score(split_data[year]['y_test'],y_pred,average ='micro'))
np.mean(rf_f1)

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.19it/s]


0.5188052331762718

In [274]:
data['2017'].loc[:,:'aq41_2'].shape

(9486, 82)

In [263]:
data['2017'].iloc[:,0]

0       3
1       4
2       3
3       3
4       4
       ..
9481    3
9482    5
9483    3
9484    3
9485    3
Name: aq1_1, Length: 9486, dtype: int64

In [288]:
one_col = {'2017' : 'aq41_2',
          '2018' : 'iq6',
          '2019' : 'kq33_2',
          '2020' : 'vq13'}

In [311]:
one_df = {}
for key,val in one_col.items():
    for i in range(data[key].loc[:,:val].shape[1]):
        if i == 0 :
            test_df = pd.DataFrame(onehot_encoder.fit_transform(data[key].iloc[:,i].values.reshape(-1,1)).toarray())
        else:
            cat = pd.DataFrame(onehot_encoder.fit_transform(data[key].iloc[:,i].values.reshape(-1,1)).toarray())
            test_df = pd.concat([test_df,cat],axis=1)
            
        test_df = test_df.fillna(0)
        
    one_df[key] = pd.concat([test_df,data[key].loc[:,val:]],axis=1)

In [315]:
train_data = {}
for year in years:
    train_data[year] = {'X': one_df[year].iloc[:, 1:-1], # ID제외
                        'y': one_df[year].iloc[:, -1]} 

In [316]:
from sklearn.preprocessing import StandardScaler
minmax = StandardScaler()
for year in years:
    train_data[year]['X'] = minmax.fit_transform(train_data[year]['X'])
    train_data[year]['X'] = pd.DataFrame(train_data[year]['X'])

In [318]:
from sklearn.model_selection import train_test_split

split_data = {}
for year in tqdm(years):
    X_train,X_test,y_train,y_test = train_test_split(train_data[year]['X'],train_data[year]['y'],test_size=0.25, random_state=42)
    split_data[year] = {'X_train' : X_train,
                       'y_train' : y_train,
                       'X_test' : X_test,
                       'y_test' : y_test}

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 22.52it/s]


In [319]:
from sklearn.ensemble import RandomForestClassifier
rf_models = {}

for year in tqdm(years):
    model = RandomForestClassifier(n_estimators=100, random_state=123456, n_jobs=8)
    model.fit(split_data[year]['X_train'], split_data[year]['y_train'])
    rf_models[year] = model
    

 50%|██████████████████████████████████████████                                          | 2/4 [00:13<00:13,  6.64s/it]


MemoryError: could not allocate 36634624 bytes

In [None]:
from sklearn.metrics import f1_score
rf_f1 =[]
for year in tqdm(years):
    y_pred = rf_models[year].predict(split_data[year]['X_test'])
    rf_f1.append(f1_score(split_data[year]['y_test'],y_pred,average ='micro'))
np.mean(rf_f1)