In [1]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import warnings
import numpy as np
warnings.filterwarnings(action='ignore') 
know_train = [pd.read_csv(path) for path in sorted(glob('../train/*.csv'))]
from matplotlib import pyplot as plt

In [2]:
know_train[0]

Unnamed: 0,idx,aq1_1,aq1_2,aq2_1,aq2_2,aq3_1,aq3_2,aq4_1,aq4_2,aq5_1,...,bq37,bq38,bq38_1,bq39_1,bq39_2,bq40,bq41_1,bq41_2,bq41_3,knowcode
0,0,3,3,3,3,3,3,4,4,3,...,52,2,실업,1,1,1,4000,,2200,825101
1,1,4,5,4,5,3,4,3,4,3,...,38,4,건축공학,1,1,1,,,2400,140204
2,2,3,4,3,4,3,4,5,6,4,...,50,4,건축공학,1,1,1,4000,,2400,140204
3,3,3,3,3,3,3,5,4,5,4,...,42,4,환경학과,1,1,1,7000,,3500,140601
4,4,4,5,3,4,3,4,4,5,3,...,51,4,건축공학,1,1,1,4000,,2500,140204
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9481,9481,3,5,2,4,3,3,2,2,2,...,50,4,산업디자인,1,1,1,5200,,1800,411301
9482,9482,5,5,5,5,5,5,3,4,4,...,37,4,우주항공공학,1,1,1,4000,,3000,151105
9483,9483,3,3,4,6,3,3,4,5,4,...,32,2,기계,1,4,2,2700,,1500,701101
9484,9484,3,5,3,5,4,5,3,4,3,...,40,4,문헌정보학,1,1,1,6800,,2500,25402


In [3]:
for df in know_train:
    for col in df.columns:
        df[col].replace(' ', '0', inplace=True)

In [4]:
from sklearn.preprocessing import LabelEncoder
years = ['2017', '2018', '2019', '2020']

year_encoder = {}

for year, df in zip(years, know_train):
    print(year)
    encoders = {}
    
    for col in df.columns:
        if col == 'ID':
            continue
        
        try:
            df[col] = df[col].map(int)
        except:
            encoder = LabelEncoder()
            df[col] = df[col].map(str)
            df[col] = encoder.fit_transform(df[col])
            encoders[col] = encoder
            
            
    year_encoder[year] = encoders

2017
2018
2019
2020


In [62]:
data = {}
for year, df in zip(years, know_train):
    data[year] = df.drop('idx',axis=1)
    dr = data[year].loc[data[year].knowcode == 9999999].index
    data[year] = data[year].drop(dr)

In [63]:
train_data = {}
for year in years:
    train_data[year] = {'X': data[year].iloc[:, 0:-1], # ID제외
                        'y': data[year].iloc[:, -1]} 

In [64]:
from sklearn.preprocessing import StandardScaler
minmax = StandardScaler()
for year in years:
    train_data[year]['X'] = minmax.fit_transform(train_data[year]['X'])
    train_data[year]['X'] = pd.DataFrame(train_data[year]['X'])

In [55]:

from imblearn.over_sampling import SMOTE

sm = SMOTE(k_neighbors=6 )

for year in years:
    train_data[year]['X'],train_data[year]['y'] = sm.fit_resample(train_data[year]['X'],list(train_data[year]['y']))

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 4, n_neighbors = 7

In [65]:
from sklearn.model_selection import train_test_split

split_data = {}
for year in tqdm(years):
    X_train,X_test,y_train,y_test = train_test_split(train_data[year]['X'],train_data[year]['y'],test_size=0.25, random_state=42)
    split_data[year] = {'X_train' : X_train,
                       'y_train' : y_train,
                       'X_test' : X_test,
                       'y_test' : y_test}

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 63.78it/s]


In [None]:
from sklearn.metrics  import f1_score,make_scorer
from sklearn.model_selection import cross_val_score
from imblearn.pipeline import make_pipeline, Pipeline
from optuna.pruners import SuccessiveHalvingPruner
import optuna
year_param = {}
for year in tqdm(years):
    def objective(trial: optuna.trial.Trial):

        params = {
            'n_estimators' : trial.suggest_int('n_estimators', 0, 1000),
            'max_depth' : trial.suggest_int('max_depth', 1, 50),
            'min_samples_split' : trial.suggest_int('min_samples_split',1,150),
            'min_samples_leaf' : trial.suggest_int('min_samples_leaf',1,60)
        }
        smote_neighbors = trial.suggest_int('smoth_n_neighbors', 1, 10)
        #sampler = SMOTE(random_state=42, k_neighbors=smote_neighbors)
        clf = RandomForestClassifier(random_state=42, **params)
        #pipeline = make_pipeline(sampler, clf)
        scores = cross_val_score(clf, split_data[year]['X_train'], 
                                split_data[year]['y_train'], verbose=1 , n_jobs=-1, 
                                 cv=4)
        return scores.mean()

    studies = optuna.create_study(direction='maximize', pruner=SuccessiveHalvingPruner())
    studies.optimize(objective, n_trials=50)

    print(f'===== Done fold =====')
    print(studies.best_value)
    print(studies.best_params)
    year_param[year] = studies.best_params

  0%|                                                                                            | 0/4 [00:00<?, ?it/s][32m[I 2021-12-09 20:34:23,977][0m A new study created in memory with name: no-name-a93a5368-787a-4a83-89cf-c9c182b8b123[0m
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   24.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   24.7s finished
[32m[I 2021-12-09 20:34:49,293][0m Trial 0 finished with value: 0.229203280609256 and parameters: {'n_estimators': 662, 'max_depth': 13, 'min_samples_split': 106, 'min_samples_leaf': 40, 'smoth_n_neighbors': 1}. Best is trial 0 with value: 0.229203280609256.[0m
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    5.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    5.4s finished
[32m[I 2021-12-09 20:34:55,369]

[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  3.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  3.7min finished
[33m[W 2021-12-09 20:50:30,083][0m Trial 18 failed, because the objective function returned nan.[0m
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    1.1s finished
[33m[W 2021-12-09 20:50:31,881][0m Trial 19 failed, because the objective function returned nan.[0m
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [60]:
from sklearn.ensemble import RandomForestClassifier
rf_models = {}

for year in tqdm(years):
    model = RandomForestClassifier(n_estimators=100, random_state=123456, n_jobs=8)
    model.fit(split_data[year]['X_train'], split_data[year]['y_train'])
    rf_models[year] = model

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:14<00:00,  3.55s/it]


In [61]:
from sklearn.metrics import f1_score,accuracy_score,recall_score,confusion_matrix
rf_f1 =[]
rf_ac = []
rf_rc = []
for year in tqdm(years):
    y_pred = rf_models[year].predict(split_data[year]['X_test'])
    rf_f1.append(f1_score(split_data[year]['y_test'],y_pred,average ='micro'))
    rf_ac.append(accuracy_score(split_data[year]['y_test'],y_pred))
    rf_rc.append(recall_score(split_data[year]['y_test'],y_pred,average ='micro'))
    print(confusion_matrix(split_data[year]['y_test'],y_pred))
np.mean(rf_f1)

 25%|█████████████████████                                                               | 1/4 [00:00<00:02,  1.07it/s]

[[4 0 0 ... 0 0 0]
 [1 4 0 ... 0 0 0]
 [0 0 3 ... 0 0 0]
 ...
 [0 0 0 ... 2 0 1]
 [0 0 0 ... 0 3 0]
 [0 0 0 ... 0 0 2]]


 50%|██████████████████████████████████████████                                          | 2/4 [00:01<00:01,  1.21it/s]

[[6 0 0 ... 0 0 0]
 [0 3 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 4 0 0]
 [0 0 0 ... 0 2 0]
 [0 0 0 ... 0 0 7]]


 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:02<00:00,  1.33it/s]

[[3 0 1 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [1 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 3 0 0]
 [0 0 0 ... 0 1 1]
 [0 0 0 ... 0 0 5]]


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.38it/s]

[[4 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 2 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 5 0]
 [0 0 0 ... 0 0 2]]





0.526141111332974

In [None]:
from sklearn.model_selection import KFold

kfold = KFold(n_splits=5)


In [46]:
rf_ac

[0.5166959578207382, 0.558641975308642, 0.5151940158952781, 0.5140324963072378]

In [47]:
rf_rc

[0.5166959578207382, 0.558641975308642, 0.5151940158952781, 0.5140324963072378]

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix()

In [12]:
know_test = [pd.read_csv(path) for path in sorted(glob('../test/*.csv'))]
know_test[0].head() # 2017년도 test 샘플

Unnamed: 0,idx,aq1_1,aq1_2,aq2_1,aq2_2,aq3_1,aq3_2,aq4_1,aq4_2,aq5_1,...,bq36,bq37,bq38,bq38_1,bq39_1,bq39_2,bq40,bq41_1,bq41_2,bq41_3
0,0,3,4,2,2,3,3,1,,3,...,2,26,3,비서학,1,1,1,3000,,2300
1,1,5,5,3,5,5,5,5,5.0,4,...,1,57,4,농화학,1,1,1,5500,,2500
2,2,5,5,5,4,5,4,1,,1,...,1,31,4,신문방송,1,1,1,4300,,4000
3,3,4,5,5,6,4,6,3,4.0,4,...,1,35,6,화학,1,1,1,4100,,3000
4,4,5,6,4,5,4,5,1,,1,...,1,36,4,광고홍보,1,1,1,2800,,2000


In [13]:
for df in know_test:
    for col in df.columns:
        df[col].replace(' ', '0', inplace=True)

In [14]:
years = ['2017', '2018', '2019', '2020']

for year, df in zip(years, know_test):
    print(year)
    encoders = {}
    
    for col in df.columns:
        
        try:
            df[col] = df[col].map(int)
        except:
            encoder = year_encoder[year][col]
            df[col] = df[col].map(str)
            category_map = {category: idx for idx, category in enumerate(encoder.classes_)}
            df[col] = df[col].apply(lambda x: category_map[x] if x in category_map else -1) # train set에서 보지못한 카테고리변수 -1(UNK) 처리
            

2017
2018
2019
2020


In [15]:
test_data = {}
for year, df in zip(years, know_test):
    test_data[year] =  {'X': df.iloc[:,1:]}

In [16]:
test_data['2017']['X']

Unnamed: 0,aq1_1,aq1_2,aq2_1,aq2_2,aq3_1,aq3_2,aq4_1,aq4_2,aq5_1,aq5_2,...,bq36,bq37,bq38,bq38_1,bq39_1,bq39_2,bq40,bq41_1,bq41_2,bq41_3
0,3,4,2,2,3,3,1,0,3,5,...,2,26,3,497,1,1,1,3000,0,2300
1,5,5,3,5,5,5,5,5,4,5,...,1,57,4,287,1,1,1,5500,0,2500
2,5,5,5,4,5,4,1,0,1,0,...,1,31,4,705,1,1,1,4300,0,4000
3,4,5,5,6,4,6,3,4,4,5,...,1,35,6,1423,1,1,1,4100,0,3000
4,5,6,4,5,4,5,1,0,1,0,...,1,36,4,141,1,1,1,2800,0,2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9481,3,4,4,5,4,5,5,6,4,5,...,1,52,1,1,2,6,0,0,3400,0
9482,4,5,4,6,5,6,5,6,5,6,...,1,48,5,-1,1,1,1,7000,0,2400
9483,3,2,1,0,2,1,3,3,1,0,...,1,44,2,958,2,6,0,0,4500,0
9484,4,5,3,4,3,4,1,0,1,0,...,1,44,4,1261,1,1,1,6000,0,4000


In [17]:
train_data['2017']['X']

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,144,145,146,147,148,149,150,151,152,153
0,-0.228321,-0.538776,-0.092564,-0.436936,-0.104670,-0.461585,1.250898,0.669020,0.363114,0.526559,...,-0.681103,1.013983,-1.332902,0.103006,-0.402818,-0.547466,0.053653,0.195612,-0.315242,-0.056009
1,0.710961,0.711069,0.835526,0.723130,-0.104670,0.121416,0.449309,0.669020,0.363114,0.526559,...,-0.681103,-0.332473,0.419534,-1.434484,-0.402818,-0.547466,0.053653,-1.389674,-0.315242,0.060057
2,-0.228321,0.086147,-0.092564,0.143097,-0.104670,0.121416,2.052487,1.615819,1.232601,1.026016,...,-0.681103,0.821632,0.419534,-1.434484,-0.402818,-0.547466,0.053653,0.195612,-0.315242,0.060057
3,-0.228321,-0.538776,-0.092564,-0.436936,-0.104670,0.704417,1.250898,1.142419,1.232601,1.525473,...,-0.681103,0.052229,0.419534,1.714029,-0.402818,-0.547466,0.053653,1.384577,-0.315242,0.698423
4,0.710961,0.711069,-0.092564,0.143097,-0.104670,0.121416,1.250898,1.142419,0.363114,0.526559,...,-0.681103,0.917808,0.419534,-1.434484,-0.402818,-0.547466,0.053653,0.195612,-0.315242,0.118090
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73564,-1.167603,-1.163699,-1.020654,-1.016970,-1.045356,-1.044586,0.449309,0.195620,0.363114,0.027101,...,1.468207,1.300284,-1.332902,0.640014,2.482512,2.499069,-1.940047,-1.389674,0.290128,-1.332740
73565,-1.271254,-1.301621,-1.020654,-0.952962,-0.208476,-0.525920,-0.352280,-0.277779,-0.506373,-0.472356,...,1.231028,1.098888,-2.209120,-1.505788,-0.402818,0.061841,2.047353,-0.879100,-0.315242,-0.804037
73566,-1.275627,-1.307441,-1.020654,-0.950262,-0.212856,-0.528634,-0.352280,-0.277779,-0.506373,-0.472356,...,1.221021,1.102470,-2.209120,-1.505788,-0.402818,0.061841,2.047353,-0.877624,-0.315242,-0.803767
73567,-1.068014,-0.406259,-1.751940,-1.931041,-1.886304,-2.025147,-0.267290,-0.177393,-1.191482,-1.312403,...,1.468207,2.096708,-2.209120,-1.505788,2.482512,2.499069,-1.940047,-1.389674,0.494958,-1.332740


In [18]:
minmax = StandardScaler()
for year in years:
    test_data[year]['X'] = minmax.fit_transform(test_data[year]['X'])
    test_data[year]['X'] = pd.DataFrame(test_data[year]['X'])

In [36]:
rf_predicts = [] 

for year in tqdm(years):
    pred = rf_models[year].predict(test_data[year]['X'])
    rf_predicts.extend(pred)

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:08<00:00,  2.21s/it]


In [37]:
submission = pd.read_csv('../submission/sample_submission.csv') # sample submission 불러오기

In [38]:
submission['knowcode'] = rf_predicts

submission.to_csv('../submission/rf_smote_submission.csv', index=False)