In [3]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import warnings

warnings.filterwarnings(action='ignore') 
know_train = [pd.read_csv(path) for path in sorted(glob('../train/*.csv'))]

In [9]:
for df in know_train:
    for col in df.columns:
        df[col].replace(' ', '0', inplace=True)

In [10]:
from sklearn.preprocessing import LabelEncoder
years = ['2017', '2018', '2019', '2020']

year_encoder = {}

for year, df in zip(years, know_train):
    print(year)
    encoders = {}
    
    for col in df.columns:
        if col == 'ID':
            continue
        
        try:
            df[col] = df[col].map(int)
        except:
            encoder = LabelEncoder()
            df[col] = df[col].map(str)
            df[col] = encoder.fit_transform(df[col])
            encoders[col] = encoder
            
            
    year_encoder[year] = encoders

2017
2018
2019
2020


In [91]:
from sklearn.model_selection import train_test_split

split_data = {}
for year in tqdm(years):
    X_train,X_test,y_train,y_test = train_test_split(train_data[year]['X'],train_data[year]['y'],test_size=0.25, random_state=42)
    split_data[year] = {'X_train' : X_train,
                       'y_train' : y_train,
                       'X_test' : X_test,
                       'y_test' : y_test}

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 31.15it/s]


In [92]:
print(len(split_data[year]['X_train']))
print(len(split_data[year]['X_test']))
print(len(split_data[year]['y_train']))
print(len(split_data[year]['y_test']))

6091
2031
6091
2031


In [93]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

dt_models = {}

for year in tqdm(years):
    model = DecisionTreeClassifier(random_state=123456)
    model.fit(split_data[year]['X_train'], split_data[year]['y_train'])
    dt_models[year] = model

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.64s/it]


In [None]:
rf_models = {}

for year in tqdm(years):
    model = RandomForestClassifier(n_estimators=100, random_state=123456, n_jobs=8)
    model.fit(split_data[year]['X_train'], split_data[year]['y_train'])
    rf_models[year] = model

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

In [64]:
train_data[year]

{'X':       aq1_1  aq1_2  aq2_1  aq2_2  aq3_1  aq3_2  aq4_1  aq4_2  aq5_1  aq5_2  \
 0         3      3      3      3      3      3      4      4      3      4   
 1         4      5      4      5      3      4      3      4      3      4   
 2         3      4      3      4      3      4      5      6      4      5   
 3         3      3      3      3      3      5      4      5      4      6   
 4         4      5      3      4      3      4      4      5      3      4   
 ...     ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
 9481      3      5      2      4      3      3      2      2      2      3   
 9482      5      5      5      5      5      5      3      4      4      5   
 9483      3      3      4      6      3      3      4      5      4      5   
 9484      3      5      3      5      4      5      3      4      3      5   
 9485      3      4      3      4      3      4      3      4      3      4   
 
       ...  bq36  bq37  bq38  bq38_1  bq39_1 

In [14]:
know_test = [pd.read_csv(path) for path in sorted(glob('../test/*.csv'))]
know_test[0].head() # 2017년도 test 샘플

Unnamed: 0,idx,aq1_1,aq1_2,aq2_1,aq2_2,aq3_1,aq3_2,aq4_1,aq4_2,aq5_1,...,bq36,bq37,bq38,bq38_1,bq39_1,bq39_2,bq40,bq41_1,bq41_2,bq41_3
0,0,3,4,2,2,3,3,1,,3,...,2,26,3,비서학,1,1,1,3000,,2300
1,1,5,5,3,5,5,5,5,5.0,4,...,1,57,4,농화학,1,1,1,5500,,2500
2,2,5,5,5,4,5,4,1,,1,...,1,31,4,신문방송,1,1,1,4300,,4000
3,3,4,5,5,6,4,6,3,4.0,4,...,1,35,6,화학,1,1,1,4100,,3000
4,4,5,6,4,5,4,5,1,,1,...,1,36,4,광고홍보,1,1,1,2800,,2000


In [15]:
for df in know_test:
    for col in df.columns:
        df[col].replace(' ', '0', inplace=True)

In [16]:
years = ['2017', '2018', '2019', '2020']

for year, df in zip(years, know_test):
    print(year)
    encoders = {}
    
    for col in df.columns:
        
        try:
            df[col] = df[col].map(int)
        except:
            encoder = year_encoder[year][col]
            df[col] = df[col].map(str)
            category_map = {category: idx for idx, category in enumerate(encoder.classes_)}
            df[col] = df[col].apply(lambda x: category_map[x] if x in category_map else -1) # train set에서 보지못한 카테고리변수 -1(UNK) 처리
            
            

2017
2018
2019
2020


In [17]:
know_train[0]['aq1_1'].value_counts()

3    3116
4    3007
2    1706
5    1072
1     585
Name: aq1_1, dtype: int64

In [18]:
know_test[0].head() # 2017년도 test 샘플

Unnamed: 0,idx,aq1_1,aq1_2,aq2_1,aq2_2,aq3_1,aq3_2,aq4_1,aq4_2,aq5_1,...,bq36,bq37,bq38,bq38_1,bq39_1,bq39_2,bq40,bq41_1,bq41_2,bq41_3
0,0,3,4,2,2,3,3,1,0,3,...,2,26,3,497,1,1,1,3000,0,2300
1,1,5,5,3,5,5,5,5,5,4,...,1,57,4,287,1,1,1,5500,0,2500
2,2,5,5,5,4,5,4,1,0,1,...,1,31,4,705,1,1,1,4300,0,4000
3,3,4,5,5,6,4,6,3,4,4,...,1,35,6,1423,1,1,1,4100,0,3000
4,4,5,6,4,5,4,5,1,0,1,...,1,36,4,141,1,1,1,2800,0,2000


In [76]:
split_data[year]['X_train']

Unnamed: 0,aq1_1,aq1_2,aq2_1,aq2_2,aq3_1,aq3_2,aq4_1,aq4_2,aq5_1,aq5_2,...,bq36,bq37,bq38,bq38_1,bq39_1,bq39_2,bq40,bq41_1,bq41_2,bq41_3
6263,3,4,4,5,4,5,2,2,3,4,...,1,44,4,419,1,3,2,4000,0,1200
8676,4,5,4,5,3,5,1,0,1,0,...,2,39,3,936,2,5,0,0,2700,0
4844,5,5,4,6,4,5,2,1,2,2,...,1,38,4,663,1,1,1,4200,0,3400
2492,3,3,1,0,1,0,2,3,1,0,...,1,61,1,1,1,4,2,3500,0,2000
8214,1,0,1,0,1,0,1,0,1,0,...,1,27,2,958,1,1,1,2700,0,1800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,4,6,4,6,4,6,1,0,4,5,...,1,59,3,220,1,1,1,4500,0,2300
5191,4,5,4,5,4,5,1,0,4,5,...,1,32,6,869,1,1,1,5500,0,4000
5390,4,6,5,6,4,5,3,4,4,5,...,2,29,6,592,1,1,1,3200,0,2800
860,4,5,5,6,4,6,3,1,3,4,...,2,33,5,151,1,2,2,1300,0,1300


In [75]:
dt_f1 =[]
for year in tqdm(years):
    y_pred = dt_models[year].predict(split_data[year]['X_train'])
    dt_f1.append(f1_score(y_train,y_pred))

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]


ValueError: Found input variables with inconsistent numbers of samples: [6091, 7114]

In [88]:
y_pred = dt_models[year].predict(split_data[year]['X_train'])

In [90]:
len(y_pred)

7114

In [82]:
len(y_pred),len(split_data[year]['X_train'])

(7114, 7114)

In [81]:
y_train

1600     26401
5601    873201
4303    411104
3126    303001
3459    842101
         ...  
5226    121102
5390    872301
860     901501
7603     29303
7270    885902
Name: knowcode, Length: 6091, dtype: int64

In [19]:
test_data = {}
for year, df in zip(years, know_test):
    test_data[year] =  {'X': df.iloc[:,1:]}

In [20]:
dt_predicts = [] 

for year in tqdm(years):
    pred = dt_models[year].predict(test_data[year]['X'])
    dt_predicts.extend(pred)
    
    

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 10.89it/s]


In [21]:
rf_predicts = [] 

for year in tqdm(years):
    pred = dt_models[year].predict(test_data[year]['X'])
    rf_predicts.extend(pred)
    
    

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 37.26it/s]


In [55]:
from sklearn.metrics import f1_score

dt_f1 = []
for year in tqdm(years):
    dt_f1.append(f1_score(train_data[year]['y'],dt_models[year].predict(train_data[year]['X']),average = 'weighted'))

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 22.52it/s]


In [56]:
dt_f1

[1.0, 1.0, 1.0, 1.0]

In [46]:
dt_models[year].predict(train_data[year]['X'])

array([825101, 140204, 140204, ..., 701101,  25402,  15201], dtype=int64)

In [49]:
train_data[year]['y']

0       825101
1       140204
2       140204
3       140601
4       140204
         ...  
9481    411301
9482    151105
9483    701101
9484     25402
9485     15201
Name: knowcode, Length: 9486, dtype: int64

In [None]:
submission = pd.read_csv('../sample_submission.csv')

In [None]:
submission['knowcode'] = rf_predicts

submission.to_csv('../submission/basline_submission.csv', index=False)