In [199]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import warnings
import numpy as np
warnings.filterwarnings(action='ignore') 
know_train = [pd.read_csv(path) for path in sorted(glob('../train/*.csv'))]

In [200]:
for df in know_train:
    for col in df.columns:
        df[col].replace(' ', '0', inplace=True)

In [201]:
from sklearn.preprocessing import LabelEncoder
years = ['2017', '2018', '2019', '2020']

year_encoder = {}

for year, df in zip(years, know_train):
    print(year)
    encoders = {}
    
    for col in df.columns:
        if col == 'ID':
            continue
        
        try:
            df[col] = df[col].map(int)
        except:
            encoder = LabelEncoder()
            df[col] = df[col].map(str)
            df[col] = encoder.fit_transform(df[col])
            encoders[col] = encoder
            
            
    year_encoder[year] = encoders

2017
2018
2019
2020


In [202]:
train_data = {}
for year, df in zip(years, know_train):
    train_data[year] = {'X': df.iloc[:, 1:-1], # ID제외
                        'y': df.iloc[:, -1]} 

In [203]:
pd.DataFrame(train_data[year]['X'])

Unnamed: 0,saq1_1,saq1_2,saq2_1,saq2_2,saq3_1,saq3_2,saq4_1,saq4_2,saq5_1,saq5_2,...,bq25,bq26,bq26_1,bq27_1,bq27_2,bq28,bq29,bq30_1,bq30_2,bq30_3
0,4,4,4,4,4,5,4,5,3,3,...,42,4,9,1,1,1,40,4000,2600,0
1,5,6,5,6,4,5,4,5,4,5,...,45,4,166,1,1,1,40,4000,3000,0
2,3,4,3,4,3,4,3,5,3,5,...,38,4,161,1,1,1,48,4300,3000,0
3,4,5,2,3,3,3,4,5,3,4,...,25,4,368,1,1,1,40,3500,3300,0
4,5,6,4,5,3,5,4,4,4,6,...,49,4,22,1,1,1,40,5700,2700,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8117,4,6,3,5,3,3,1,0,5,5,...,45,6,890,1,1,1,40,5100,4000,0
8118,3,4,4,5,2,4,4,6,1,0,...,33,4,96,1,1,1,45,3100,2600,0
8119,5,6,3,4,3,4,4,5,5,7,...,45,4,114,1,1,1,40,8000,3000,0
8120,5,6,5,5,4,5,3,3,3,4,...,36,5,1069,1,1,1,40,5700,3200,0


In [204]:
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()
for year in years:
    train_data[year]['X'] = minmax.fit_transform(train_data[year]['X'])
    train_data[year]['X'] = pd.DataFrame(train_data[year]['X'])

In [205]:
from sklearn.model_selection import train_test_split

split_data = {}
for year in tqdm(years):
    X_train,X_test,y_train,y_test = train_test_split(train_data[year]['X'],train_data[year]['y'],test_size=0.25, random_state=42)
    split_data[year] = {'X_train' : X_train,
                       'y_train' : y_train,
                       'X_test' : X_test,
                       'y_test' : y_test}

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 53.16it/s]


In [163]:
split_data[year]['X_train']

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,173,174,175,176,177,178,179,180,181,182
1600,0.25,0.428571,0.25,0.571429,0.25,0.285714,0.50,0.428571,0.25,0.428571,...,0.393443,0.6,0.035879,0.0,0.000000,0.5,0.454545,0.150000,0.215385,0.000000
5601,0.25,0.285714,0.50,0.142857,0.25,0.285714,0.75,0.285714,0.75,0.142857,...,0.590164,0.2,0.148114,0.0,0.000000,0.5,0.519481,0.096429,0.207692,0.000000
4303,1.00,0.857143,1.00,0.857143,1.00,0.857143,1.00,0.714286,0.00,0.000000,...,0.245902,0.6,0.270469,0.0,0.000000,0.5,0.519481,0.125000,0.176923,0.000000
3126,1.00,0.857143,0.75,0.857143,0.25,0.428571,1.00,0.428571,0.25,0.428571,...,0.344262,0.6,0.525299,0.0,0.000000,0.5,0.519481,0.232143,0.307692,0.000000
3459,0.00,0.000000,0.25,0.285714,0.00,0.000000,0.00,0.000000,0.00,0.000000,...,0.573770,0.6,0.744250,0.0,0.000000,0.5,0.584416,0.175000,0.184615,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5226,0.75,0.714286,1.00,1.000000,1.00,0.857143,0.75,0.714286,0.75,0.714286,...,0.262295,0.8,0.982521,0.0,0.000000,0.5,0.584416,0.160714,0.246154,0.000000
5390,0.50,0.285714,0.75,0.428571,0.00,0.000000,0.25,0.142857,0.50,0.285714,...,0.262295,0.6,0.479301,0.0,0.000000,0.5,0.558442,0.132143,0.230769,0.000000
860,0.00,0.000000,0.25,0.285714,0.00,0.000000,0.50,0.285714,0.25,0.142857,...,0.131148,0.6,0.794848,0.0,0.333333,1.0,0.454545,0.100000,0.184615,0.000000
7603,0.50,0.428571,0.50,0.428571,0.25,0.428571,0.50,0.571429,0.25,0.285714,...,0.524590,0.2,0.269549,0.0,0.333333,1.0,0.532468,0.064286,0.076923,0.000000


In [31]:
from lightgbm import LGBMClassifier
lg_models = {}

for year in tqdm(years):
    model = LGBMClassifier(random_state=1,objective = 'multiclass',n_estimators=400,n_jobs=8)
    model.fit(split_data[year]['X_train'], split_data[year]['y_train'],eval_metric = 'logloss')
    lg_models[year] = model

  0%|                                                                                            | 0/4 [02:22<?, ?it/s]


KeyboardInterrupt: 

In [26]:
from sklearn.metrics import f1_score
lg_f1 =[]
for year in tqdm(years):
    y_pred = lg_models[year].predict(split_data[year]['X_test'])
    lg_f1.append(f1_score(split_data[year]['y_test'],y_pred,average ='micro'))
np.mean(lg_f1)

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.75s/it]


0.07185701695129024

In [27]:
lg_f1

[0.05902192242833052,
 0.09171075837742504,
 0.06825619448340346,
 0.06843919251600197]

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
rf_models = {}

for year in tqdm(years):
    model = RandomForestClassifier(n_estimators=100, random_state=123456, n_jobs=8)
    model.fit(split_data[year]['X_train'], split_data[year]['y_train'])
    rf_models[year] = model

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:13<00:00,  3.29s/it]


In [30]:
rf_f1 =[]
for year in tqdm(years):
    y_pred = rf_models[year].predict(split_data[year]['X_test'])
    rf_f1.append(f1_score(split_data[year]['y_test'],y_pred,average ='micro'))
np.mean(rf_f1)

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.67s/it]


0.5184834483607411

In [39]:
for year in years:
    print(len(split_data[year]['y_train'].value_counts()))


538
576
559
537


In [206]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random



In [207]:
from torchvision import transforms


In [208]:
from torch.utils.data import Dataset, DataLoader
class KNOW(Dataset):
    def __init__(self,df, labels = None):
        self.df = df
        self.label = labels
    def __getitem__(self,idx):
        if self.label is not None:
            return torch.tensor(self.df.iloc[idx]),torch.tensor(self.label.iloc[idx])
        return self.df.iloc[idx]
    def __len__(self):
        return len(self.df)

In [209]:
dataset = KNOW(split_data['2017']['X_train'],split_data['2017']['y_train'])
loader = DataLoader(dataset, batch_size = batch_size, shuffle = True)

In [268]:
split_data['2017']['y_train']

6263    416701
8676    615301
4844    212102
2492    702301
8214    420202
         ...  
5734     28101
5191    155106
5390    121102
860     110105
7270    521201
Name: knowcode, Length: 7114, dtype: int64

In [294]:
class Model(nn.Module):
    def __init__(self,input_dim,num_class):
        super().__init__()
        self.net = nn.Sequential(
        nn.Linear(input_dim, 128), # Output이 3!
        nn.ReLU(),
        nn.Linear(128, 64),
        
        )
    def forward(self, x):
        
        return  self.net(x)
    
model = Model(split_data['2017']['X_train'].shape[1],len(split_data[year]['y_train'].value_counts()))

In [295]:
from torchsummary import summary
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
x = torch.randn((1,64,154)).float().to(device)
model = model.to(device)
output_Stem = model(x)
print('Input size:', x.size())
print('Stem output size:', output_Stem.size())

Input size: torch.Size([1, 64, 154])
Stem output size: torch.Size([1, 64, 64])


In [231]:
torch.tensor(np.array(split_data['2017']['X_train']))

tensor([[0.5000, 0.5714, 0.7500,  ..., 0.0571, 0.0000, 0.0200],
        [0.7500, 0.7143, 0.7500,  ..., 0.0000, 0.0675, 0.0000],
        [1.0000, 0.7143, 0.7500,  ..., 0.0600, 0.0000, 0.0567],
        ...,
        [0.7500, 0.8571, 1.0000,  ..., 0.0457, 0.0000, 0.0467],
        [0.7500, 0.7143, 1.0000,  ..., 0.0186, 0.0000, 0.0217],
        [0.7500, 0.5714, 0.2500,  ..., 0.0314, 0.0000, 0.0250]],
       dtype=torch.float64)

In [247]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu' # 학습에 사용할 device 선언
optimizer = torch.optim.Adam(model.parameters(),lr = 1e-5)
ce_loss = nn.CrossEntropyLoss()  # loss함수 선언 cross entrophy loss

model.to(device)
for Epoch in tqdm(range(30)):
    for batch, labels in loader:
        batch = batch.to(device)
        labels = labels.to(device)
        
        output = model(batch) # 모델에 이미지들을 넣은 뒤 값 출력
        loss = ce_loss(output,labels) # loss계산
        loss.backward() # 오차를 역전파
        optimizer.step()
        optimizer.zero_grad()
        acc = compute_acc(labels.detach().cpu().numpy(),output.detach().cpu().numpy().argmax(-1))
        
    if Epoch % 10 == 0 or Epoch == 29:
        print(f'EPOCH : {Epoch}, loss : {loss}, acc : {acc}')

  0%|                                                                                           | 0/30 [00:00<?, ?it/s]


RuntimeError: mat1 and mat2 shapes cannot be multiplied (64x154 and 7114x537)

In [236]:
 torch.tensor(np.array(split_data['2017']['y_train']),dtype=torch.int64)

tensor([416701, 615301, 212102,  ..., 121102, 110105, 521201])

In [237]:
X_train

tensor([416701, 615301, 212102,  ..., 121102, 110105, 521201], device='cuda:0')

In [245]:
torch.tensor(np.array(split_data['2017']['X_train']),dtype=torch.float32)

tensor([[0.5000, 0.5714, 0.7500,  ..., 0.0571, 0.0000, 0.0200],
        [0.7500, 0.7143, 0.7500,  ..., 0.0000, 0.0675, 0.0000],
        [1.0000, 0.7143, 0.7500,  ..., 0.0600, 0.0000, 0.0567],
        ...,
        [0.7500, 0.8571, 1.0000,  ..., 0.0457, 0.0000, 0.0467],
        [0.7500, 0.7143, 1.0000,  ..., 0.0186, 0.0000, 0.0217],
        [0.7500, 0.5714, 0.2500,  ..., 0.0314, 0.0000, 0.0250]])