In [1]:
import os
import pandas as pd 
import numpy as np
import multiprocessing # 여러 개의 일꾼 (cpu)들에게 작업을 분산시키는 역할
from multiprocessing import Pool 
from functools import partial # 함수가 받는 인자들 중 몇개를 고정 시켜서 새롭게 파생된 함수를 형성하는 역할
from data_loader import data_loader # 자체적으로 만든 data loader ([데이콘 15회 대회] 데이터 설명 및 데이터 불러오기 영상 참조)
from tqdm import tqdm
import joblib # 모델을 저장하고 불러오는 역할

import warnings
warnings.filterwarnings('ignore')

In [2]:
def data_loader_all(func, path, train, nrows, **kwargs):
    '''
    Parameters:
    
    func: 하나의 csv파일을 읽는 함수 
    path: [str] train용 또는 test용 csv 파일들이 저장되어 있는 폴더 
    train: [boolean] train용 파일들 불러올 시 True, 아니면 False
    nrows: [int] csv 파일에서 불러올 상위 n개의 row 
    lookup_table: [pd.DataFrame] train_label.csv 파일을 저장한 변수 
    event_time: [int] 상태_B 발생 시간 
    normal: [int] 상태_A의 라벨
    
    Return:
    
    combined_df: 병합된 train 또는 test data
    '''
    
    # 읽어올 파일들만 경로 저장 해놓기 
    files_in_dir = os.listdir(path)
    
    files_path = [path+'/'+file for file in files_in_dir]
    
    if train :
        func_fixed = partial(func, nrows = nrows, train = True, lookup_table = kwargs['lookup_table'], event_time = kwargs['event_time'], normal = kwargs['normal'])
        
    else : 
        func_fixed = partial(func, nrows = nrows, train = False)
    
    
    # 여러개의 코어를 활용하여 데이터 읽기 
    if __name__ == '__main__':
        pool = Pool(processes = multiprocessing.cpu_count()) 
        df_list = list(tqdm(pool.imap(func_fixed, files_path), total = len(files_path)))
        pool.close()
        pool.join()
    
    # 데이터 병합하기 
    combined_df = pd.concat(df_list, ignore_index=True)
    
    return combined_df


In [3]:
train_path = 'train'
test_path  = 'test'
label = pd.read_csv('train_label.csv')

In [4]:
train = data_loader_all(data_loader, path = train_path, train = True, nrows = 150, normal = 999, event_time = 10, lookup_table = label)
test  = data_loader_all(data_loader, path = test_path, train = False, nrows = 60)

100%|████████████████████████████████████████████████████████████████████████████████| 828/828 [01:24<00:00,  9.75it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 720/720 [00:57<00:00, 12.56it/s]


In [5]:
newTrain = train[train['label']!=999]
X = newTrain.drop(['label', 'time', 'id'], axis=1)
y = newTrain['label']

In [6]:
newTest = test[test['time']>=10]
newTest = newTest.drop(['time'], axis=1)

In [7]:
import lightgbm as lgbm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing   import MinMaxScaler

In [8]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=7)

In [9]:
d_train = lgbm.Dataset(X_train, y_train, free_raw_data=False)
d_valid = lgbm.Dataset(X_valid, y_valid, free_raw_data=False)

In [10]:
params = {'task': 'train',
    'boosting_type': 'goss',
    #'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class':198,
    'metric': 'multi_logloss',
    'learning_rate': 0.002233,
    'max_depth': 9,
    #'num_leaves': 75,
    'feature_fraction': 0.7,
    'scale_pos_weight': 1.2,
    #'bagging_fraction': 0.4,
    #'bagging_freq': 10,
    #'bagging_seed': 7,
    'seed': 7,
    'save_binary': True}
    #'device':'gpu'}

In [11]:
evals_result = {}

In [12]:
lgb_clf = lgbm.train(params, d_train, 3000, valid_sets=[d_valid], verbose_eval=50, early_stopping_rounds=100, evals_result=evals_result)

Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_logloss: 2.89327
[100]	valid_0's multi_logloss: 2.31576
[150]	valid_0's multi_logloss: 1.959
[200]	valid_0's multi_logloss: 1.70164
[250]	valid_0's multi_logloss: 1.50211
[300]	valid_0's multi_logloss: 1.34026
[350]	valid_0's multi_logloss: 1.20539
[400]	valid_0's multi_logloss: 1.09089
[450]	valid_0's multi_logloss: 0.992453
[500]	valid_0's multi_logloss: 0.908807
[550]	valid_0's multi_logloss: 0.834904
[600]	valid_0's multi_logloss: 0.769373
[650]	valid_0's multi_logloss: 0.710883
[700]	valid_0's multi_logloss: 0.65864
[750]	valid_0's multi_logloss: 0.61201
[800]	valid_0's multi_logloss: 0.570275
[850]	valid_0's multi_logloss: 0.532816
[900]	valid_0's multi_logloss: 0.499189
[950]	valid_0's multi_logloss: 0.468978
[1000]	valid_0's multi_logloss: 0.441813
[1050]	valid_0's multi_logloss: 0.417309
[1100]	valid_0's multi_logloss: 0.395198
[1150]	valid_0's multi_logloss: 0.375244
[1200]	valid_0's multi_logl

In [26]:
pred = lgb_clf.predict(newTest.drop(['id'], axis=1))

result = pd.DataFrame(data=pred)
result.index = newTest.id
result.index.name = 'id'
result = result.sort_index()
result = result.groupby('id').mean()

"""scaler = MinMaxScaler(feature_range=(0.00001, 0.99999))
tmp    = []

for row in result.iterrows():
    scaler.fit(row[1].to_frame())
    tmp.append([r[0] for r in scaler.transform(row[1].to_frame())])
    
submission = pd.DataFrame(tmp, columns = result.columns)
submission.index = result.index
"""
result.to_csv('submission_last2.csv', index=True)    

In [27]:
result2 = pd.DataFrame(data = lgb_clf.predict(X_valid))

In [28]:
result2.to_csv('lgb3000_pred.csv')

In [25]:
from sklearn.metrics import mean_squared_error, r2_score
r2_score(y_valid, lgb_clf.predict(X_valid))

ValueError: y_true and y_pred have different number of output (1!=198)