In [1]:
import os
import pandas as pd 
import numpy as np
from multiprocessing import Pool 
import multiprocessing
from data_loader import data_loader
from data_loader_v2 import data_loader_v2
from tqdm import tqdm
from functools import partial
from sklearn.ensemble import RandomForestClassifier
import joblib

In [2]:
def data_loader_all(func, path, train, nrows, **kwargs):
    '''
    Parameters:
    
    func: 하나의 csv파일을 읽는 함수 
    path: [str] train용 또는 test용 csv 파일들이 저장되어 있는 폴더 
    train: [boolean] train용 파일들 불러올 시 True, 아니면 False
    nrows: [int] csv 파일에서 불러올 상위 n개의 row 
    lookup_table: [pd.DataFrame] train_label.csv 파일을 저장한 변수 
    event_time: [int] 상태_B 발생 시간 
    normal: [int] 상태_A의 라벨
    
    Return:
    
    combined_df: 병합된 train 또는 test data
    '''
    
    # 읽어올 파일들만 경로 저장 해놓기 
    files_in_dir = os.listdir(path)
    
    files_path = [path+'/'+file for file in files_in_dir]
    
    if train :
        func_fixed = partial(func, nrows = nrows, train = True, lookup_table = kwargs['lookup_table'], event_time = kwargs['event_time'], normal = kwargs['normal'])
        
    else : 
        func_fixed = partial(func, nrows = nrows, train = False)
    
    
    # 여러개의 코어를 활용하여 데이터 읽기 
    if __name__ == '__main__':
        pool = Pool(processes = multiprocessing.cpu_count()) 
        df_list = list(tqdm(pool.imap(func_fixed, files_path), total = len(files_path)))
        pool.close()
        pool.join()
    
    # 데이터 병합하기 
    combined_df = pd.concat(df_list, ignore_index=True)
    
    return combined_df

In [3]:
train_folder = 'train/'
test_folder = 'test/'
train_label_path = 'train_label.csv'

In [4]:
train_list = os.listdir(train_folder)
test_list = os.listdir(test_folder)
train_label = pd.read_csv(train_label_path, index_col=0)

In [5]:
def data_loader_all_v2(func, files, folder='', train_label=None, event_time=10, nrows=60):   
    func_fixed = partial(func, folder=folder, train_label=train_label, event_time=event_time, nrows=nrows)     
    if __name__ == '__main__':
        pool = Pool(processes=multiprocessing.cpu_count()) 
        df_list = list(pool.imap(func_fixed, files)) 
        pool.close()
        pool.join()        
    combined_df = pd.concat(df_list)    
    return combined_df

In [6]:
train = data_loader_all_v2(data_loader_v2, train_list, folder=train_folder, train_label=train_label, event_time=10, nrows=150)

In [7]:
train

Unnamed: 0,V0000,V0001,V0002,V0003,V0004,V0005,V0006,V0007,V0008,V0009,...,V5112,V5113,V5114,V5115,V5116,V5117,V5118,V5119,V5120,label
0,30.474394,8.691177,8.714483,8.687399,8.721230,207.697895,165.865730,-6.018877e-19,0.0,-0.002136,...,1.0,1.0,1.0,60.0,0.0,0.0,1.421620e-05,85.4,0.0,110
0,30.470463,8.736521,8.682769,8.717135,8.682402,192.665080,191.006871,-3.918758e-19,0.0,0.001710,...,1.0,1.0,1.0,60.0,0.0,0.0,-6.114455e-06,85.4,0.0,110
0,30.465427,8.753559,8.663426,8.700049,8.734147,187.065171,192.700238,-1.799179e-19,0.0,0.000493,...,1.0,1.0,1.0,60.0,0.0,0.0,-1.813291e-05,85.4,0.0,110
0,30.458532,8.715056,8.714854,8.717174,8.699257,188.500036,180.150567,-6.636971e-19,0.0,0.000318,...,1.0,1.0,1.0,60.0,0.0,0.0,-5.745568e-07,85.4,0.0,110
0,30.475773,8.790241,8.735125,8.703167,8.721030,193.269046,195.984890,-6.379752e-20,0.0,-0.000091,...,1.0,1.0,1.0,60.0,0.0,0.0,8.437883e-06,85.4,0.0,110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,30.474310,8.770017,8.700827,8.749124,8.720934,178.231694,199.723901,9.011728e-20,0.0,0.001813,...,1.0,1.0,1.0,60.0,0.0,0.0,-1.173679e-05,85.4,0.0,156
99,30.458116,8.536275,8.714917,8.773614,8.702142,193.271535,201.506292,7.890852e-20,0.0,0.001790,...,1.0,1.0,1.0,60.0,0.0,0.0,9.895865e-07,85.4,0.0,156
99,30.480659,8.721801,8.709552,8.696202,8.750277,179.114332,221.118574,-1.963198e-19,0.0,-0.000268,...,1.0,1.0,1.0,60.0,0.0,0.0,8.706569e-06,85.4,0.0,156
99,30.474591,8.740609,8.719405,8.663344,8.708356,197.932121,163.922483,-4.248289e-19,0.0,-0.001411,...,1.0,1.0,1.0,60.0,0.0,0.0,1.367703e-05,85.4,0.0,156


In [8]:
columns = train.columns

In [9]:
real_var = []
for i in columns:
    if len(train[i].unique()) != 1:
        real_var.append(i)

In [10]:
len(real_var)

3565

In [11]:
train_red = train[real_var]

In [12]:
test = data_loader_all_v2(data_loader_v2, test_list, folder=test_folder, train_label=None, event_time=10, nrows=60)

In [13]:
test

Unnamed: 0,V0000,V0001,V0002,V0003,V0004,V0005,V0006,V0007,V0008,V0009,...,V5111,V5112,V5113,V5114,V5115,V5116,V5117,V5118,V5119,V5120
1000,30.465741,8.618514,8.705075,8.730912,8.699214,181.327530,201.889419,2.393806e-19,0.0,0.003496,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,0.000004,85.4,0.0
1000,30.477302,8.642689,8.713423,8.732450,8.694666,203.347675,155.790045,-1.808861e-19,0.0,0.001969,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,0.000005,85.4,0.0
1000,30.478336,8.675928,8.729837,8.672877,8.710215,196.673652,227.039249,6.236627e-19,0.0,-0.002462,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,0.000016,85.4,0.0
1000,30.462904,8.733765,8.706455,8.691974,8.696285,194.365551,167.436935,2.845012e-20,0.0,-0.000045,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,-0.000010,85.4,0.0
1000,30.483675,8.807382,8.680733,8.713651,8.664766,205.369347,154.245975,-2.085638e-19,0.0,-0.000713,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,-0.000008,85.4,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999,30.462765,8.636107,8.717540,8.714714,8.673677,209.395256,221.627222,-5.455288e-19,0.0,-0.000699,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,-0.000002,85.4,0.0
999,30.486345,8.687652,8.662559,8.723882,8.720834,193.784891,209.822565,-2.096920e-19,0.0,-0.000815,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,-0.000005,85.4,0.0
999,30.471584,8.658865,8.692842,8.704296,8.719990,172.802618,211.979032,1.417866e-19,0.0,-0.003038,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,0.000006,85.4,0.0
999,30.464907,8.784168,8.716839,8.723631,8.734383,176.388590,158.344648,-5.953593e-19,0.0,-0.001622,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,0.000003,85.4,0.0


In [14]:
real_var.remove('label')

In [15]:
test = test[real_var]

In [17]:
train_red 

Unnamed: 0,V0000,V0001,V0002,V0003,V0004,V0005,V0006,V0007,V0008,V0009,...,V5088,V5089,V5090,V5115,V5116,V5117,V5118,V5119,V5120,label
0,30.474394,8.691177,8.714483,8.687399,8.721230,207.697895,165.865730,-6.018877e-19,0.0,-0.002136,...,-0.199740,-0.155360,43.204967,60.0,0.0,0.0,1.421620e-05,85.4,0.0,110
0,30.470463,8.736521,8.682769,8.717135,8.682402,192.665080,191.006871,-3.918758e-19,0.0,0.001710,...,-0.183798,-0.149832,43.189223,60.0,0.0,0.0,-6.114455e-06,85.4,0.0,110
0,30.465427,8.753559,8.663426,8.700049,8.734147,187.065171,192.700238,-1.799179e-19,0.0,0.000493,...,-0.173975,-0.160714,43.193726,60.0,0.0,0.0,-1.813291e-05,85.4,0.0,110
0,30.458532,8.715056,8.714854,8.717174,8.699257,188.500036,180.150567,-6.636971e-19,0.0,0.000318,...,-0.135186,-0.147648,43.207052,60.0,0.0,0.0,-5.745568e-07,85.4,0.0,110
0,30.475773,8.790241,8.735125,8.703167,8.721030,193.269046,195.984890,-6.379752e-20,0.0,-0.000091,...,-0.193827,-0.141245,43.200405,60.0,0.0,0.0,8.437883e-06,85.4,0.0,110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,30.474310,8.770017,8.700827,8.749124,8.720934,178.231694,199.723901,9.011728e-20,0.0,0.001813,...,-0.210776,-0.182267,43.201399,60.0,0.0,0.0,-1.173679e-05,85.4,0.0,156
99,30.458116,8.536275,8.714917,8.773614,8.702142,193.271535,201.506292,7.890852e-20,0.0,0.001790,...,-0.187443,-0.170508,43.196128,60.0,0.0,0.0,9.895865e-07,85.4,0.0,156
99,30.480659,8.721801,8.709552,8.696202,8.750277,179.114332,221.118574,-1.963198e-19,0.0,-0.000268,...,-0.197799,-0.139520,43.197053,60.0,0.0,0.0,8.706569e-06,85.4,0.0,156
99,30.474591,8.740609,8.719405,8.663344,8.708356,197.932121,163.922483,-4.248289e-19,0.0,-0.001411,...,-0.254232,-0.180736,43.189672,60.0,0.0,0.0,1.367703e-05,85.4,0.0,156


In [18]:
X = train_red.drop(['label'],axis=1)

In [19]:
Y = train['label']

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)


In [29]:
model = RandomForestClassifier(n_estimators=100,random_state=0, verbose=1, n_jobs=-1)
model.fit(X_train, Y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.9min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=1,
                       warm_start=False)

In [30]:
model.score(X_test,Y_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.4s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    1.4s finished


0.9402173913043478

In [34]:
X = train_red.drop(['label'],axis=1)
Y = train['label']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [32]:
model = RandomForestClassifier(random_state=0, verbose=1, n_jobs=-1)
model.fit(X_train, Y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   26.0s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                       oob_score=False, random_state=0, verbose=1,
                       warm_start=False)

In [21]:
from lightgbm import LGBMClassifier, plot_importance

In [22]:
lgb = LGBMClassifier(objective='multiclass',learning_rate=0.01)
evals = [(X_test, Y_test)]
lgb.fit(X_train,Y_train, early_stopping_rounds = 10, eval_metric = 'logloss', eval_set = evals, verbose =True)

[1]	valid_0's multi_logloss: 4.00643
Training until validation scores don't improve for 10 rounds
[2]	valid_0's multi_logloss: 3.76531
[3]	valid_0's multi_logloss: 3.58216
[4]	valid_0's multi_logloss: 3.43284
[5]	valid_0's multi_logloss: 3.30584
[6]	valid_0's multi_logloss: 3.19513
[7]	valid_0's multi_logloss: 3.0965
[8]	valid_0's multi_logloss: 3.00769
[9]	valid_0's multi_logloss: 2.92694
[10]	valid_0's multi_logloss: 2.85301
[11]	valid_0's multi_logloss: 2.78463
[12]	valid_0's multi_logloss: 2.72092
[13]	valid_0's multi_logloss: 2.66131
[14]	valid_0's multi_logloss: 2.60532
[15]	valid_0's multi_logloss: 2.55268
[16]	valid_0's multi_logloss: 2.50293
[17]	valid_0's multi_logloss: 2.45579
[18]	valid_0's multi_logloss: 2.4108
[19]	valid_0's multi_logloss: 2.36795
[20]	valid_0's multi_logloss: 2.32682
[21]	valid_0's multi_logloss: 2.28753
[22]	valid_0's multi_logloss: 2.24989
[23]	valid_0's multi_logloss: 2.21364
[24]	valid_0's multi_logloss: 2.17874
[25]	valid_0's multi_logloss: 2.14523


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.01, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31,
               objective='multiclass', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [24]:
flg, ax = plt.subplots(fig_size=(10,6))
plot_importance(lgb, ax= ax)

NameError: name 'plt' is not defined

In [23]:
lgb.score(X_test,Y_test)

0.9452208419599724

In [31]:
model.fit(X,Y)

NameError: name 'model' is not defined

In [25]:
pred = lgb.predict_proba(test)

In [26]:
submission = pd.DataFrame(data=pred)
submission.index = test.index
submission.index.name = 'id'
submission = submission.sort_index()

In [27]:
submission

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,188,189,190,191,192,193,194,195,196,197
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
828,0.001161,0.001105,0.001115,0.001682,0.001718,0.001723,0.001728,0.001754,0.006972,0.001669,...,0.000543,0.000604,0.000548,0.000548,0.001764,0.001166,0.000543,0.000590,0.000569,0.000568
828,0.001144,0.001088,0.001098,0.001657,0.001692,0.001697,0.001702,0.001727,0.006866,0.001644,...,0.000534,0.000595,0.000539,0.000539,0.001737,0.001149,0.000534,0.000583,0.000560,0.000646
828,0.001096,0.001043,0.001053,0.001588,0.001622,0.001626,0.001631,0.001655,0.006580,0.001575,...,0.000512,0.000570,0.000517,0.000517,0.001665,0.001101,0.000512,0.000557,0.000537,0.003461
828,0.001125,0.001070,0.001080,0.001629,0.001664,0.001669,0.001674,0.001699,0.006752,0.001616,...,0.000525,0.000585,0.000530,0.000530,0.001709,0.001130,0.000525,0.000572,0.000551,0.000618
828,0.001110,0.001056,0.001066,0.001608,0.001642,0.001647,0.001652,0.001676,0.006663,0.001595,...,0.000519,0.000577,0.000523,0.000523,0.001686,0.001115,0.000519,0.000564,0.000544,0.001188
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1547,0.001291,0.001228,0.001240,0.001870,0.001910,0.001915,0.001921,0.001949,0.007748,0.001855,...,0.000603,0.000671,0.000609,0.000609,0.001961,0.001296,0.000603,0.000656,0.000632,0.000631
1547,0.001291,0.001228,0.001240,0.001870,0.001910,0.001916,0.001921,0.001950,0.007749,0.001855,...,0.000603,0.000671,0.000609,0.000609,0.001961,0.001297,0.000603,0.000656,0.000633,0.000632
1547,0.001291,0.001228,0.001240,0.001870,0.001910,0.001916,0.001921,0.001950,0.007749,0.001855,...,0.000603,0.000671,0.000609,0.000609,0.001961,0.001297,0.000603,0.000656,0.000633,0.000632
1547,0.001290,0.001227,0.001239,0.001869,0.001908,0.001914,0.001920,0.001948,0.007742,0.001854,...,0.000603,0.000671,0.000608,0.000608,0.001959,0.001295,0.000603,0.000656,0.000656,0.000631


In [43]:
def max_or_min(arr):
    if arr.mean() > 0.5:
        return arr.max()
    else:
        return arr.min()

In [72]:
from collections import Counter
def mode(x):
    # 최빈값이 하나보다 많다면 list를 반환
    counts = Counter(x)
    max_count = max(counts.values())
    return [x_i for x_i, count in counts.items() if count == max_count][0]

In [28]:
submission = submission.groupby('id').mean()

In [73]:
submission = submission.groupby('id').agg(mode)

In [90]:
submission = submission.groupby('id').sum()

In [29]:
submission

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,188,189,190,191,192,193,194,195,196,197
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
828,0.001228,0.001169,0.001210,0.001779,0.001817,0.001823,0.001828,0.001860,0.007373,0.001765,...,0.000574,0.000639,0.000579,0.000579,0.001866,0.001249,0.000578,0.000634,0.000602,0.000955
829,0.001312,0.001249,0.001263,0.001901,0.001945,0.001947,0.002027,0.001982,0.007876,0.001886,...,0.000613,0.000682,0.000619,0.000619,0.001993,0.001329,0.000626,0.000667,0.000643,0.000642
830,0.001291,0.001228,0.001240,0.001870,0.001913,0.001916,0.001927,0.001950,0.007749,0.001855,...,0.000603,0.000671,0.000609,0.000609,0.001961,0.001325,0.000624,0.000656,0.000827,0.000632
831,0.001327,0.001263,0.001297,0.001922,0.002264,0.004040,0.002809,0.002004,0.007964,0.001907,...,0.000620,0.000690,0.000626,0.000626,0.002016,0.001372,0.000624,0.000681,0.000650,0.000705
832,0.001798,0.001368,0.001317,0.002517,0.001950,0.002105,0.002086,0.002097,0.007614,0.002388,...,0.000593,0.000660,0.000598,0.000598,0.001927,0.001292,0.000617,0.000649,0.000626,0.000658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1543,0.001337,0.001473,0.001418,0.001935,0.002190,0.002097,0.002070,0.002344,0.008001,0.002374,...,0.000623,0.000693,0.000629,0.000629,0.002025,0.001376,0.000629,0.000686,0.000653,0.000980
1544,0.001156,0.001100,0.001115,0.001674,0.001710,0.001715,0.001720,0.001748,0.006936,0.001661,...,0.522627,0.000601,0.000545,0.000545,0.001755,0.004679,0.000820,0.000597,0.000577,0.000632
1545,0.001297,0.001234,0.001245,0.001879,0.001919,0.001924,0.001930,0.001959,0.007785,0.001864,...,0.000606,0.000674,0.000612,0.000612,0.001970,0.001310,0.000630,0.000666,0.000635,0.000690
1546,0.001288,0.001225,0.001237,0.001865,0.001905,0.001911,0.001916,0.001945,0.007729,0.001850,...,0.000602,0.000670,0.000607,0.000607,0.001956,0.001299,0.000628,0.000661,0.000637,0.000630


In [84]:
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()

In [87]:
submission2 = min_max_scaler.fit_transform(submission)

In [91]:
submission[:] = submission2

In [33]:
submission

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,188,189,190,191,192,193,194,195,196,197
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
828,0.001228,0.001169,0.001210,0.001779,0.001817,0.001823,0.001828,0.001860,0.007373,0.001765,...,0.000574,0.000639,0.000579,0.000579,0.001866,0.001249,0.000578,0.000634,0.000602,0.000955
829,0.001312,0.001249,0.001263,0.001901,0.001945,0.001947,0.002027,0.001982,0.007876,0.001886,...,0.000613,0.000682,0.000619,0.000619,0.001993,0.001329,0.000626,0.000667,0.000643,0.000642
830,0.001291,0.001228,0.001240,0.001870,0.001913,0.001916,0.001927,0.001950,0.007749,0.001855,...,0.000603,0.000671,0.000609,0.000609,0.001961,0.001325,0.000624,0.000656,0.000827,0.000632
831,0.001327,0.001263,0.001297,0.001922,0.002264,0.004040,0.002809,0.002004,0.007964,0.001907,...,0.000620,0.000690,0.000626,0.000626,0.002016,0.001372,0.000624,0.000681,0.000650,0.000705
832,0.001798,0.001368,0.001317,0.002517,0.001950,0.002105,0.002086,0.002097,0.007614,0.002388,...,0.000593,0.000660,0.000598,0.000598,0.001927,0.001292,0.000617,0.000649,0.000626,0.000658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1543,0.001337,0.001473,0.001418,0.001935,0.002190,0.002097,0.002070,0.002344,0.008001,0.002374,...,0.000623,0.000693,0.000629,0.000629,0.002025,0.001376,0.000629,0.000686,0.000653,0.000980
1544,0.001156,0.001100,0.001115,0.001674,0.001710,0.001715,0.001720,0.001748,0.006936,0.001661,...,0.522627,0.000601,0.000545,0.000545,0.001755,0.004679,0.000820,0.000597,0.000577,0.000632
1545,0.001297,0.001234,0.001245,0.001879,0.001919,0.001924,0.001930,0.001959,0.007785,0.001864,...,0.000606,0.000674,0.000612,0.000612,0.001970,0.001310,0.000630,0.000666,0.000635,0.000690
1546,0.001288,0.001225,0.001237,0.001865,0.001905,0.001911,0.001916,0.001945,0.007729,0.001850,...,0.000602,0.000670,0.000607,0.000607,0.001956,0.001299,0.000628,0.000661,0.000637,0.000630


In [30]:
submission.to_csv('submission11.csv', index=True)

In [35]:
train2 = data_loader_all_v2(data_loader_v2, train_list, folder='train_fix/', train_label=train_label, event_time=10, nrows=100)

In [38]:
train2 = train2.drop(['time'],axis=1)

In [39]:
columns2 = train2.columns

In [40]:
real_var = []
for i in columns:
    if len(train[i].unique()) != 1:
        real_var.append(i)

In [41]:
train2_red = train2[real_var]

In [44]:
X2 = train2_red.drop(['label'],axis=1)
Y2 = train2['label']

In [45]:
model.fit(X2,Y2)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.5min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=1,
                       warm_start=False)

In [46]:
pred = model.predict_proba(test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.7s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    2.2s finished


In [52]:
submission = pd.DataFrame(data=pred)
submission.index = test.index
submission.index.name = 'id'
submission = submission.sort_index()
submission = submission.groupby('id').median()

In [53]:
submission

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,188,189,190,191,192,193,194,195,196,197
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
828,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.0,...,0.000,0.0,0.000,0.0,0.0,0.00,0.00,0.0,0.0,0.0
829,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.0,...,0.000,0.0,0.000,0.0,0.0,0.00,0.00,0.0,0.0,0.0
830,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.0,...,0.000,0.0,0.000,0.0,0.0,0.00,0.00,0.0,0.0,0.0
831,0.0,0.0,0.0,0.0,0.06,0.11,0.07,0.06,0.0,0.0,...,0.000,0.0,0.000,0.0,0.0,0.00,0.00,0.0,0.0,0.0
832,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.0,...,0.000,0.0,0.000,0.0,0.0,0.00,0.00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1543,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.0,...,0.000,0.0,0.000,0.0,0.0,0.00,0.00,0.0,0.0,0.0
1544,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.0,...,0.795,0.0,0.095,0.0,0.0,0.01,0.01,0.0,0.0,0.0
1545,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.0,...,0.000,0.0,0.000,0.0,0.0,0.00,0.00,0.0,0.0,0.0
1546,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.0,...,0.000,0.0,0.000,0.0,0.0,0.00,0.00,0.0,0.0,0.0


In [99]:
X[:] = min_max_scaler.fit_transform(X)

In [125]:
X

Unnamed: 0,V0000,V0001,V0002,V0003,V0004,V0005,V0006,V0007,V0008,V0009,...,V5087,V5088,V5089,V5090,V5115,V5116,V5117,V5118,V5119,V5120
0,0.889974,0.411781,0.415645,0.437885,0.440912,0.697332,0.578185,0.000497,0.0,0.000711,...,0.609464,0.106968,0.019088,0.624051,0.999945,0.0,0.0,9.908259e-07,1.0,0.0
0,0.889859,0.415784,0.412731,0.440492,0.437503,0.657102,0.635361,0.000497,0.0,0.000726,...,0.437170,0.121571,0.020687,0.308925,0.999945,0.0,0.0,7.031755e-07,1.0,0.0
0,0.889712,0.417289,0.410955,0.438994,0.442046,0.642116,0.639212,0.000497,0.0,0.000721,...,0.243139,0.130569,0.017540,0.399049,0.999945,0.0,0.0,5.331311e-07,1.0,0.0
0,0.889510,0.413889,0.415679,0.440496,0.438982,0.645956,0.610672,0.000497,0.0,0.000720,...,0.601975,0.166099,0.021319,0.665786,0.999945,0.0,0.0,7.815573e-07,1.0,0.0
0,0.890014,0.420528,0.417541,0.439268,0.440894,0.658718,0.646682,0.000497,0.0,0.000719,...,0.310188,0.112384,0.023171,0.532748,0.999945,0.0,0.0,9.090708e-07,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,0.889971,0.418742,0.414390,0.443297,0.440886,0.618477,0.655186,0.000497,0.0,0.000726,...,0.449968,0.096859,0.011306,0.552642,0.999945,0.0,0.0,6.236273e-07,1.0,0.0
99,0.889498,0.398103,0.415684,0.445444,0.439236,0.658725,0.659239,0.000497,0.0,0.000726,...,0.397810,0.118232,0.014707,0.447141,0.999945,0.0,0.0,8.036878e-07,1.0,0.0
99,0.890157,0.414485,0.415192,0.438657,0.443462,0.620839,0.703842,0.000497,0.0,0.000718,...,0.361742,0.108746,0.023670,0.465659,0.999945,0.0,0.0,9.128723e-07,1.0,0.0
99,0.889979,0.416145,0.416097,0.435776,0.439781,0.671197,0.573766,0.000497,0.0,0.000714,...,0.687184,0.057054,0.011749,0.317903,0.999945,0.0,0.0,9.831975e-07,1.0,0.0


In [100]:
model.fit(X,Y)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  4.2min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=1,
                       warm_start=False)