In [1]:
import os
import pandas as pd 
import numpy as np
from multiprocessing import Pool 
import multiprocessing
from data_loader import data_loader
from data_loader_v2 import data_loader_v2
from tqdm import tqdm
from functools import partial
from sklearn.ensemble import RandomForestClassifier
import joblib

In [2]:
def data_loader_all(func, path, train, nrows, **kwargs):
    '''
    Parameters:
    
    func: 하나의 csv파일을 읽는 함수 
    path: [str] train용 또는 test용 csv 파일들이 저장되어 있는 폴더 
    train: [boolean] train용 파일들 불러올 시 True, 아니면 False
    nrows: [int] csv 파일에서 불러올 상위 n개의 row 
    lookup_table: [pd.DataFrame] train_label.csv 파일을 저장한 변수 
    event_time: [int] 상태_B 발생 시간 
    normal: [int] 상태_A의 라벨
    
    Return:
    
    combined_df: 병합된 train 또는 test data
    '''
    
    # 읽어올 파일들만 경로 저장 해놓기 
    files_in_dir = os.listdir(path)
    
    files_path = [path+'/'+file for file in files_in_dir]
    
    if train :
        func_fixed = partial(func, nrows = nrows, train = True, lookup_table = kwargs['lookup_table'], event_time = kwargs['event_time'], normal = kwargs['normal'])
        
    else : 
        func_fixed = partial(func, nrows = nrows, train = False)
    
    
    # 여러개의 코어를 활용하여 데이터 읽기 
    if __name__ == '__main__':
        pool = Pool(processes = multiprocessing.cpu_count()) 
        df_list = list(tqdm(pool.imap(func_fixed, files_path), total = len(files_path)))
        pool.close()
        pool.join()
    
    # 데이터 병합하기 
    combined_df = pd.concat(df_list, ignore_index=True)
    
    return combined_df

In [3]:
train_folder = 'train/'
test_folder = 'test/'
train_label_path = 'train_label.csv'

In [4]:
train_list = os.listdir(train_folder)
test_list = os.listdir(test_folder)
train_label = pd.read_csv(train_label_path, index_col=0)

In [5]:
def data_loader_all_v2(func, files, folder='', train_label=None, event_time=10, nrows=60):   
    func_fixed = partial(func, folder=folder, train_label=train_label, event_time=event_time, nrows=nrows)     
    if __name__ == '__main__':
        pool = Pool(processes=multiprocessing.cpu_count()) 
        df_list = list(pool.imap(func_fixed, files)) 
        pool.close()
        pool.join()        
    combined_df = pd.concat(df_list)    
    return combined_df

In [6]:
train = data_loader_all_v2(data_loader_v2, train_list, folder=train_folder, train_label=train_label, event_time=10, nrows=150)

In [7]:
train

Unnamed: 0,V0000,V0001,V0002,V0003,V0004,V0005,V0006,V0007,V0008,V0009,...,V5112,V5113,V5114,V5115,V5116,V5117,V5118,V5119,V5120,label
0,30.474394,8.691177,8.714483,8.687399,8.721230,207.697895,165.865730,-6.018877e-19,0.0,-0.002136,...,1.0,1.0,1.0,60.0,0.0,0.0,1.421620e-05,85.4,0.0,110
0,30.470463,8.736521,8.682769,8.717135,8.682402,192.665080,191.006871,-3.918758e-19,0.0,0.001710,...,1.0,1.0,1.0,60.0,0.0,0.0,-6.114455e-06,85.4,0.0,110
0,30.465427,8.753559,8.663426,8.700049,8.734147,187.065171,192.700238,-1.799179e-19,0.0,0.000493,...,1.0,1.0,1.0,60.0,0.0,0.0,-1.813291e-05,85.4,0.0,110
0,30.458532,8.715056,8.714854,8.717174,8.699257,188.500036,180.150567,-6.636971e-19,0.0,0.000318,...,1.0,1.0,1.0,60.0,0.0,0.0,-5.745568e-07,85.4,0.0,110
0,30.475773,8.790241,8.735125,8.703167,8.721030,193.269046,195.984890,-6.379752e-20,0.0,-0.000091,...,1.0,1.0,1.0,60.0,0.0,0.0,8.437883e-06,85.4,0.0,110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,30.474310,8.770017,8.700827,8.749124,8.720934,178.231694,199.723901,9.011728e-20,0.0,0.001813,...,1.0,1.0,1.0,60.0,0.0,0.0,-1.173679e-05,85.4,0.0,156
99,30.458116,8.536275,8.714917,8.773614,8.702142,193.271535,201.506292,7.890852e-20,0.0,0.001790,...,1.0,1.0,1.0,60.0,0.0,0.0,9.895865e-07,85.4,0.0,156
99,30.480659,8.721801,8.709552,8.696202,8.750277,179.114332,221.118574,-1.963198e-19,0.0,-0.000268,...,1.0,1.0,1.0,60.0,0.0,0.0,8.706569e-06,85.4,0.0,156
99,30.474591,8.740609,8.719405,8.663344,8.708356,197.932121,163.922483,-4.248289e-19,0.0,-0.001411,...,1.0,1.0,1.0,60.0,0.0,0.0,1.367703e-05,85.4,0.0,156


In [8]:
columns = train.columns

In [9]:
real_var = []
for i in columns:
    if len(train[i].unique()) != 1:
        real_var.append(i)

In [10]:
len(real_var)

3565

In [24]:
real_var

['V0000',
 'V0001',
 'V0002',
 'V0003',
 'V0004',
 'V0005',
 'V0006',
 'V0007',
 'V0008',
 'V0009',
 'V0010',
 'V0011',
 'V0012',
 'V0013',
 'V0014',
 'V0015',
 'V0016',
 'V0017',
 'V0018',
 'V0025',
 'V0026',
 'V0027',
 'V0028',
 'V0029',
 'V0030',
 'V0031',
 'V0032',
 'V0033',
 'V0040',
 'V0041',
 'V0042',
 'V0043',
 'V0044',
 'V0045',
 'V0046',
 'V0047',
 'V0048',
 'V0049',
 'V0050',
 'V0051',
 'V0052',
 'V0053',
 'V0054',
 'V0055',
 'V0056',
 'V0057',
 'V0058',
 'V0059',
 'V0060',
 'V0061',
 'V0062',
 'V0063',
 'V0064',
 'V0065',
 'V0066',
 'V0067',
 'V0068',
 'V0069',
 'V0070',
 'V0071',
 'V0072',
 'V0073',
 'V0074',
 'V0075',
 'V0076',
 'V0077',
 'V0078',
 'V0079',
 'V0080',
 'V0081',
 'V0082',
 'V0083',
 'V0084',
 'V0085',
 'V0086',
 'V0087',
 'V0088',
 'V0089',
 'V0090',
 'V0091',
 'V0092',
 'V0093',
 'V0094',
 'V0095',
 'V0096',
 'V0097',
 'V0098',
 'V0099',
 'V0100',
 'V0101',
 'V0102',
 'V0103',
 'V0104',
 'V0105',
 'V0106',
 'V0107',
 'V0108',
 'V0109',
 'V0110',
 'V0111',


In [65]:
write_txt(real_var,'real_var.txt')

In [78]:
file = open('real_var.txt','r')
real_var2 = file.readlines()
real_var2 = list(map(lambda s: s.strip(),real_var2))

In [79]:
len(real_var2)

3565

In [80]:
real_var2

['V0000',
 'V0001',
 'V0002',
 'V0003',
 'V0004',
 'V0005',
 'V0006',
 'V0007',
 'V0008',
 'V0009',
 'V0010',
 'V0011',
 'V0012',
 'V0013',
 'V0014',
 'V0015',
 'V0016',
 'V0017',
 'V0018',
 'V0025',
 'V0026',
 'V0027',
 'V0028',
 'V0029',
 'V0030',
 'V0031',
 'V0032',
 'V0033',
 'V0040',
 'V0041',
 'V0042',
 'V0043',
 'V0044',
 'V0045',
 'V0046',
 'V0047',
 'V0048',
 'V0049',
 'V0050',
 'V0051',
 'V0052',
 'V0053',
 'V0054',
 'V0055',
 'V0056',
 'V0057',
 'V0058',
 'V0059',
 'V0060',
 'V0061',
 'V0062',
 'V0063',
 'V0064',
 'V0065',
 'V0066',
 'V0067',
 'V0068',
 'V0069',
 'V0070',
 'V0071',
 'V0072',
 'V0073',
 'V0074',
 'V0075',
 'V0076',
 'V0077',
 'V0078',
 'V0079',
 'V0080',
 'V0081',
 'V0082',
 'V0083',
 'V0084',
 'V0085',
 'V0086',
 'V0087',
 'V0088',
 'V0089',
 'V0090',
 'V0091',
 'V0092',
 'V0093',
 'V0094',
 'V0095',
 'V0096',
 'V0097',
 'V0098',
 'V0099',
 'V0100',
 'V0101',
 'V0102',
 'V0103',
 'V0104',
 'V0105',
 'V0106',
 'V0107',
 'V0108',
 'V0109',
 'V0110',
 'V0111',


In [81]:
train[real_var2]

Unnamed: 0,V0000,V0001,V0002,V0003,V0004,V0005,V0006,V0007,V0008,V0009,...,V5088,V5089,V5090,V5115,V5116,V5117,V5118,V5119,V5120,label
0,30.474394,8.691177,8.714483,8.687399,8.721230,207.697895,165.865730,-6.018877e-19,0.0,-0.002136,...,-0.199740,-0.155360,43.204967,60.0,0.0,0.0,1.421620e-05,85.4,0.0,110
0,30.470463,8.736521,8.682769,8.717135,8.682402,192.665080,191.006871,-3.918758e-19,0.0,0.001710,...,-0.183798,-0.149832,43.189223,60.0,0.0,0.0,-6.114455e-06,85.4,0.0,110
0,30.465427,8.753559,8.663426,8.700049,8.734147,187.065171,192.700238,-1.799179e-19,0.0,0.000493,...,-0.173975,-0.160714,43.193726,60.0,0.0,0.0,-1.813291e-05,85.4,0.0,110
0,30.458532,8.715056,8.714854,8.717174,8.699257,188.500036,180.150567,-6.636971e-19,0.0,0.000318,...,-0.135186,-0.147648,43.207052,60.0,0.0,0.0,-5.745568e-07,85.4,0.0,110
0,30.475773,8.790241,8.735125,8.703167,8.721030,193.269046,195.984890,-6.379752e-20,0.0,-0.000091,...,-0.193827,-0.141245,43.200405,60.0,0.0,0.0,8.437883e-06,85.4,0.0,110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,30.474310,8.770017,8.700827,8.749124,8.720934,178.231694,199.723901,9.011728e-20,0.0,0.001813,...,-0.210776,-0.182267,43.201399,60.0,0.0,0.0,-1.173679e-05,85.4,0.0,156
99,30.458116,8.536275,8.714917,8.773614,8.702142,193.271535,201.506292,7.890852e-20,0.0,0.001790,...,-0.187443,-0.170508,43.196128,60.0,0.0,0.0,9.895865e-07,85.4,0.0,156
99,30.480659,8.721801,8.709552,8.696202,8.750277,179.114332,221.118574,-1.963198e-19,0.0,-0.000268,...,-0.197799,-0.139520,43.197053,60.0,0.0,0.0,8.706569e-06,85.4,0.0,156
99,30.474591,8.740609,8.719405,8.663344,8.708356,197.932121,163.922483,-4.248289e-19,0.0,-0.001411,...,-0.254232,-0.180736,43.189672,60.0,0.0,0.0,1.367703e-05,85.4,0.0,156


In [13]:
train_red = train[real_var]

In [14]:
test = data_loader_all_v2(data_loader_v2, test_list, folder=test_folder, train_label=None, event_time=10, nrows=60)

In [15]:
test

Unnamed: 0,V0000,V0001,V0002,V0003,V0004,V0005,V0006,V0007,V0008,V0009,...,V5111,V5112,V5113,V5114,V5115,V5116,V5117,V5118,V5119,V5120
1000,30.465741,8.618514,8.705075,8.730912,8.699214,181.327530,201.889419,2.393806e-19,0.0,0.003496,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,0.000004,85.4,0.0
1000,30.477302,8.642689,8.713423,8.732450,8.694666,203.347675,155.790045,-1.808861e-19,0.0,0.001969,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,0.000005,85.4,0.0
1000,30.478336,8.675928,8.729837,8.672877,8.710215,196.673652,227.039249,6.236627e-19,0.0,-0.002462,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,0.000016,85.4,0.0
1000,30.462904,8.733765,8.706455,8.691974,8.696285,194.365551,167.436935,2.845012e-20,0.0,-0.000045,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,-0.000010,85.4,0.0
1000,30.483675,8.807382,8.680733,8.713651,8.664766,205.369347,154.245975,-2.085638e-19,0.0,-0.000713,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,-0.000008,85.4,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999,30.462765,8.636107,8.717540,8.714714,8.673677,209.395256,221.627222,-5.455288e-19,0.0,-0.000699,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,-0.000002,85.4,0.0
999,30.486345,8.687652,8.662559,8.723882,8.720834,193.784891,209.822565,-2.096920e-19,0.0,-0.000815,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,-0.000005,85.4,0.0
999,30.471584,8.658865,8.692842,8.704296,8.719990,172.802618,211.979032,1.417866e-19,0.0,-0.003038,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,0.000006,85.4,0.0
999,30.464907,8.784168,8.716839,8.723631,8.734383,176.388590,158.344648,-5.953593e-19,0.0,-0.001622,...,1.0,1.0,1.0,1.0,60.0,0.0,0.0,0.000003,85.4,0.0


In [16]:
real_var.remove('label')

In [17]:
test = test[real_var]

In [18]:
train_red 

Unnamed: 0,V0000,V0001,V0002,V0003,V0004,V0005,V0006,V0007,V0008,V0009,...,V5088,V5089,V5090,V5115,V5116,V5117,V5118,V5119,V5120,label
0,30.474394,8.691177,8.714483,8.687399,8.721230,207.697895,165.865730,-6.018877e-19,0.0,-0.002136,...,-0.199740,-0.155360,43.204967,60.0,0.0,0.0,1.421620e-05,85.4,0.0,110
0,30.470463,8.736521,8.682769,8.717135,8.682402,192.665080,191.006871,-3.918758e-19,0.0,0.001710,...,-0.183798,-0.149832,43.189223,60.0,0.0,0.0,-6.114455e-06,85.4,0.0,110
0,30.465427,8.753559,8.663426,8.700049,8.734147,187.065171,192.700238,-1.799179e-19,0.0,0.000493,...,-0.173975,-0.160714,43.193726,60.0,0.0,0.0,-1.813291e-05,85.4,0.0,110
0,30.458532,8.715056,8.714854,8.717174,8.699257,188.500036,180.150567,-6.636971e-19,0.0,0.000318,...,-0.135186,-0.147648,43.207052,60.0,0.0,0.0,-5.745568e-07,85.4,0.0,110
0,30.475773,8.790241,8.735125,8.703167,8.721030,193.269046,195.984890,-6.379752e-20,0.0,-0.000091,...,-0.193827,-0.141245,43.200405,60.0,0.0,0.0,8.437883e-06,85.4,0.0,110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,30.474310,8.770017,8.700827,8.749124,8.720934,178.231694,199.723901,9.011728e-20,0.0,0.001813,...,-0.210776,-0.182267,43.201399,60.0,0.0,0.0,-1.173679e-05,85.4,0.0,156
99,30.458116,8.536275,8.714917,8.773614,8.702142,193.271535,201.506292,7.890852e-20,0.0,0.001790,...,-0.187443,-0.170508,43.196128,60.0,0.0,0.0,9.895865e-07,85.4,0.0,156
99,30.480659,8.721801,8.709552,8.696202,8.750277,179.114332,221.118574,-1.963198e-19,0.0,-0.000268,...,-0.197799,-0.139520,43.197053,60.0,0.0,0.0,8.706569e-06,85.4,0.0,156
99,30.474591,8.740609,8.719405,8.663344,8.708356,197.932121,163.922483,-4.248289e-19,0.0,-0.001411,...,-0.254232,-0.180736,43.189672,60.0,0.0,0.0,1.367703e-05,85.4,0.0,156


In [19]:
X = train_red.drop(['label'],axis=1)

In [20]:
Y = train['label']

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)


In [20]:
from sklearn.ensemble import RandomForestClassifier
from pprint import pprint

In [40]:
rf = RandomForestClassifier()

In [42]:
print('Parameters currently in use:\n')
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 'warn',
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [43]:
from sklearn.model_selection import RandomizedSearchCV
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [46]:
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

In [48]:
rf_random.fit(X_train, Y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   54.4s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  1.5min


MemoryError: 

In [49]:
rf.fit(X_train, Y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [51]:
rf.score(X_test,Y_test)

0.9034247757073844

In [21]:
rf2 = RandomForestClassifier(n_estimators = 200)

In [22]:
rf2.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [23]:
rf2.score(X_test,Y_test)

0.9449189095928227

In [20]:
rf3 = RandomForestClassifier(random_state=0, verbose=1, n_jobs=-1,n_estimators = 400)

In [21]:
rf3.fit(X_train,Y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 15.7min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=1,
                       warm_start=False)

In [22]:
rf3.score(X_test,Y_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.7s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    2.8s
[Parallel(n_jobs=12)]: Done 400 out of 400 | elapsed:    6.0s finished


0.9439268461007592

In [23]:
rf4 = RandomForestClassifier(random_state=0, verbose=1, n_jobs=-1, n_estimators = 600)

In [25]:
rf4.fit(X_train,Y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 16.6min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed: 23.2min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=600,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=1,
                       warm_start=False)

In [26]:
rf4.score(X_test,Y_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:   16.9s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:   23.7s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:   27.7s
[Parallel(n_jobs=12)]: Done 600 out of 600 | elapsed:   30.2s finished


0.9452639751552795

In [20]:
rf5 = RandomForestClassifier(random_state=0, verbose=1, n_jobs=-1, n_estimators = 800)

In [21]:
rf5.fit(X_train,Y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 16.8min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 30.4min
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed: 31.2min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=800,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=1,
                       warm_start=False)

In [22]:
rf5.score(X_test,Y_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.5s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    4.2s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:   13.1s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:   18.1s
[Parallel(n_jobs=12)]: Done 800 out of 800 | elapsed:   18.4s finished


0.9456521739130435

In [20]:
rf5 = RandomForestClassifier(random_state=0, verbose=1, n_jobs=-1, n_estimators = 1000)

In [21]:
rf5.fit(X_train,Y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 16.6min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 30.2min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 38.9min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=1,
                       warm_start=False)

In [22]:
rf5.score(X_test,Y_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.7s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    7.4s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:   32.8s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:  1.1min
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:  1.1min finished


0.9460403726708074

In [20]:
rf5 = RandomForestClassifier(random_state=0, verbose=1, n_jobs=-1, n_estimators = 2000)

In [21]:
rf5.fit(X_train,Y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 17.0min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 30.7min


MemoryError: could not allocate 25952256 bytes

In [3]:
rf5 = RandomForestClassifier(random_state=0, verbose=1, n_jobs=-1, n_estimators = 1000, min_samples_split = 5,
                            min_samples_leaf = 2, bootstrap = False)

In [22]:
rf5.fit(X_train,Y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 24.6min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 44.8min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 57.6min finished


RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=1,
                       warm_start=False)

In [23]:
rf5.score(X_test,Y_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.6s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    7.4s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:   16.1s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:   24.1s
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:   27.6s finished


0.9559178743961353

In [25]:
pred = rf5.predict_proba(test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.7s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    6.8s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:   17.3s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:   25.5s
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:   31.1s finished


In [26]:
submission = pd.DataFrame(data=pred)
submission.index = test.index
submission.index.name = 'id'
submission = submission.sort_index()

In [27]:
submission = submission.groupby('id').mean()

In [28]:
submission.to_csv('submission13.csv', index=True)

In [29]:
submission

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,188,189,190,191,192,193,194,195,196,197
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
828,0.000067,0.000040,0.000073,0.000063,0.000025,0.000040,0.000052,0.000013,0.000340,0.000045,...,0.000273,0.000183,0.000222,0.000227,0.000277,0.000338,0.000327,0.000195,0.000012,0.000592
829,0.000000,0.000042,0.000043,0.000027,0.000028,0.000027,0.000052,0.000020,0.000125,0.000098,...,0.000040,0.000070,0.000157,0.000062,0.000092,0.000080,0.000053,0.000060,0.000027,0.000107
830,0.000020,0.000000,0.000010,0.000030,0.000010,0.000157,0.000000,0.000000,0.000340,0.000115,...,0.000035,0.000005,0.000020,0.000020,0.000085,0.000020,0.000067,0.000082,0.000110,0.000032
831,0.000347,0.000163,0.000277,0.000258,0.056668,0.064283,0.064763,0.071025,0.004100,0.000053,...,0.000143,0.002747,0.001172,0.003318,0.001277,0.000867,0.000172,0.000080,0.000000,0.000337
832,0.000590,0.000517,0.000547,0.000877,0.000572,0.000443,0.000538,0.000480,0.000160,0.003015,...,0.000005,0.000200,0.000000,0.000092,0.000070,0.000045,0.000005,0.000030,0.000005,0.000018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1543,0.002393,0.001613,0.003557,0.002620,0.003008,0.006188,0.002842,0.002577,0.000777,0.003578,...,0.000070,0.000850,0.000130,0.000755,0.000108,0.000035,0.000020,0.000055,0.000010,0.000158
1544,0.000288,0.000138,0.000143,0.000163,0.000148,0.000045,0.000067,0.000005,0.002855,0.000107,...,0.786638,0.001138,0.065512,0.001058,0.011438,0.013070,0.006710,0.001237,0.000055,0.002357
1545,0.000265,0.000268,0.000207,0.000135,0.000017,0.000018,0.000052,0.000042,0.005712,0.000302,...,0.001173,0.000942,0.001008,0.001140,0.002052,0.003222,0.001322,0.002302,0.000057,0.005967
1546,0.000030,0.000147,0.000062,0.000005,0.000000,0.000000,0.000010,0.000000,0.000000,0.000010,...,0.000040,0.000000,0.000067,0.000000,0.000015,0.000010,0.000033,0.000005,0.000000,0.000000
