In [1]:
from sklearn.decomposition import PCA # 차원축소
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler

from imblearn.under_sampling import * # 임벨런스
from imblearn.over_sampling import * # 임벨런스
from imblearn.combine import * # 임벨런스

import os
import copy
import pandas as pd

from jtlearn import Preprocessing
from ensemble import BinaryCalssifier, Regressor

In [2]:
# Load data
save_path = 'submission/'
base_path = 'data/'

train = pd.read_csv(base_path + 'train.csv')
test = pd.read_csv(base_path + 'test.csv')
submission = pd.read_csv(base_path + 'sample_submission.csv')

# Preprocessing

# Missing values are replaced with 0 in BOTH frames so that the test rows go
# through exactly the same treatment as the rows the scalers are fitted on.
train = train.fillna(0)
test = test.fillna(0)

X = train.drop(columns=["ID", "Y_LABEL"])
y = train["Y_LABEL"]
test = test.drop(columns=['ID'])

# Every column except the categorical one is treated as numeric.
train_num_cols = X.drop(columns=['COMPONENT_ARBITRARY']).columns.tolist()
test_num_cols = test.drop(columns=['COMPONENT_ARBITRARY']).columns.tolist()

# ss scales all numeric training columns; ss2 covers only the columns that the
# test set has, so it can be applied to the (narrower) test frame.
ss = StandardScaler()
ss2 = StandardScaler()

# Order matters: ss2 has to be fitted on the raw training values, i.e. before
# X is overwritten with its scaled version on the next line.
ss2.fit(X[test_num_cols])
X[train_num_cols] = ss.fit_transform(X[train_num_cols])
test[test_num_cols] = ss2.transform(test[test_num_cols])

# Integer codes for the categorical column; a label missing from this mapping
# becomes NaN after .map().
component_codes = {"COMPONENT1" : 1, "COMPONENT2" : 2, "COMPONENT3" : 3, "COMPONENT4" : 4}
X.COMPONENT_ARBITRARY = X.COMPONENT_ARBITRARY.map(component_codes)
test.COMPONENT_ARBITRARY = test.COMPONENT_ARBITRARY.map(component_codes)

In [3]:
# Preparation for preprocessing: candidate resamplers, grouped by strategy.
sampler_dic = {
    "under": {
        'RandomUnderSampler': RandomUnderSampler,
        'TomekLinks': TomekLinks,
        # 'CondensedNearestNeighbour': CondensedNearestNeighbour,  (left disabled)
        'OneSidedSelection': OneSidedSelection,
        'EditedNearestNeighbours': EditedNearestNeighbours,
        'NeighbourhoodCleaningRule': NeighbourhoodCleaningRule
    },

    # NeighbourhoodCleaningRule is an under-sampling method, so it is listed
    # only in the "under" group above; SMOTE takes its place among the
    # over-samplers.
    "over": {
        'RandomOverSampler': RandomOverSampler,
        'ADASYN': ADASYN,
        'SMOTE': SMOTE
    },

    "hybrid": {
        'SMOTEENN': SMOTEENN,
        'SMOTETomek': SMOTETomek
    }
}
# Example values for the "sampler" entry:
#   ("under", "RandomUnderSampler")
#   ("over", "RandomOverSampler")
#   ("hybrid", "SMOTEENN")
# A single sampler is configured per run.
variable_dict = dict(
    categorical_feature="COMPONENT_ARBITRARY",
    test_size=0.2,
    learner=("classification", "LGBM"),
    sampler=("hybrid", "SMOTEENN"),
    random_state_=42,
    dimensionality=PCA,
)

In [4]:
# Build the project's preprocessing helper from the configuration dict.
first_try = Preprocessing(**variable_dict)
print()

# Sampling -> grouping -> splitting
# sampling() returns the resampled features and labels (the recorded output
# reports SMOTEENN being applied).
X2, y2 = first_try.sampling(X, y)
# NOTE(review): the recorded output ("dividing my df on 1..4") suggests one
# group per COMPONENT_ARBITRARY value — confirm against jtlearn.
grouped_dic = first_try.grouping_df(X2, y2, y_column='Y_LABEL')
# Later cells iterate this as {category: (X_train, X_val, y_train, y_val)}.
split_X_y_bundle = first_try.split_X_y_bundle(grouped_dic)
print()


SMOTEENN(n_jobs=-1, random_state=42) completed resampling X and y
COMPONENT_ARBITRARY
dividing my df on 1
dividing my df on 2
dividing my df on 3
dividing my df on 4



In [5]:
# Check feature importance
# NOTE(review): the f1_score lines in the recorded output presumably come from
# this call, one per group — confirm against jtlearn.
result_ = first_try.feature_importance_for_groups(split_X_y_bundle)
# Only the second element of the result is used from here on.
features = result_[1]
# Names of the features selected for removal (plotting disabled via draw=False).
drop_target_list = first_try.chose_drop_features(features, draw=False)
print()
print(drop_target_list)
print()

f1_score : 0.977
f1_score : 0.975
f1_score : 0.983
f1_score : 0.959

['FTBN', 'H2O', 'FNOX', 'SOOTPERCENTAGE', 'TI', 'FH2O', 'FOXID', 'FOPTIMETHGLY', 'U14', 'BE', 'V100', 'U6', 'CO', 'LI', 'FUEL', 'U4', 'U25', 'U20', 'U75', 'U100', 'CD', 'FSO4', 'V', 'AG', 'U50']



In [6]:
# Split the drop candidates by whether the test set has the column.
# Ordered, de-duplicated lists are used instead of set arithmetic so that the
# contents keep the order of drop_target_list and do not change between
# kernel restarts (set iteration order of strings is not stable across runs).
_drop_candidates = list(dict.fromkeys(drop_target_list))
drop_list_reg = [name for name in _drop_candidates if name not in test.columns]
drop_list_test = [name for name in _drop_candidates if name in test.columns]

# Remove the candidates that exist only in the training data; the ones shared
# with the test set stay in X3 for now.
X3 = X2.drop(columns=drop_list_reg)
X3

Unnamed: 0,COMPONENT_ARBITRARY,ANONYMOUS_1,YEAR,SAMPLE_TRANSFER_DAY,ANONYMOUS_2,AG,AL,B,BA,CA,CO,CR,CU,FE,H2O,K,MG,MN,MO,NA,NI,P,PB,PQINDEX,S,SB,SI,SN,TI,V,V40,ZN
0,3,-0.393763,-0.669043,-0.051413,-0.340760,-0.150214,-0.111628,0.281646,-0.238453,1.141962,-0.089633,0.339245,0.336858,1.331290,-0.041588,1.669706,-0.167157,1.186914,-0.384284,0.672881,1.384414,1.845136,-0.160812,5.293270,1.001652,-0.174727,2.006643,0.302478,0.622282,-0.106550,0.899892,-0.966002
1,2,-0.426022,1.853268,3.715319,-0.022576,-0.150214,-0.123127,-0.437686,-0.238453,1.087302,-0.089633,-0.115388,-0.027612,-0.330406,-0.041588,-0.210453,-0.278723,-0.250456,-0.400998,-0.164468,-0.191804,-0.598302,0.033010,-0.259244,-1.170187,-0.174727,-0.179489,-0.252439,-0.102635,-0.106550,-1.317376,0.119147
2,3,-0.410367,0.339881,-0.051413,0.415608,-0.150214,-0.146124,-0.418245,-0.238453,-0.834591,-0.089633,-0.115388,-0.244743,-0.268515,-0.041588,-0.210453,-0.287305,-0.250456,-0.400998,-0.108644,-0.191804,-1.027099,-0.160812,-0.254663,0.455823,-0.174727,-0.174370,-0.252439,-0.102635,-0.106550,0.789028,-1.061916
3,3,0.411276,0.087650,-0.051413,-0.340760,-0.150214,-0.146124,-0.612659,-0.238453,-0.914895,-0.089633,0.024499,0.701328,-0.022824,-0.041588,-0.210453,-0.261559,0.198722,-0.400998,-0.108644,0.859008,-0.812700,-0.160812,-0.217362,0.421508,-0.174727,-0.169250,-0.252439,-0.102635,-0.106550,0.631804,-0.939672
4,3,-0.468243,1.096575,-0.222628,-0.022576,-0.150214,-0.111628,-0.602938,0.105735,0.646643,-0.089633,-0.115388,-0.268007,-0.276017,-0.041588,-0.210453,-0.287305,-0.250456,-0.400998,-0.164468,-0.191804,-1.089115,-0.160812,-0.180717,0.129828,-0.174727,-0.179489,-0.252439,-0.102635,-0.106550,1.299000,-1.099530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22309,1,-0.189589,-1.662718,-0.142197,0.646648,-0.150214,0.624704,-0.565231,0.429113,-0.717098,-0.089633,-0.045445,0.003660,-0.274368,-0.041588,0.699019,-0.166897,-0.163336,2.587186,-0.106957,0.062957,-0.015643,0.693807,-0.262893,-0.825583,0.568990,-0.123017,0.571548,-0.102635,1.996846,0.016880,1.019857
22310,3,-0.209092,0.305027,-0.246288,-0.340760,-0.150214,-0.123127,-0.572115,0.354800,-0.745044,-0.089633,-0.115388,-0.140210,-0.210817,-0.041588,-0.191208,-0.217463,-0.148207,-0.309955,-0.060535,-0.191804,0.283263,-0.160812,-0.213244,-0.733527,-0.174727,-0.144526,-0.013321,-0.102635,-0.106550,-0.941650,1.230105
22311,3,-0.215672,1.801752,-0.495120,2.740946,-0.150214,1.169934,-0.618409,-0.168156,-0.902890,-0.089633,-0.045445,-0.214157,-0.156866,-0.041588,-0.210453,-0.276970,-0.213760,-0.400998,-0.085842,-0.191804,-1.062114,-0.083694,-0.180376,0.571305,1.865800,-0.105939,-0.195770,-0.102635,-0.106550,0.371037,-1.072193
22312,1,-0.106748,-1.496417,-0.147370,-0.251598,-0.150214,0.101291,-0.077057,0.504764,-0.254774,0.802118,-0.055245,-0.252497,-0.314804,-0.041588,0.207360,-0.167157,-0.185795,3.222531,-0.180111,-0.191804,0.254767,0.033010,-0.261836,-0.821648,0.088943,-0.134846,0.302478,-0.102635,-0.106550,0.019453,1.274882


In [7]:
# Show the drop candidates that are also columns of the test set.
drop_list_test

['H2O', 'TI', 'V', 'AG', 'CO']

In [8]:
# Inspect the test frame after scaling and category encoding.
test

Unnamed: 0,COMPONENT_ARBITRARY,ANONYMOUS_1,YEAR,ANONYMOUS_2,AG,CO,CR,CU,FE,H2O,MN,MO,NI,PQINDEX,TI,V,V40,ZN
0,1,-0.226304,0.592112,-0.340760,-0.150214,-0.089633,-0.115388,-0.260252,-0.311651,-0.041588,-0.250456,-0.400998,-0.191804,-0.265133,-0.102635,-0.10655,-0.363951,0.944762
1,3,-0.083512,-0.669043,-0.340760,-0.150214,-0.089633,-0.045445,-0.260252,0.187233,-0.041588,0.019051,-0.400998,-0.191804,1.516121,0.042348,-0.10655,0.353638,-1.084484
2,2,-0.276115,-0.921274,-0.340760,-0.150214,-0.089633,-0.115388,-0.143932,-0.324780,-0.041588,-0.250456,-0.400998,-0.191804,-0.264479,-0.102635,-0.10655,-1.311328,0.235748
3,3,-0.413213,-1.173506,-0.340760,-0.150214,-0.089633,-0.010473,-0.236988,-0.028450,-0.041588,0.108886,-0.350857,-0.191804,4.968037,-0.102635,-0.10655,0.674134,-0.930269
4,2,1.204694,-0.164581,-0.340760,-0.150214,-0.089633,-0.115388,-0.221479,-0.309776,-0.041588,-0.250456,-0.400998,-0.191804,-0.261207,-0.102635,-0.10655,-0.926330,-0.225017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,3,-0.339683,0.087650,-0.340760,-0.150214,-0.089633,-0.010473,0.740102,1.629495,-0.041588,5.588859,-0.317430,0.333602,1.036452,-0.102635,-0.10655,-0.769106,1.080171
6037,3,0.233617,0.592112,-0.340760,-0.150214,-0.089633,0.059471,-0.252497,1.046214,-0.041588,0.198722,-0.400998,-0.191804,0.181816,-0.102635,-0.10655,0.573349,-1.082603
6038,3,0.279633,0.087650,-0.340760,-0.150214,-0.089633,-0.115388,-0.268007,-0.234755,-0.041588,-0.250456,-0.400998,-0.191804,-0.226524,-0.102635,-0.10655,3.887155,-1.082603
6039,2,-0.422701,-0.164581,-0.340760,-0.150214,-0.089633,-0.115388,0.212784,-0.330406,-0.041588,-0.250456,-0.400998,-0.191804,-0.268405,-0.102635,-0.10655,-1.174261,-0.208091


In [9]:
# Columns that remain in X3 but are not available in the test set; the
# regression cell further down trains one model per column in this list.
# Note: drop() raises KeyError if a test column is missing from X3.
resid_cols = X3.drop(columns=test.columns).columns
print('length of residual columns is ', len(resid_cols))
print('residual columns are ', resid_cols)

length of residual columns is  14
residual columns are  Index(['SAMPLE_TRANSFER_DAY', 'AL', 'B', 'BA', 'CA', 'K', 'MG', 'NA', 'P',
       'PB', 'S', 'SB', 'SI', 'SN'],
      dtype='object')


In [10]:
# Regression inputs: the scaled training frame X (before resampling),
# restricted to the columns that the test set also has, in the test set's order.
X_reg = X[test.columns]
X_reg

Unnamed: 0,COMPONENT_ARBITRARY,ANONYMOUS_1,YEAR,ANONYMOUS_2,AG,CO,CR,CU,FE,H2O,MN,MO,NI,PQINDEX,TI,V,V40,ZN
0,3,-0.393763,-0.669043,-0.340760,-0.150214,-0.089633,0.339245,0.336858,1.331290,-0.041588,1.186914,-0.384284,1.384414,5.293270,0.622282,-0.10655,0.899892,-0.966002
1,2,-0.426022,1.853268,-0.022576,-0.150214,-0.089633,-0.115388,-0.027612,-0.330406,-0.041588,-0.250456,-0.400998,-0.191804,-0.259244,-0.102635,-0.10655,-1.317376,0.119147
2,2,-0.173409,0.339881,-0.340760,-0.150214,-0.089633,-0.080416,-0.252497,-0.326655,-0.041588,-0.250456,-0.400998,-0.191804,-0.260552,-0.102635,-0.10655,-0.740886,-0.332215
3,3,1.006399,-0.921274,-0.340760,-0.150214,-0.089633,-0.115388,-0.260252,-0.264764,-0.041588,-0.160621,-0.400998,-0.191804,-0.242884,-0.102635,-0.10655,0.482642,-1.093888
4,3,0.191634,0.339881,-0.340760,-0.150214,-0.089633,-0.115388,-0.268007,-0.200996,-0.041588,-0.250456,-0.400998,-0.191804,-0.129674,-0.102635,-0.10655,0.478611,-0.866326
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14090,3,-0.362928,0.087650,-0.340760,-0.150214,-0.089633,-0.115388,-0.244743,-0.291021,-0.041588,-0.250456,-0.400998,-0.191804,-0.248773,-0.102635,-0.10655,0.524972,-1.076961
14091,1,-0.085884,-0.164581,-0.340760,-0.150214,-0.089633,-0.115388,-0.252497,-0.311651,-0.041588,-0.250456,3.342831,-0.191804,-0.265787,-0.102635,-0.10655,0.164162,1.540935
14092,3,-0.322130,-1.425737,0.295608,-0.150214,-0.089633,0.024499,-0.213724,0.444177,-0.041588,0.378393,-0.233862,0.070899,0.150406,-0.102635,-0.10655,-1.115806,1.339703
14093,2,-0.153722,-1.173506,0.295608,-0.150214,-0.089633,-0.115388,1.050289,-0.298523,-0.041588,-0.250456,-0.384284,-0.191804,-0.264479,-0.102635,-0.10655,-1.311328,0.119147


In [13]:
# NOTE(review): train_test_split is already imported in the first cell.
from sklearn.model_selection import train_test_split

# Train one regressor per residual column (a feature the test set lacks),
# using the test-available columns in X_reg as inputs. The fitted models are
# stored in reg_dict, keyed by target column name.
# NOTE(review): in the recorded run this cell was interrupted
# (KeyboardInterrupt) while fitting 'BA', so reg_dict held only the models for
# SAMPLE_TRANSFER_DAY, AL and B at that point.
reg_dict = {}
for col in resid_cols:
    print(f"--------------- {col} ---------------")
    reg = Regressor(ensemble='stacking')
    # Target values come from the same frame as X_reg, so rows are aligned.
    y_reg = X[col]
    # train-validation split
    # The fixed random_state gives every target the same row partition.
    X_reg_train, X_reg_val, y_reg_train, y_reg_val = train_test_split(X_reg, y_reg, test_size=0.2, random_state=69)

    # model fitting
    # NOTE(review): optimize=False is passed, yet the recorded log shows
    # hyper-parameter trials being run — confirm what this flag controls.
    reg.fit(X_reg_train, y_reg_train, n_trials=100, cv=5, optimize=False)

    # prediction
    y_reg_train_pred = reg.predict(X_reg_train)
    y_reg_pred = reg.predict(X_reg_val)
    
    # scoring
    # score() is called with (true values, predictions); the messages below
    # label the result as R^2.
    score_train = reg.score(y_reg_train, y_reg_train_pred)
    score_val = reg.score(y_reg_val, y_reg_pred)
    reg_dict[col] = reg
    print(f"--------------- {col} ---------------")
    print("Train R^2 score is %.4f" % (score_train))
    print("Validation R^2 score is %.4f" % (score_val))
    print()

[32m[I 2022-11-23 21:02:05,671][0m A new study created in memory with name: no-name-7fcc0bd5-b89b-4962-9d9c-3fe9eac82da1[0m


--------------- SAMPLE_TRANSFER_DAY ---------------


[32m[I 2022-11-23 21:02:07,190][0m Trial 0 finished with value: 0.12707057582432935 and parameters: {'n_estimators': 93, 'max_depth': 8, 'min_samples_split': 42, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.12707057582432935.[0m
[32m[I 2022-11-23 21:02:16,778][0m Trial 1 finished with value: 0.1411168737901867 and parameters: {'n_estimators': 551, 'max_depth': 10, 'min_samples_split': 49, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.1411168737901867.[0m
[32m[I 2022-11-23 21:02:25,668][0m Trial 2 finished with value: 0.16078511488608263 and parameters: {'n_estimators': 541, 'max_depth': 10, 'min_samples_split': 50, 'min_samples_leaf': 12}. Best is trial 2 with value: 0.16078511488608263.[0m
[32m[I 2022-11-23 21:02:38,602][0m Trial 3 finished with value: 0.17108088336726857 and parameters: {'n_estimators': 756, 'max_depth': 11, 'min_samples_split': 13, 'min_samples_leaf': 14}. Best is trial 3 with value: 0.17108088336726857.[0m
[32m[I 2022-11-23 21:02:53,76

Best trial: score 0.18443179467729134,
params: {'n_estimators': 972, 'max_depth': 16, 'min_samples_split': 3, 'min_samples_leaf': 4}


[32m[I 2022-11-23 21:30:15,151][0m A new study created in memory with name: no-name-465d5a68-0226-483c-b239-569985e3f1fb[0m


--------------- SAMPLE_TRANSFER_DAY ---------------
Train R^2 score is 0.5500
Validation R^2 score is 0.0936

--------------- AL ---------------


[32m[I 2022-11-23 21:30:24,047][0m Trial 0 finished with value: 0.0011938430206484662 and parameters: {'n_estimators': 365, 'max_depth': 16, 'min_samples_split': 9, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.0011938430206484662.[0m
[32m[I 2022-11-23 21:30:25,302][0m Trial 1 finished with value: 0.0007653606855621886 and parameters: {'n_estimators': 55, 'max_depth': 13, 'min_samples_split': 43, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.0011938430206484662.[0m
[32m[I 2022-11-23 21:30:31,796][0m Trial 2 finished with value: 0.006810898495318951 and parameters: {'n_estimators': 426, 'max_depth': 9, 'min_samples_split': 25, 'min_samples_leaf': 11}. Best is trial 2 with value: 0.006810898495318951.[0m
[32m[I 2022-11-23 21:30:36,337][0m Trial 3 finished with value: 0.005876284053923797 and parameters: {'n_estimators': 242, 'max_depth': 11, 'min_samples_split': 44, 'min_samples_leaf': 5}. Best is trial 2 with value: 0.006810898495318951.[0m
[32m[I 2022-11-2

Best trial: score 0.011310478115940104,
params: {'n_estimators': 584, 'max_depth': 8, 'min_samples_split': 22, 'min_samples_leaf': 20}


[32m[I 2022-11-23 21:42:35,330][0m A new study created in memory with name: no-name-01af3162-bb0d-46d6-bf7b-c19fc54ddabb[0m


--------------- AL ---------------
Train R^2 score is 0.0960
Validation R^2 score is 0.0590

--------------- B ---------------


[32m[I 2022-11-23 21:42:43,602][0m Trial 0 finished with value: 0.07274436652443343 and parameters: {'n_estimators': 469, 'max_depth': 14, 'min_samples_split': 50, 'min_samples_leaf': 16}. Best is trial 0 with value: 0.07274436652443343.[0m
[32m[I 2022-11-23 21:42:50,796][0m Trial 1 finished with value: 0.0694721400328209 and parameters: {'n_estimators': 423, 'max_depth': 10, 'min_samples_split': 18, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.07274436652443343.[0m
[32m[I 2022-11-23 21:43:10,060][0m Trial 2 finished with value: 0.07243848246186355 and parameters: {'n_estimators': 970, 'max_depth': 16, 'min_samples_split': 32, 'min_samples_leaf': 11}. Best is trial 0 with value: 0.07274436652443343.[0m
[32m[I 2022-11-23 21:43:20,319][0m Trial 3 finished with value: 0.07262442393522898 and parameters: {'n_estimators': 554, 'max_depth': 12, 'min_samples_split': 36, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.07274436652443343.[0m
[32m[I 2022-11-23 21:43:38

Best trial: score 0.07399377405977747,
params: {'n_estimators': 617, 'max_depth': 15, 'min_samples_split': 26, 'min_samples_leaf': 19}


[32m[I 2022-11-23 22:00:34,491][0m A new study created in memory with name: no-name-74292713-6946-42cd-8494-a54d0afc2915[0m


--------------- B ---------------
Train R^2 score is 0.2294
Validation R^2 score is 0.0864

--------------- BA ---------------


[32m[I 2022-11-23 22:00:41,151][0m Trial 0 finished with value: -0.01748384648040433 and parameters: {'n_estimators': 426, 'max_depth': 10, 'min_samples_split': 31, 'min_samples_leaf': 1}. Best is trial 0 with value: -0.01748384648040433.[0m
[32m[I 2022-11-23 22:00:58,923][0m Trial 1 finished with value: 0.08457263203940849 and parameters: {'n_estimators': 949, 'max_depth': 12, 'min_samples_split': 10, 'min_samples_leaf': 14}. Best is trial 1 with value: 0.08457263203940849.[0m
[32m[I 2022-11-23 22:01:10,794][0m Trial 2 finished with value: 0.07058592594697004 and parameters: {'n_estimators': 571, 'max_depth': 14, 'min_samples_split': 40, 'min_samples_leaf': 7}. Best is trial 1 with value: 0.08457263203940849.[0m
[32m[I 2022-11-23 22:01:12,186][0m Trial 3 finished with value: -0.010595271494344093 and parameters: {'n_estimators': 90, 'max_depth': 8, 'min_samples_split': 21, 'min_samples_leaf': 3}. Best is trial 1 with value: 0.08457263203940849.[0m
[32m[I 2022-11-23 22:01:

KeyboardInterrupt: 

In [None]:
# Fill in the residual columns of the test set with the per-column regressors,
# then remove the low-importance columns from both frames.

# Fail early if the regression cell did not finish: with an incomplete
# reg_dict the test frame would silently lack columns that X4 has, and the
# mismatch would only surface much later as a KeyError.
missing_models = [col for col in resid_cols if col not in reg_dict]
if missing_models:
    raise RuntimeError(
        f"no fitted regressor for residual columns: {missing_models}"
    )

# Snapshot of the original test features. Every regressor predicts from this
# copy, so columns added to `test` inside the loop never become inputs.
temp_test = test.copy()

for col, reg in reg_dict.items():
    test[col] = reg.predict(temp_test)

X4 = X3.drop(columns=drop_list_test)
test = test.drop(columns=drop_list_test)

In [None]:
# Inspect the test frame after adding predicted columns and dropping features.
test

In [None]:
# Inspect the training features after dropping the shared low-importance columns.
X4

In [None]:
# Inspect the test frame again for comparison with X4.
test

In [None]:
# Columns present in X4 but missing from test.
set(X4.columns) - set(test.columns)

In [None]:
# Columns present in test but missing from X4.
set(test.columns) - set(X4.columns) 

In [None]:
# Show the current column order of the test frame.
test.columns

In [None]:
# Class balance of the resampled labels: number of 0s and number of 1s.
# Generator expressions are used so that no loop variable leaks into the
# notebook namespace; a loop variable named `y` would overwrite the label
# Series `y` that the sampling cell takes as input.
count0 = sum(1 for label in y2 if label == 0)
count1 = sum(1 for label in y2 if label == 1)

print(count0, count1)

In [None]:
# Single-sampler configuration for the second run; test_size is 0.1 here.
variable_dict = {
    "categorical_feature": "COMPONENT_ARBITRARY",
    "test_size": 0.1,
    "learner": ("classification", "LGBM"),
    "sampler": ("hybrid", "SMOTEENN"),
    "random_state_": 42,
    "dimensionality": PCA
}

sec_try = Preprocessing(**variable_dict)
print()

# Give the test frame the same columns, in the same order, as the training
# features.
test = test[X4.columns]

# Grouping -> splitting (the labels y2 were already resampled earlier).
grouped_dic = sec_try.grouping_df(X4, y2, y_column='Y_LABEL')
# The split is done by sec_try, the helper built from this cell's
# configuration, not by first_try, which was built with test_size 0.2.
split_X_y_bundle = sec_try.split_X_y_bundle(grouped_dic)

# Train one classifier per category group and predict the matching test rows.
# The per-group results are collected in a list and concatenated once, rather
# than growing a DataFrame with concat inside the loop.
test_parts = []

for cat, (X_train, X_val, y_train, y_val) in split_X_y_bundle.items():
    # Test rows of this category, without the category column itself.
    test_temp = test[test.COMPONENT_ARBITRARY == cat]
    test_temp = test_temp.drop(columns=['COMPONENT_ARBITRARY'])

    # model initializing
    clf = BinaryCalssifier()

    # model training
    clf.fit(X_train, y_train, n_trials=100, cv=5)

    # prediction
    y_train_pred = clf.predict(X_train)
    y_pred = clf.predict(X_val)

    # scoring: score() is called with (true labels, predictions)
    score_train = clf.score(y_train, y_train_pred)
    score_val = clf.score(y_val, y_pred)
    print("Train F1_score is %.4f" % (score_train))
    print("Validation F1_score is %.4f" % (score_val))

    # fill prediction value in test data
    # predict() runs before the column is added, so Y_LABEL is never an input.
    test_temp['Y_LABEL'] = clf.predict(test_temp)
    test_parts.append(test_temp)

# An empty bundle yields an empty frame, as the loop-based version did.
test_final = pd.concat(test_parts, axis=0) if test_parts else pd.DataFrame()

# Restore the original row order of the test set.
test_final = test_final.sort_index()

# Choose a file name that does not overwrite an earlier submission:
# submission_oil.csv first, then submission_oil2.csv, submission_oil3.csv, ...
# Each candidate is checked for existence, so gaps left by deleted files
# cannot lead to an existing file being overwritten.
os.makedirs(save_path, exist_ok=True)  # os.listdir/to_csv fail on a missing directory
filename = 'submission_oil.csv'
suffix = 1
while os.path.exists(os.path.join(save_path, filename)):
    suffix += 1
    filename = f"submission_oil{suffix}.csv"

# Export submission file
# The assignment aligns on the index: each submission row receives the
# prediction of the test row with the same index label.
submission.Y_LABEL = test_final.Y_LABEL
submission.to_csv(os.path.join(save_path, filename), index=False)