In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from tqdm import tqdm
from sklearn.metrics import f1_score

from Bio.SeqUtils.ProtParam import ProteinAnalysis

import optuna
from lightgbm import LGBMClassifier
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.metrics import log_loss
from tqdm import tqdm

In [15]:
# Load the labelled training table. Later cells read its epitope_seq, antigen_seq,
# disease_type, disease_state and label columns.
train = pd.read_csv('data/train.csv')

In [16]:
def get_dic(data):
    """Build a frequency-ranked integer vocabulary for the values in ``data``.

    The ``'Unknown'`` token is inserted first with index 0. Every distinct value
    then receives an index starting at 1, most frequent value first; values with
    equal counts keep the order in which they first appear in ``data``.

    Args:
        data: Iterable of hashable values (for example a column of category names).

    Returns:
        dict mapping ``'Unknown'`` and each distinct value to its integer index.
    """
    counts = {}
    for value in data:
        counts[value] = counts.get(value, 0) + 1

    # list.sort is stable (also with reverse=True), so ties stay in first-seen order.
    ranked = list(counts.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    token_dic = {'Unknown': 0}
    for index, (value, _count) in enumerate(ranked, start=1):
        token_dic[value] = index
    return token_dic

def dic_except(dic, a):
    """Look up ``a`` in ``dic``, falling back to the ``'Unknown'`` entry.

    Args:
        dic: Mapping that contains an ``'Unknown'`` key.
        a: Key to look up.

    Returns:
        ``dic[a]`` when the key is present, otherwise ``dic['Unknown']``.

    Raises:
        KeyError: If ``a`` is missing and ``dic`` has no ``'Unknown'`` entry.
    """
    try:
        return dic[a]
    except (KeyError, TypeError):
        # Only "not in the vocabulary" conditions fall back: a missing key, or a
        # value that cannot be hashed at all. Anything else (including
        # KeyboardInterrupt) propagates instead of being silently swallowed.
        return dic['Unknown']

# Build the integer vocabularies from the training columns. The same dictionaries
# are reused further down to encode the test set.
dic_disease_type = get_dic(train['disease_type'])
dic_disease_state = get_dic(train['disease_state'])

# Replace the raw category values with their integer codes.
# NOTE(review): the columns are overwritten in place, so re-running this cell
# rebuilds the vocabularies from already-encoded integers — run it only once per load.
train['disease_type'] = train['disease_type'].map(lambda a: dic_except(dic_disease_type, a))
train['disease_state'] = train['disease_state'].map(lambda a: dic_except(dic_disease_state, a))

In [17]:
# Three-class partition of the amino acids for each property. Each tuple is
# (class 1, class 2, class 3), written as strings of one-letter residue codes.
# Within a property the three classes do not overlap.
_CTD_CLASSES = {
    'hydrophobicity': ('RKEDQN', 'GASTPHY', 'CLVIMFW'),
    'normalized.van.der.waals': ('GASTPDC', 'NVEQIL', 'MHKFRYW'),
    'polarity': ('LIFWCMVY', 'PATGS', 'HQRKNED'),
    'polarizability': ('GASDT', 'CPNVEQIL', 'KMHFRYW'),
    'charge': ('KR', 'ANCQGHILMFPSTWYV', 'DE'),
    'secondary': ('EALMQKRH', 'VIYCWFT', 'GNPSD'),
    'solvent': ('ALFCGIVW', 'RKQEND', 'MSPTHY'),
}

# One residue -> class-number lookup per property, in the property order above.
_CTD_LOOKUPS = [
    {residue: label for label, members in enumerate(classes, start=1) for residue in members}
    for classes in _CTD_CLASSES.values()
]


def _distribution_percentages(positions, sequencelength):
    """Return the five distribution values for one residue class.

    ``positions`` holds the 1-based positions of the class members. The result is
    the position of the first, roughly 25%, 50%, 75% and last member, each
    expressed as a percentage of ``sequencelength``; an empty class yields zeros.
    """
    occurrences = len(positions)
    if occurrences == 0:
        return [0, 0, 0, 0, 0]

    # Ranks (1-based) of the members to report. The quartile ranks round down to 0
    # for classes with only one or two members, so they are clamped to the first.
    ranks = [1]
    for fraction in (0.25, 0.5, 0.75):
        ranks.append(max(1, round((fraction * occurrences) - 0.1)))
    ranks.append(occurrences)
    return [(positions[rank - 1] / sequencelength) * 100 for rank in ranks]


def get_peptide_feature(seq): # CTD descriptor
    """Compute the distribution part of the CTD descriptor for a peptide.

    For each of the seven properties the residues are mapped to class 1, 2 or 3,
    and each class contributes five values (see ``_distribution_percentages``),
    giving 7 * 3 * 5 = 105 values in total.

    Characters that belong to no class are skipped when positions are counted,
    but the percentages are still taken relative to the full ``len(str(seq))``.

    Args:
        seq: Peptide sequence; it is converted with ``str()`` before use.

    Returns:
        list of 105 numbers ordered by property, then class, then distribution point.
    """
    residues = str(seq)
    total_length = len(residues)
    features = []

    for class_of in _CTD_LOOKUPS:
        positions_by_class = {1: [], 2: [], 3: []}
        position = 0
        for residue in residues:
            label = class_of.get(residue)
            if label is None:
                continue  # unclassified character: does not advance the position
            position += 1
            positions_by_class[label].append(position)

        for label in (1, 2, 3):
            features.extend(_distribution_percentages(positions_by_class[label], total_length))
    return features

In [18]:
def get_protein_feature(seq):
    """Compute four whole-sequence descriptors for a protein.

    Args:
        seq: Amino-acid sequence handed to ``ProteinAnalysis``.

    Returns:
        list: ``[isoelectric_point, aromaticity, gravy, instability_index]``,
        in that order.
    """
    # One ProteinAnalysis instance serves all four descriptors, so the sequence
    # is wrapped once per call rather than once per descriptor.
    analysis = ProteinAnalysis(seq)
    return [
        analysis.isoelectric_point(),
        analysis.aromaticity(),
        analysis.gravy(),
        analysis.instability_index(),
    ]
    
def get_preprocessing(data_type, new_df):
    """Turn a dataframe into per-row feature lists (and labels when available).

    Args:
        data_type: Name of the split, used in the log message. When it equals
            ``'test'`` no labels are read.
        new_df: Dataframe with ``epitope_seq``, ``antigen_seq``, ``disease_type``
            and ``disease_state`` columns, plus ``label`` for non-test data.

    Returns:
        tuple ``(protein_features, epitope_features, disease_features, label_list)``
        of row-aligned lists; ``label_list`` is ``None`` for test data.
    """
    protein_features, epitope_features, disease_features = [], [], []

    rows = zip(new_df['epitope_seq'], new_df['antigen_seq'], new_df['disease_type'], new_df['disease_state'])
    for epitope, antigen, d_type, d_state in tqdm(rows):
        # Protein descriptors come from the antigen, CTD features from the epitope.
        protein_features.append(get_protein_feature(antigen))
        epitope_features.append(get_peptide_feature(epitope))
        disease_features.append([d_type, d_state])

    label_list = list(new_df['label']) if data_type != 'test' else None
    print(f'{data_type} dataframe preprocessing was done.')
    return protein_features, epitope_features, disease_features, label_list

In [19]:
# Hold out 20% of the rows for validation; the seed makes the split repeatable.
# NOTE(review): `train` is rebound to the 80% split here, so every later cell that
# reads `train` sees only the split, and re-running this cell splits the split again.
train, val = train_test_split(train, train_size=0.8, random_state=12)

# Featurise both partitions (protein, epitope and disease features plus labels).
train_protein_features, train_epitope_features, train_disease_features, train_label_list = get_preprocessing('train', train)
val_protein_features, val_epitope_features, val_disease_features, val_label_list = get_preprocessing('val', val)

152648it [05:04, 501.91it/s]
33it [00:00, 329.85it/s]

train dataframe preprocessing was done.


38163it [01:16, 500.69it/s]

val dataframe preprocessing was done.





In [56]:
# Convert the per-row feature lists to arrays and join them column-wise:
# protein descriptors, then epitope features, then the two disease codes.
train_protein_features = np.array(train_protein_features)
train_epitope_features = np.array(train_epitope_features)
train_disease_features = np.array(train_disease_features)
X_train = np.concatenate((train_protein_features, train_epitope_features, train_disease_features), axis=1)
y_train = np.array(train_label_list)

# Same layout for the validation partition.
val_protein_features = np.array(val_protein_features)
val_epitope_features = np.array(val_epitope_features)
val_disease_features = np.array(val_disease_features)
X_val = np.concatenate((val_protein_features, val_epitope_features, val_disease_features), axis=1)
y_val = np.array(val_label_list)

In [24]:
# Featurise `train` again; these lists feed the X / y matrices used for cross-validation.
# NOTE(review): `train` was already rebound to the 80% split by the train_test_split
# cell (the progress output shows the same 152648 rows), so this looks like a
# recomputation of the train_* features rather than the full dataset — confirm intent.
protein_features, epitope_features, disease_features, label_list = get_preprocessing('train', train)

152648it [05:00, 507.60it/s]

train dataframe preprocessing was done.





In [57]:
# Assemble the matrix used for K-fold evaluation and the final fit:
# protein descriptors, epitope features and disease codes, joined column-wise.
protein_features = np.array(protein_features)
epitope_features = np.array(epitope_features)
disease_features = np.array(disease_features)
X = np.concatenate((protein_features, epitope_features, disease_features), axis=1)
y = np.array(label_list)

# Disease 정보 사용한 모델

In [58]:
def objective(trial: Trial) -> float:
    """Optuna objective: validation log loss of a LightGBM classifier.

    Fits on the notebook-level ``X_train``/``y_train`` with early stopping
    (100 rounds) while monitoring copies of the training and validation data,
    then scores the predicted probabilities on ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the tunable hyperparameters.

    Returns:
        Log loss on the validation data (the study minimises this value).
    """
    # Settings held constant for every trial.
    fixed_params = {
        "random_state": 42,
        "verbosity": -1,
        "learning_rate": 0.05,
        "n_estimators": 10000,
        "objective": "binary",
        "metric": "logloss",
    }
    # Search space; the suggest_* calls are issued in this order on every trial.
    tuned_params = {
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 3e-5),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 9e-2),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
    }
    classifier = LGBMClassifier(**fixed_params, **tuned_params)

    # Fresh array copies are passed for the monitored sets, separate from the
    # arrays given as the fit data.
    monitored_sets = [
        (np.array(X_train), np.array(y_train)),
        (np.array(X_val), np.array(y_val)),
    ]
    classifier.fit(
        np.array(X_train),
        np.array(y_train),
        eval_set=monitored_sets,
        early_stopping_rounds=100,
        eval_metric='logloss',
        verbose=False,
    )

    val_probabilities = classifier.predict_proba(np.array(X_val))
    return log_loss(np.array(y_val), val_probabilities)

# Seeded TPE sampler so the sequence of sampled hyperparameters is repeatable.
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name="lgbm_parameter_opt",
    direction="minimize",  # objective() returns a log loss
    sampler=sampler,
)

# Run 10 trials and report the lowest validation log loss with its parameters.
study.optimize(objective, n_trials=10)
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

[32m[I 2022-07-26 17:14:27,639][0m A new study created in memory with name: lgbm_parameter_opt[0m
[32m[I 2022-07-26 17:15:02,735][0m Trial 0 finished with value: 0.13750658818480216 and parameters: {'reg_alpha': 1.12424581642324e-05, 'reg_lambda': 0.08556428806974939, 'max_depth': 15, 'num_leaves': 154, 'colsample_bytree': 0.4936111842654619, 'subsample': 0.40919616423534183, 'subsample_freq': 1, 'min_child_samples': 88, 'max_bin': 380}. Best is trial 0 with value: 0.13750658818480216.[0m
[32m[I 2022-07-26 17:15:33,695][0m Trial 1 finished with value: 0.13683689055745352 and parameters: {'reg_alpha': 2.1245096608103405e-05, 'reg_lambda': 0.0018526142807772773, 'max_depth': 20, 'num_leaves': 214, 'colsample_bytree': 0.5274034664069657, 'subsample': 0.42727747704497043, 'subsample_freq': 2, 'min_child_samples': 34, 'max_bin': 357}. Best is trial 1 with value: 0.13683689055745352.[0m
[32m[I 2022-07-26 17:16:01,708][0m Trial 2 finished with value: 0.1353657018333897 and paramete

Best Score: 0.12558372197949474
Best trial: {'reg_alpha': 1.987904330777592e-05, 'reg_lambda': 0.028054003730936226, 'max_depth': 11, 'num_leaves': 141, 'colsample_bytree': 0.5109126733153162, 'subsample': 0.9787092394351908, 'subsample_freq': 8, 'min_child_samples': 95, 'max_bin': 469}


In [26]:
from sklearn.model_selection import KFold

# 5-fold cross-validation of the best Optuna parameters on (X, y): per-fold log loss
# and macro-averaged F1, with test-set class probabilities accumulated in `preds`.
# NOTE(review): X_test is only created in the "# Test" section further down, so this
# cell fails on a fresh top-to-bottom run.
# NOTE(review): best_trial.params contains only the sampled values; the constants fixed
# inside objective() (learning_rate, n_estimators, random_state, ...) are not passed
# here, so the library defaults apply — confirm that is intended.
Best_trial = study.best_trial.params
preds = np.zeros((len(X_test), 2))
kf = KFold(n_splits=5,random_state=48,shuffle=True)
loss=[]  # log loss on the held-out part of each fold
f1 = []
n=0
for trn_idx, test_idx in kf.split(np.array(X), np.array(y)):
    # Fold-local names: the notebook-level hold-out X_val / y_val that objective()
    # reads must not be replaced by a fold's validation slice.
    X_tr,X_fold_val=np.array(X)[trn_idx],np.array(X)[test_idx]
    y_tr,y_fold_val=np.array(y)[trn_idx],np.array(y)[test_idx]
    model = LGBMClassifier(**Best_trial)
    # NOTE(review): the first monitored set is the notebook-level X_train, not this
    # fold's X_tr — confirm.
    model.fit(X_tr,y_tr,eval_set=[(np.array(X_train), np.array(y_train)), (X_fold_val, y_fold_val)],early_stopping_rounds=100,eval_metric = 'logloss', verbose=False)
    preds+=model.predict_proba(X_test)
    loss.append(log_loss(y_fold_val, model.predict_proba(X_fold_val)))
    f1.append(f1_score(y_fold_val, model.predict(X_fold_val), average='macro'))
    print(f"fold: {n+1} ==> loss: {loss[n]}, f1_score: {f1[n]}")
    n+=1
print(np.mean(f1))

fold: 1 ==> loss: 0.13731488464999617, f1_score: 0.7991047982185249
fold: 2 ==> loss: 0.13217129734908714, f1_score: 0.7910756959459537
fold: 3 ==> loss: 0.13265068501508862, f1_score: 0.7956988155859193
fold: 4 ==> loss: 0.13263232120105922, f1_score: 0.8017897618502137
fold: 5 ==> loss: 0.13464089496787632, f1_score: 0.7935229515406289
0.796238404628248


In [27]:
from sklearn.model_selection import KFold

# Same 5-fold evaluation as the macro-F1 cell, but f1_score is called with its
# defaults instead of average='macro'.
# NOTE(review): X_test is only created in the "# Test" section further down, so this
# cell fails on a fresh top-to-bottom run.
# NOTE(review): best_trial.params contains only the sampled values; the constants fixed
# inside objective() are not passed here — confirm that is intended.
Best_trial = study.best_trial.params
preds = np.zeros((len(X_test), 2))
kf = KFold(n_splits=5,random_state=48,shuffle=True)
loss=[]  # log loss on the held-out part of each fold
f1 = []
n=0
for trn_idx, test_idx in kf.split(np.array(X), np.array(y)):
    # Fold-local names: the notebook-level hold-out X_val / y_val that objective()
    # reads must not be replaced by a fold's validation slice.
    X_tr,X_fold_val=np.array(X)[trn_idx],np.array(X)[test_idx]
    y_tr,y_fold_val=np.array(y)[trn_idx],np.array(y)[test_idx]
    model = LGBMClassifier(**Best_trial)
    # NOTE(review): the first monitored set is the notebook-level X_train, not this
    # fold's X_tr — confirm.
    model.fit(X_tr,y_tr,eval_set=[(np.array(X_train), np.array(y_train)), (X_fold_val, y_fold_val)],early_stopping_rounds=100,eval_metric = 'logloss', verbose=False)
    preds+=model.predict_proba(X_test)
    loss.append(log_loss(y_fold_val, model.predict_proba(X_fold_val)))
    f1.append(f1_score(y_fold_val, model.predict(X_fold_val)))
    print(f"fold: {n+1} ==> loss: {loss[n]}, f1_score: {f1[n]}")
    n+=1

fold: 1 ==> loss: 0.13731488464999617, f1_score: 0.6287683031869079
fold: 2 ==> loss: 0.13217129734908714, f1_score: 0.611878453038674
fold: 3 ==> loss: 0.13265068501508862, f1_score: 0.6212765957446807
fold: 4 ==> loss: 0.13263232120105922, f1_score: 0.6331724440544098
fold: 5 ==> loss: 0.13464089496787632, f1_score: 0.6167841710256994


In [28]:
from sklearn.model_selection import KFold

# Same 5-fold evaluation as the macro-F1 cell, but F1 is reported with label 0 as the
# positive class (pos_label=0).
# NOTE(review): X_test is only created in the "# Test" section further down, so this
# cell fails on a fresh top-to-bottom run.
# NOTE(review): best_trial.params contains only the sampled values; the constants fixed
# inside objective() are not passed here — confirm that is intended.
Best_trial = study.best_trial.params
preds = np.zeros((len(X_test), 2))
kf = KFold(n_splits=5,random_state=48,shuffle=True)
loss=[]  # log loss on the held-out part of each fold
f1 = []
n=0
for trn_idx, test_idx in kf.split(np.array(X), np.array(y)):
    # Fold-local names: the notebook-level hold-out X_val / y_val that objective()
    # reads must not be replaced by a fold's validation slice.
    X_tr,X_fold_val=np.array(X)[trn_idx],np.array(X)[test_idx]
    y_tr,y_fold_val=np.array(y)[trn_idx],np.array(y)[test_idx]
    model = LGBMClassifier(**Best_trial)
    # NOTE(review): the first monitored set is the notebook-level X_train, not this
    # fold's X_tr — confirm.
    model.fit(X_tr,y_tr,eval_set=[(np.array(X_train), np.array(y_train)), (X_fold_val, y_fold_val)],early_stopping_rounds=100,eval_metric = 'logloss', verbose=False)
    preds+=model.predict_proba(X_test)
    loss.append(log_loss(y_fold_val, model.predict_proba(X_fold_val)))
    f1.append(f1_score(y_fold_val, model.predict(X_fold_val), pos_label=0))
    print(f"fold: {n+1} ==> loss: {loss[n]}, f1_score: {f1[n]}")
    n+=1

fold: 1 ==> loss: 0.13731488464999617, f1_score: 0.9694412932501417
fold: 2 ==> loss: 0.13217129734908714, f1_score: 0.9702729388532335
fold: 3 ==> loss: 0.13265068501508862, f1_score: 0.9701210354271579
fold: 4 ==> loss: 0.13263232120105922, f1_score: 0.9704070796460177
fold: 5 ==> loss: 0.13464089496787632, f1_score: 0.9702617320555584


In [29]:
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest on (X, y); the next cell reads its feature importances.
# The seed is fixed so the forest, and therefore the importance ranking, is the
# same on every run.
clf = RandomForestClassifier(random_state=42)
clf = clf.fit(X, y)

In [33]:
# Rank the features of the fitted forest by importance, most important first.
importances = clf.feature_importances_
# Spread of each feature's importance across the individual trees (only used by the
# commented-out plot below).
std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)

indices = np.argsort(importances)[::-1]

print("Feature ranking:")
# Iterate over the forest's own importance vector; its length is the number of
# features `clf` was fitted on, independent of whatever X_train currently holds.
for f in range(len(indices)):
    print("{}. feature {} ({:.3f})".format(f + 1, indices[f], importances[indices[f]]))
    
# plt.figure()
# plt.title("Feature ranking:")
# plt.bar(range(X.shape[1]), importances[indices],
#         color='r', yerr=std[indices], align='center', alpha=0.3)
# plt.xticks(range(X.shape[1]), X.columns[indices], rotation=45)
# plt.xlim([-1, X.shape[1]])
# plt.show()

Feature ranking:
1. feature 110 (0.114)
2. feature 109 (0.060)
3. feature 0 (0.030)
4. feature 1 (0.029)
5. feature 2 (0.027)
6. feature 69 (0.024)
7. feature 3 (0.022)
8. feature 79 (0.014)
9. feature 19 (0.011)
10. feature 24 (0.010)
11. feature 54 (0.010)
12. feature 94 (0.010)
13. feature 49 (0.010)
14. feature 9 (0.009)
15. feature 92 (0.008)
16. feature 27 (0.008)
17. feature 91 (0.008)
18. feature 39 (0.008)
19. feature 82 (0.008)
20. feature 89 (0.008)
21. feature 87 (0.008)
22. feature 81 (0.008)
23. feature 57 (0.007)
24. feature 97 (0.007)
25. feature 80 (0.007)
26. feature 107 (0.007)
27. feature 25 (0.007)
28. feature 17 (0.007)
29. feature 26 (0.007)
30. feature 96 (0.007)
31. feature 99 (0.007)
32. feature 86 (0.007)
33. feature 4 (0.007)
34. feature 52 (0.007)
35. feature 56 (0.007)
36. feature 78 (0.007)
37. feature 106 (0.007)
38. feature 37 (0.007)
39. feature 95 (0.007)
40. feature 44 (0.007)
41. feature 84 (0.007)
42. feature 85 (0.007)
43. feature 90 (0.007)
44. f

# Test

In [59]:
# Final model: refit on all of (X, y) with the best sampled hyperparameters.
# NOTE(review): only the sampled values are passed; the constants fixed inside
# objective() (learning_rate, n_estimators, random_state, ...) are absent, and the
# printed estimator below shows none of them — confirm the defaults are intended.
model = LGBMClassifier(**study.best_trial.params)
model.fit(
    np.array(X),
    np.array(y),
    # NOTE(review): the only monitored set is a copy of the training data itself, so
    # early stopping is driven by training loss rather than held-out loss — confirm.
    eval_set=[(np.array(X), np.array(y))],
    early_stopping_rounds=100,
    eval_metric = 'logloss',
    verbose=False,
)

LGBMClassifier(colsample_bytree=0.5109126733153162, max_bin=469, max_depth=11,
               min_child_samples=95, num_leaves=141,
               reg_alpha=1.987904330777592e-05, reg_lambda=0.028054003730936226,
               subsample=0.9787092394351908, subsample_freq=8)

In [62]:
# Load the test table and encode its categories with the vocabularies learned from the
# training data; values that never occurred in training map to the 'Unknown' code 0.
test_df = pd.read_csv('data/test.csv')
test_df['disease_type'] = test_df['disease_type'].map(lambda a: dic_except(dic_disease_type, a))
test_df['disease_state'] = test_df['disease_state'].map(lambda a: dic_except(dic_disease_state, a))

# For 'test' data the fourth return value is None. It is bound to a throwaway name:
# binding it to `label_list` would replace the training labels that the
# `y = np.array(label_list)` cells read.
test_protein_features, test_epitope_features, test_disease_features, _test_labels = get_preprocessing('test', test_df)

# Same column layout as the training matrix: protein, epitope, then disease features.
test_protein_features = np.array(test_protein_features)
test_epitope_features = np.array(test_epitope_features)
test_disease_features = np.array(test_disease_features)
X_test = np.concatenate((test_protein_features, test_epitope_features, test_disease_features), axis=1)

120944it [26:04, 77.28it/s] 


test dataframe preprocessing was done.


In [63]:
# Predict hard class labels for the test matrix and write them into the sample
# submission's `label` column.
# NOTE(review): `model` is whichever estimator was assigned last in the kernel (the
# K-fold cells also assign `model`) — run the final-fit cell immediately before this one.
preds_all = model.predict(np.array(X_test))
submit = pd.read_csv('data/sample_submission.csv')
submit['label'] = preds_all
# NOTE(review): assumes the submission/ directory already exists.
submit.to_csv('submission/lgbm_opt_disease.csv', index=False)
print('Done.')

Done.


In [65]:
# Sanity check: how many test rows were predicted as each class.
submit.label.value_counts()

0    112270
1      8674
Name: label, dtype: int64

# Disease 정보 사용안한 모델

In [43]:
# Rebuild the matrices without the two disease-code columns (protein + epitope only).
# This overwrites the notebook-level X_train / X_val / X used by the cells above.
X_train = np.concatenate((train_protein_features, train_epitope_features), axis=1)
y_train = np.array(train_label_list)

val_protein_features = np.array(val_protein_features)
val_epitope_features = np.array(val_epitope_features)
X_val = np.concatenate((val_protein_features, val_epitope_features), axis=1)
y_val = np.array(val_label_list)

X = np.concatenate((protein_features, epitope_features), axis=1)
# NOTE(review): relies on `label_list` still holding the training labels — verify that
# no cell run in between has rebound it.
y = np.array(label_list)

In [36]:
def objective(trial: Trial) -> float:
    """Optuna objective: validation log loss of a LightGBM classifier.

    Fits on the notebook-level ``X_train``/``y_train`` with early stopping
    (100 rounds) while monitoring copies of the training and validation data,
    then scores the predicted probabilities on ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the tunable hyperparameters.

    Returns:
        Log loss on the validation data (the study minimises this value).
    """
    # Settings held constant for every trial.
    fixed_params = {
        "random_state": 42,
        "verbosity": -1,
        "learning_rate": 0.05,
        "n_estimators": 10000,
        "objective": "binary",
        "metric": "logloss",
    }
    # Search space; the suggest_* calls are issued in this order on every trial.
    tuned_params = {
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 3e-5),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 9e-2),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
    }
    classifier = LGBMClassifier(**fixed_params, **tuned_params)

    # Fresh array copies are passed for the monitored sets, separate from the
    # arrays given as the fit data.
    monitored_sets = [
        (np.array(X_train), np.array(y_train)),
        (np.array(X_val), np.array(y_val)),
    ]
    classifier.fit(
        np.array(X_train),
        np.array(y_train),
        eval_set=monitored_sets,
        early_stopping_rounds=100,
        eval_metric='logloss',
        verbose=False,
    )

    val_probabilities = classifier.predict_proba(np.array(X_val))
    return log_loss(np.array(y_val), val_probabilities)

# Seeded TPE sampler so the sequence of sampled hyperparameters is repeatable.
# This rebinds `study`, replacing the study from the disease-feature section.
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name="lgbm_parameter_opt",
    direction="minimize",  # objective() returns a log loss
    sampler=sampler,
)

# Run 10 trials and report the lowest validation log loss with its parameters.
study.optimize(objective, n_trials=10)
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

[32m[I 2022-07-26 17:00:35,762][0m A new study created in memory with name: lgbm_parameter_opt[0m
[32m[I 2022-07-26 17:01:08,615][0m Trial 0 finished with value: 0.16404257992767515 and parameters: {'reg_alpha': 1.12424581642324e-05, 'reg_lambda': 0.08556428806974939, 'max_depth': 15, 'num_leaves': 154, 'colsample_bytree': 0.4936111842654619, 'subsample': 0.40919616423534183, 'subsample_freq': 1, 'min_child_samples': 88, 'max_bin': 380}. Best is trial 0 with value: 0.16404257992767515.[0m
[32m[I 2022-07-26 17:01:47,006][0m Trial 1 finished with value: 0.16184348586830047 and parameters: {'reg_alpha': 2.1245096608103405e-05, 'reg_lambda': 0.0018526142807772773, 'max_depth': 20, 'num_leaves': 214, 'colsample_bytree': 0.5274034664069657, 'subsample': 0.42727747704497043, 'subsample_freq': 2, 'min_child_samples': 34, 'max_bin': 357}. Best is trial 1 with value: 0.16184348586830047.[0m
[32m[I 2022-07-26 17:02:22,942][0m Trial 2 finished with value: 0.160400879359445 and parameter

Best Score: 0.14956090899218366
Best trial: {'reg_alpha': 1.987904330777592e-05, 'reg_lambda': 0.028054003730936226, 'max_depth': 11, 'num_leaves': 141, 'colsample_bytree': 0.5109126733153162, 'subsample': 0.9787092394351908, 'subsample_freq': 8, 'min_child_samples': 95, 'max_bin': 469}


In [41]:
# Check the matrix dimensions after dropping the two disease-code columns.
X.shape

(152648, 109)

In [50]:
from sklearn.model_selection import KFold

# 5-fold cross-validation of the best Optuna parameters on (X, y): per-fold log loss
# and macro-averaged F1.
# NOTE(review): `preds` is allocated from X_test but never updated (the accumulation
# line in the loop is commented out), and X_test must already exist in the kernel.
# NOTE(review): best_trial.params contains only the sampled values; the constants fixed
# inside objective() are not passed here — confirm that is intended.
Best_trial = study.best_trial.params
preds = np.zeros((len(X_test), 2))
kf = KFold(n_splits=5,random_state=48,shuffle=True)
loss=[]  # log loss on the held-out part of each fold
f1 = []
n=0
for trn_idx, test_idx in kf.split(X, y):
    # Fold-local names: the notebook-level hold-out X_val / y_val that objective()
    # reads must not be replaced by a fold's validation slice.
    X_tr,X_fold_val=np.array(X)[trn_idx],np.array(X)[test_idx]
    y_tr,y_fold_val=np.array(y)[trn_idx],np.array(y)[test_idx]
    model = LGBMClassifier(**Best_trial)
    # NOTE(review): the first monitored set is the notebook-level X_train, not this
    # fold's X_tr — confirm.
    model.fit(X_tr,y_tr,eval_set=[(np.array(X_train), np.array(y_train)), (X_fold_val, y_fold_val)],early_stopping_rounds=100,eval_metric = 'logloss', verbose=False)
#     preds+=model.predict_proba(X_test)
    loss.append(log_loss(y_fold_val, model.predict_proba(X_fold_val)))
    f1.append(f1_score(y_fold_val, model.predict(X_fold_val), average='macro'))
    print(f"fold: {n+1} ==> loss: {loss[n]}, f1_score: {f1[n]}")
    n+=1
print(np.mean(f1))

fold: 1 ==> loss: 0.16151938553027528, f1_score: 0.7560531038767961
fold: 2 ==> loss: 0.15427763369706507, f1_score: 0.7531512319189466
fold: 3 ==> loss: 0.1555143084204252, f1_score: 0.7591204076266131
fold: 4 ==> loss: 0.15704243865426, f1_score: 0.760549232003195
fold: 5 ==> loss: 0.1567530727579862, f1_score: 0.7485983514094695
0.755494465367004


In [51]:
from sklearn.model_selection import KFold

# Same 5-fold evaluation as the macro-F1 cell, but f1_score is called with its
# defaults instead of average='macro'.
# NOTE(review): `preds` is allocated from X_test but never updated (the accumulation
# line in the loop is commented out), and X_test must already exist in the kernel.
# NOTE(review): best_trial.params contains only the sampled values; the constants fixed
# inside objective() are not passed here — confirm that is intended.
Best_trial = study.best_trial.params
preds = np.zeros((len(X_test), 2))
kf = KFold(n_splits=5,random_state=48,shuffle=True)
loss=[]  # log loss on the held-out part of each fold
f1 = []
n=0
for trn_idx, test_idx in kf.split(np.array(X), np.array(y)):
    # Fold-local names: the notebook-level hold-out X_val / y_val that objective()
    # reads must not be replaced by a fold's validation slice.
    X_tr,X_fold_val=np.array(X)[trn_idx],np.array(X)[test_idx]
    y_tr,y_fold_val=np.array(y)[trn_idx],np.array(y)[test_idx]
    model = LGBMClassifier(**Best_trial)
    # NOTE(review): the first monitored set is the notebook-level X_train, not this
    # fold's X_tr — confirm.
    model.fit(X_tr,y_tr,eval_set=[(np.array(X_train), np.array(y_train)), (X_fold_val, y_fold_val)],early_stopping_rounds=100,eval_metric = 'logloss', verbose=False)
#     preds+=model.predict_proba(X_test)
    loss.append(log_loss(y_fold_val, model.predict_proba(X_fold_val)))
    f1.append(f1_score(y_fold_val, model.predict(X_fold_val)))
    print(f"fold: {n+1} ==> loss: {loss[n]}, f1_score: {f1[n]}")
    n+=1

fold: 1 ==> loss: 0.16151938553027528, f1_score: 0.546632723906503
fold: 2 ==> loss: 0.15427763369706507, f1_score: 0.5388059701492537
fold: 3 ==> loss: 0.1555143084204252, f1_score: 0.5509761388286335
fold: 4 ==> loss: 0.15704243865426, f1_score: 0.554268148498463
fold: 5 ==> loss: 0.1567530727579862, f1_score: 0.5307125307125307


In [53]:
from sklearn.model_selection import KFold

# Same 5-fold evaluation as the macro-F1 cell, but F1 is reported with label 0 as the
# positive class (pos_label=0).
# NOTE(review): `preds` is allocated from X_test but never updated (the accumulation
# line in the loop is commented out), and X_test must already exist in the kernel.
# NOTE(review): best_trial.params contains only the sampled values; the constants fixed
# inside objective() are not passed here — confirm that is intended.
Best_trial = study.best_trial.params
preds = np.zeros((len(X_test), 2))
kf = KFold(n_splits=5,random_state=48,shuffle=True)
loss=[]  # log loss on the held-out part of each fold
f1 = []
n=0
for trn_idx, test_idx in kf.split(np.array(X), np.array(y)):
    # Fold-local names: the notebook-level hold-out X_val / y_val that objective()
    # reads must not be replaced by a fold's validation slice.
    X_tr,X_fold_val=np.array(X)[trn_idx],np.array(X)[test_idx]
    y_tr,y_fold_val=np.array(y)[trn_idx],np.array(y)[test_idx]
    model = LGBMClassifier(**Best_trial)
    # NOTE(review): the first monitored set is the notebook-level X_train, not this
    # fold's X_tr — confirm.
    model.fit(X_tr,y_tr,eval_set=[(np.array(X_train), np.array(y_train)), (X_fold_val, y_fold_val)],early_stopping_rounds=100,eval_metric = 'logloss', verbose=False)
#     preds+=model.predict_proba(X_test)
    loss.append(log_loss(y_fold_val, model.predict_proba(X_fold_val)))
    f1.append(f1_score(y_fold_val, model.predict(X_fold_val), pos_label=0))
    print(f"fold: {n+1} ==> loss: {loss[n]}, f1_score: {f1[n]}")
    n+=1

fold: 1 ==> loss: 0.16151938553027528, f1_score: 0.9654734838470894
fold: 2 ==> loss: 0.15427763369706507, f1_score: 0.9674964936886397
fold: 3 ==> loss: 0.1555143084204252, f1_score: 0.9672646764245928
fold: 4 ==> loss: 0.15704243865426, f1_score: 0.9668303155079272
fold: 5 ==> loss: 0.1567530727579862, f1_score: 0.9664841721064084


In [54]:
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest on (X, y) and rank its features by importance, most important
# first. The seed is fixed so the ranking is the same on every run.
clf = RandomForestClassifier(random_state=42)
clf = clf.fit(X, y)
importances = clf.feature_importances_
# Spread of each feature's importance across the individual trees (only used by the
# commented-out plot below).
std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)

indices = np.argsort(importances)[::-1]

print("Feature ranking:")
# Iterate over the forest's own importance vector; its length is the number of
# features `clf` was fitted on, independent of whatever X_train currently holds.
for f in range(len(indices)):
    print("{}. feature {} ({:.3f})".format(f + 1, indices[f], importances[indices[f]]))
    
# plt.figure()
# plt.title("Feature ranking:")
# plt.bar(range(X.shape[1]), importances[indices],
#         color='r', yerr=std[indices], align='center', alpha=0.3)
# plt.xticks(range(X.shape[1]), X.columns[indices], rotation=45)
# plt.xlim([-1, X.shape[1]])
# plt.show()

Feature ranking:
1. feature 1 (0.041)
2. feature 0 (0.038)
3. feature 2 (0.035)
4. feature 3 (0.033)
5. feature 69 (0.030)
6. feature 79 (0.016)
7. feature 19 (0.014)
8. feature 24 (0.014)
9. feature 9 (0.013)
10. feature 54 (0.012)
11. feature 49 (0.012)
12. feature 4 (0.010)
13. feature 82 (0.010)
14. feature 44 (0.010)
15. feature 87 (0.009)
16. feature 94 (0.009)
17. feature 99 (0.009)
18. feature 81 (0.009)
19. feature 92 (0.009)
20. feature 91 (0.009)
21. feature 107 (0.009)
22. feature 39 (0.009)
23. feature 97 (0.009)
24. feature 57 (0.009)
25. feature 27 (0.009)
26. feature 89 (0.009)
27. feature 96 (0.009)
28. feature 106 (0.009)
29. feature 104 (0.009)
30. feature 56 (0.009)
31. feature 71 (0.009)
32. feature 86 (0.009)
33. feature 25 (0.009)
34. feature 52 (0.008)
35. feature 17 (0.008)
36. feature 26 (0.008)
37. feature 37 (0.008)
38. feature 95 (0.008)
39. feature 77 (0.008)
40. feature 80 (0.008)
41. feature 51 (0.008)
42. feature 85 (0.008)
43. feature 34 (0.008)
44. fe