In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
import optuna
import  os
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from utils import Optuna_for_LGB
from preprocess import convert_notation, region_encoding, grouping_by_region, group_to_feature, require_median_dict
PATH = '../data/'
TRAIN = False

In [2]:
train_df = pd.read_csv(PATH + 'train.csv')
test_df = pd.read_csv(PATH + 'test.csv')
train_df = convert_notation(train_df)
test_df = convert_notation(test_df)
train_df, test_df, le = region_encoding(train_df, test_df)
columns = list(test_df.columns)
columns.remove('index')
columns.remove('region')
group = grouping_by_region(train_df, test_df, columns.copy())
na_train = group_to_feature(train_df[test_df.columns].copy(), group, columns.copy())
na_test = group_to_feature(test_df.copy(), group, columns.copy())
median_dict = require_median_dict(na_test, test_df)
print(median_dict.keys())
print(train_df.shape, test_df.shape)
train_df.head()

dict_keys(['acousticness', 'positiveness', 'danceability', 'energy', 'liveness', 'speechiness', 'instrumentalness', 'popularity_mean', 'popularity_std', 'popularity_kurtosis', 'popularity_skew', 'duration_ms_mean', 'duration_ms_std', 'duration_ms_kurtosis', 'duration_ms_skew', 'acousticness_mean', 'acousticness_std', 'acousticness_kurtosis', 'acousticness_skew', 'positiveness_mean', 'positiveness_std', 'positiveness_kurtosis', 'positiveness_skew', 'danceability_mean', 'danceability_std', 'danceability_kurtosis', 'danceability_skew', 'loudness_mean', 'loudness_std', 'loudness_kurtosis', 'loudness_skew', 'energy_mean', 'energy_std', 'energy_kurtosis', 'energy_skew', 'liveness_mean', 'liveness_std', 'liveness_kurtosis', 'liveness_skew', 'speechiness_mean', 'speechiness_std', 'speechiness_kurtosis', 'speechiness_skew', 'instrumentalness_mean', 'instrumentalness_std', 'instrumentalness_kurtosis', 'instrumentalness_skew', 'tempo_mean', 'tempo_std', 'tempo_kurtosis', 'tempo_skew', 'tempo_min_

Unnamed: 0,index,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo,region,tempo_min,tempo_max
0,0,10,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,31,7,121,152
1,1,8,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,23,8,153,176
2,2,3,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,12,4,64,76
3,3,10,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,15,2,177,192
4,4,3,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,23,19,97,120


In [3]:
objective_args = {
    'num_leaves': {
        'type': 'int',
        'suggest_args': {
            'name': 'num_leaves',
            'low': 2,
            'high': 128,
        }
    },
    'max_depth': {
        'type': 'int',
        'suggest_args': {
            'name': 'max_depth',
            'low': 3,
            'high': 8,
        }
    },
    'min_data_in_leaf': {
        'type': 'int',
        'suggest_args': {
            'name': 'min_data_in_leaf',
            'low': 5,
            'high': 90,
        }
    },
    'n_estimators': {
        'type': 'int',
        'suggest_args': {
            'name': 'n_estimators',
            'low': 100,
            'high': 1000,
        }
    },
    'learning_rate': {
        'type': 'uniform',
        'suggest_args': {
            'name': 'learning_rate',
            'low': 0.0001,
            'high': 0.1
        }
    },
    'bagging_fraction': {
        'type': 'uniform',
        'suggest_args': {
            'name': 'bagging_fraction',
            'low': 0.0001,
            'high': 1.0,
        }
    },
    'feature_fraction': {
        'type': 'uniform',
        'suggest_args': {
            'name': 'feature_fraction',
            'low': 0.0001,
            'high': 1.0,
        }
    },
    'random_state': {
        'type': 'default',
        'value': 0
    },
    'objective': {
        'type': 'default',
        'value': 'cross_entropy'
    },
    'num_class': {
        'type': 'default',
        'value': 11
    },
    # 'class_weight': {
    #     'type': 'default',
    #     'value': class_weight
    # }
    
}
def evaluate_macroF1_lgb(true, pred):  
    # this follows the discussion in https://github.com/Microsoft/LightGBM/issues/1483
    pred = pred.reshape(len(np.unique(true)), -1).argmax(axis=0)
    f1 = f1_score(true, pred, average='macro')
    return ('macroF1', f1, True) 

pipeline_args = {
    'fit_attr': 'fit',
    'pred_attr': 'predict',
    'fit_args': {
        # 'X': x_train,
        # 'y': y_train,
        # 'eval_set': (x_test, y_test),
        'eval_metric': evaluate_macroF1_lgb,
        'eval_names': ['validation'],
        'early_stopping_rounds': 50,
        'verbose': -1,
        # 'feature_name': columns
        'categorical_feature': ['region']
    },
    # 'pred_args': {'X': x_test},
    'metric': lambda true, pred: f1_score(true, pred, average='macro'),
    # 'metric_args': {'true': y_test},
    'model': lgb.LGBMClassifier,
}

In [4]:
%%capture
columns = list(test_df.columns)
columns.remove('index')
target_columns = columns.copy()
target_columns.remove('region')
cv_num = 10
kf = StratifiedKFold(n_splits=cv_num, shuffle=True, random_state=0)
name = 'lgb_smote'
x, y = train_df[columns], train_df.genre
ofl = Optuna_for_LGB()
optuna.logging.disable_default_handler()
score = list()
for cv, (train_valid_idx, test_idx) in enumerate(kf.split(np.zeros(train_df.shape[0]), y)):
    x_tv, x_test = x.iloc[train_valid_idx], x.iloc[test_idx]
    y_tv, y_test = y.iloc[train_valid_idx], y.iloc[test_idx]
    x_test = group_to_feature(x_test, group, target_columns)
    for col, median in median_dict.items():
        x_test[col] = x_test[col].fillna(value=median)

    cv_score = list()
    for valid_cv, (train_idx, valid_idx) in enumerate(kf.split(np.zeros(x_tv.shape[0]), y_tv)):
        x_train, x_valid = x_tv.iloc[train_idx], x_tv.iloc[valid_idx]
        y_train, y_valid = y_tv.iloc[train_idx], y_tv.iloc[valid_idx]

        x_train = group_to_feature(x_train, group, target_columns)
        x_valid = group_to_feature(x_valid, group, target_columns)

        x_train_fill = x_train.copy()
        for col, median in median_dict.items():
            x_train_fill[col] = x_train_fill[col].fillna(value=median)
            x_valid[col] = x_valid[col].fillna(value=median)
        x_train = x_train.reset_index(drop=True)
        y_train = y_train.reset_index(drop=True)
        x_train_fill = x_train_fill.reset_index(drop=True)

        smote = SMOTE(random_state=0, n_jobs=-1, k_neighbors=5)
        ss_col = list(x_train.columns)
        ss_col.remove('region')
        ss = StandardScaler()
        x_train_fill[ss_col] = ss.fit_transform(x_train_fill[ss_col])
        x_train_fill = pd.get_dummies(x_train_fill, columns=['region'])
        region_col = list(x_train_fill.columns)[-20:]
        x_train_fill[region_col] /= 100.0
        x_sample, y_sample = smote.fit_resample(x_train_fill.copy(), y_train.copy())
        x_sample = pd.DataFrame(x_sample, columns=x_train_fill.columns)
        x_sample[ss_col] = ss.inverse_transform(x_sample[ss_col])
        region_onehot = x_sample[region_col].copy()
        x_sample = x_sample.drop(columns=region_col)
        x_sample['region'] = region_onehot.idxmax(1).apply(lambda x: int(x.split('_')[1]))

        x_train = x_sample
        y_train = y_sample

        pipeline_args['fit_args']['X'] = x_train
        pipeline_args['fit_args']['y'] = y_train
        pipeline_args['fit_args']['eval_set'] = (x_valid, y_valid)
        pipeline_args['fit_args']['feature_name'] = list(x_train.columns)
        pipeline_args['pred_args'] = {'X': x_valid}
        pipeline_args['metric_args'] = {'true': y_valid}
        if TRAIN:
            params = ofl.parameter_tuning(pipeline_args, objective_args, 1000, -1, 0)

            best_iteration = params.pop('best_iteration_')
            model = lgb.LGBMClassifier(**params)
            model.fit(**pipeline_args['fit_args'])
            y_pred = model.predict_proba(x_test, model.best_iteration_)
            model.booster_.save_model('../model/{0}_{1}_{2}.txt'.format(name, cv, valid_cv))
        else:
            model = lgb.Booster(model_file='../model/{0}_{1}_{2}.txt'.format(name, cv, valid_cv))
            y_pred = model.predict(x_test).astype(np.float64)

        cv_score.append(f1_score(y_test, np.argmax(y_pred, axis=1), average='macro'))
    score.append(cv_score)

In [5]:
score, np.mean(score, axis=1), np.mean(score)

([[0.23178716141032127,
   0.24224813674147294,
   0.24314262571587336,
   0.19968816575989143,
   0.23251148522070586,
   0.19936690704940171,
   0.2851503693737444,
   0.2479473854600289,
   0.19917251340304382,
   0.24212268289101527],
  [0.20890733024173194,
   0.30044165880004176,
   0.24419984062638792,
   0.2905669956110037,
   0.2334111571263274,
   0.20241505281962208,
   0.26913780406465343,
   0.32695618288978806,
   0.2250174586437306,
   0.2655598114486518],
  [0.22286475413542683,
   0.23744992761852626,
   0.16968261389018754,
   0.19300923680161564,
   0.23495733500011576,
   0.13331319464234825,
   0.24259979234434914,
   0.20702900470084507,
   0.24360274808418214,
   0.19530521843166707],
  [0.24278274075510134,
   0.1940739697601543,
   0.31095671155597204,
   0.2069774754609831,
   0.2552284169620322,
   0.1528876608390085,
   0.2582963729557275,
   0.20947382720638535,
   0.1710589085897107,
   0.25961905345082914],
  [0.2306482761061555,
   0.2523567982055987,
  

In [6]:
sub_df = pd.read_csv(PATH + 'sample_submit.csv', header=None, names=['ID', 'Pred'])
prediction = np.zeros((test_df.shape[0], 11), dtype=np.float64)
x = test_df[columns]
for i in range(cv_num):
    for j in range(cv_num):
        model = lgb.Booster(model_file='../model/{0}_{1}_{2}.txt'.format(name, i, j))
        pred = model.predict(x).astype(np.float64)
        prediction += pred
prediction /= 100.0
sub_df.Pred = np.argmax(prediction, axis=1)
sub_df.head()

LightGBMError: The number of features in data (14) is not the same as it was in training data (118).
You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.