In [157]:
import joblib
import pandas as pd                
import glob, os

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier,
                              AdaBoostClassifier,
                              ExtraTreesClassifier,
                              StackingClassifier)

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import (StratifiedKFold,
                                    RandomizedSearchCV,
                                    cross_val_score,
                                    train_test_split)

from hyperopt import hp, fmin, tpe, STATUS_OK, Trials, space_eval

from myhelper import (extract_column_names,
                      data_prep,
                      sampling,
                      ColumnSelector)

from kaggle.api.kaggle_api_extended import KaggleApi

from myconfig import (TARGET_COL,
                      SAMPLE_SIZE,
                      SEED)  

import warnings
warnings.filterwarnings('ignore')

## helper

In [93]:
def generate_pipeline(model, col_filter, col_to_scale, power_tf=False, scaler_tf=False, **kwargs):
    
    steps = []
    transformers = []

    # Adding dataset column selector
    steps.append(('col_selector', ColumnSelector(col_filter)))
    # adding power transformer
    if power_tf and len(col_to_scale)>0:
        transformers.append(('power_tf', PowerTransformer(method='yeo-johnson', standardize=False), col_to_scale))
    # adding scaler transformer
    if scaler_tf and len(col_to_scale)>0:
        transformers.append(('data_tf', StandardScaler(), col_to_scale))
    # adding transformer to pipeline definition
    if len(transformers)>0:
        steps.append(('data_tf', ColumnTransformer(transformers=transformers, remainder='passthrough')))
    # adding model
    steps.append(('clf', model))
    # generate pipeline
    pipeline = Pipeline(steps)

    return pipeline

def process_pipeline(X, y, pipeline, space, max_evals):
    
    def objective(params):
        # define stratified cross-validation
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
        # set pipeline params
        pipeline.set_params(**params)
        # score pipeline
        score = -cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv).mean()
        # return result
        return {'loss': score, 'status': STATUS_OK}
    
    # hyperparametr tuning
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                max_evals=max_evals)

    # return 
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
    # set pipeline params
    pipeline.set_params(**space_eval(space, best))
    # score pipeline
    score = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv)
    res = {'best': space_eval(space, best), 'mean': score.mean(), 'std': score.std()}
    print(res)
    return res

    
def process_all_pipelines(X, y, config, max_evals=30):

    results = {}
    for p_name, p_config in config.items():
        # output message
        print('\nprocessing pipeline {}...'.format(p_name))
        # defining columns to scale
        cols = extract_column_names(X, p_config['col_filter'])
        col_to_scale = extract_column_names(X[cols], '^(?!{})'.format('fe2__'))
        # generate pipeline
        pipeline = generate_pipeline(col_to_scale=col_to_scale, **p_config)
        # hyperparameter tuning
        results[p_name] = process_pipeline(X, y, pipeline, p_config['space'], max_evals=max_evals)
    return results

def stacking_predictions(X, y, X_test, pipeline_config, param_config):

    estimators = []
    for p_name, p_config in pipeline_config.items():
        # defining columns to scale
        cols = extract_column_names(X, p_config['col_filter'])
        col_to_scale = extract_column_names(X[cols], '^(?!{})'.format('fe2__'))
        # generate pipeline
        pipeline = generate_pipeline(col_to_scale=col_to_scale, **p_config)
        pipeline.set_params(**param_config[p_name]['best'])
        estimator = ('{}'.format(p_name), pipeline)
        estimators.append(estimator)
    
    # stacking with LogisticRegression as the meta learner
    stacker = StackingClassifier(estimators=estimators,
                             final_estimator=LogisticRegression())
        
    # score meta-learner
#     cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
#     scores = cross_val_score(stacker, X, y, scoring='roc_auc', cv=cv)
#     print('mean: {}, std: {}'.format(scores.mean(), scores.std()))
    
    # return predictions
    preds = stacker.fit(X, y).predict_proba(X_test)[:, 1]
    cols = ['Id', 'Choice']
    X_test['Choice'] = preds
    X_test['Id'] = X_test.index+1

    return X_test.loc[:, cols]


## pipeline generation

### pipeline configuration

In [161]:
def load_pipeline_config():
    config = {}
    #Define the hyperparameter configuration space for lgbm
    space = {
            'clf__n_estimators': hp.choice('clf__n_estimators', range(10, 101)),
            'clf__max_depth': hp.choice('clf__max_depth', range(5, 51)),
            'clf__min_child_weight': hp.quniform('clf__min_child_weight', 0.0 , 0.2, 0.01),
            'clf__learning_rate': hp.quniform('clf__learning_rate', 0.005, 0.3, 0.01),
            'clf__subsample': hp.quniform('clf__subsample', 0.1, 1.0, 0.05),
            'clf__colsample_bylevel': hp.quniform('clf__colsample_bylevel', 0.1, 1.0, 0.05),
            'clf__colsample_bytree': hp.quniform('clf__colsample_bytree', 0.1, 1.0, 0.05)
            }
    config['lgbm_1'] = {'model': LGBMClassifier(),
                      'col_filter': '^fe0__',
                      'power_tf': True,
                      'scaler_tf': True,
                      'space': space}

    config['lgbm_2'] = {'model': LGBMClassifier(),
                      'col_filter': '^(fe0__|fe2__)',
                      'power_tf': True,
                      'scaler_tf': True,
                      'space': space}
    config['lgbm_4'] = {'model': LGBMClassifier(),
                      'col_filter': '^fe0__',
                      'power_tf': False,
                      'scaler_tf': False,
                      'space': space}

    config['lgbm_5'] = {'model': LGBMClassifier(),
                      'col_filter': '^(fe0__|fe2__)',
                      'power_tf': False,
                      'scaler_tf': False,
                      'space': space}

    # Define the hyperparameter configuration space for xgboost
    space = {
            'clf__n_estimators': hp.choice('clf__n_estimators', range(10, 101)),
            'clf__max_depth': hp.choice('clf__max_depth', range(5, 51)),
            'clf__min_child_weight': hp.quniform('clf__min_child_weight', 0.0 , 0.2, 0.01),
            'clf__learning_rate': hp.quniform('clf__learning_rate', 0.005, 0.3, 0.01),
            'clf__subsample': hp.quniform('clf__subsample', 0.1, 1.0, 0.05),
            'clf__colsample_bylevel': hp.quniform('clf__colsample_bylevel', 0.1, 1.0, 0.05),
            'clf__colsample_bytree': hp.quniform('clf__colsample_bytree', 0.1, 1.0, 0.05),
            'clf__gamma': hp.quniform('clf__gamma', 0., 0.5, 0.01),
            }
    config['xgboost_1'] = {'model': XGBClassifier(),
                      'col_filter': '^fe0__',
                      'power_tf': True,
                      'scaler_tf': True,
                      'space': space}

    config['xgboost_2'] = {'model': LGBMClassifier(),
                      'col_filter': '^(fe0__|fe2__)',
                      'power_tf': True,
                      'scaler_tf': True,
                      'space': space}
    config['xgboost_3'] = {'model': XGBClassifier(),
                      'col_filter': '^fe0__',
                      'power_tf': False,
                      'scaler_tf': False,
                      'space': space}

    config['xgboost_4'] = {'model': LGBMClassifier(),
                      'col_filter': '^(fe0__|fe2__)',
                      'power_tf': False,
                      'scaler_tf': False,
                      'space': space}

    # Define the hyperparameter configuration space for RF
#     space = {
#             'clf__n_estimators': hp.choice('clf__n_estimators', range(20, 205, 5)),
#             'clf__max_depth': hp.choice('clf__max_depth', range(5, 51)),
#             'clf__max_features' : hp.choice('clf__max_features', ['sqrt','log2',0.2,0.5,0.8]),
#             'clf__criterion' : hp.choice('clf__criterion', ['gini','entropy']),    
#             'clf__min_samples_leaf': hp.choice('clf__min_samples_leaf', range(1, 10)),
#             'clf__min_samples_split': hp.choice('clf__min_samples_split', range(5, 20, 5))
#             }
#     config['RF_1'] = {'model': RandomForestClassifier(),
#                       'col_filter': '^fe0__',
#                       'power_tf': True,
#                       'scaler_tf': True,
#                       'space': space}

#     config['RF_2'] = {'model': RandomForestClassifier(),
#                       'col_filter': '^(fe0__|fe2__)',
#                       'power_tf': True,
#                       'scaler_tf': True,
#                       'space': space}

    # Define the hyperparameter configuration space for Adaboost   
#     space = {
#             'clf__n_estimators': hp.choice('clf__n_estimators', range(20, 205, 5)),
#             'clf__learning_rate': hp.quniform('clf__learning_rate', 0.005, 0.3, 0.01),
#             }
#     config['Adaboost_1'] = {'model': AdaBoostClassifier(),
#                       'col_filter': '^fe0__',
#                       'power_tf': True,
#                       'scaler_tf': True,
#                       'space': space}

#     config['Adaboost_2'] = {'model': AdaBoostClassifier(),
#                       'col_filter': '^(fe0__|fe2__)',
#                       'power_tf': True,
#                       'scaler_tf': True,
#                       'space': space}


#     # Define the hyperparameter configuration space for extratree
#     space = {
#             'clf__n_estimators': hp.choice('clf__n_estimators', range(20, 205, 5)),
#             'clf__max_depth': hp.choice('clf__max_depth', range(5, 51)),
#             'clf__max_features' : hp.choice('clf__max_features', ['sqrt','log2',0.2,0.5,0.8]),
#             'clf__criterion' : hp.choice('clf__criterion', ['gini','entropy']),    
#             'clf__min_samples_leaf': hp.choice('clf__min_samples_leaf', range(1, 10)),
#             'clf__min_samples_split': hp.choice('clf__min_samples_split', range(5, 20, 5))
#             }
#     config['extratree_1'] = {'model': ExtraTreesClassifier(),
#                       'col_filter': '^fe0__',
#                       'power_tf': True,
#                       'scaler_tf': True,
#                       'space': space}

#     config['extratree_2'] = {'model': ExtraTreesClassifier(),
#                       'col_filter': '^(fe0__|fe2__)',
#                       'power_tf': True,
#                       'scaler_tf': True,
#                       'space': space}

    return config

### processing all pipelines

In [162]:
test_mode = 0

# data prep
df = pd.read_csv('data/train.csv').pipe(data_prep)
X, y = df.drop(columns=TARGET_COL), df[TARGET_COL]
if test_mode:
    X, y = sampling(X,y,SAMPLE_SIZE, SEED)

config = load_pipeline_config()
res = process_all_pipelines(X, y, config, max_evals=50)

# saving dict to file
joblib.dump(res, 'data/best-hyperparameters')




processing pipeline lgbm_1...
100%|██████████| 50/50 [01:37<00:00,  1.95s/trial, best loss: -0.8729717816649156]
{'best': {'clf__colsample_bylevel': 0.4, 'clf__colsample_bytree': 0.15000000000000002, 'clf__learning_rate': 0.09, 'clf__max_depth': 13, 'clf__min_child_weight': 0.03, 'clf__n_estimators': 63, 'clf__subsample': 0.65}, 'mean': 0.8729717816649156, 'std': 0.008318871360400383}

processing pipeline lgbm_2...
100%|██████████| 50/50 [01:39<00:00,  2.00s/trial, best loss: -0.872760121840288] 
{'best': {'clf__colsample_bylevel': 1.0, 'clf__colsample_bytree': 0.2, 'clf__learning_rate': 0.1, 'clf__max_depth': 23, 'clf__min_child_weight': 0.0, 'clf__n_estimators': 52, 'clf__subsample': 0.35000000000000003}, 'mean': 0.872760121840288, 'std': 0.007306485761643223}

processing pipeline lgbm_4...
100%|██████████| 50/50 [00:25<00:00,  1.92trial/s, best loss: -0.874581636672116] 
{'best': {'clf__colsample_bylevel': 0.45, 'clf__colsample_bytree': 0.30000000000000004, 'clf__learning_rate': 0.

['data/best-hyperparameters']

## scoring pipelines individually

In [166]:
p_names = ['lgbm_1', 'lgbm_2', 'lgbm_5', 'lgbm_4',
           'xgboost_1', 'xgboost_2', 'xgboost_3', 'xgboost_4']
# data_tf = False

for p_name in p_names:
    
    # loading data + data prep
    df = pd.read_csv('data/train.csv').pipe(data_prep)
    X_test = pd.read_csv('data/test.csv').pipe(data_prep)
    X, y = df.drop(columns=TARGET_COL), df[TARGET_COL]

    # load pipeline config
    pipeline_config = load_pipeline_config()
    # load best parameters
    param_config = joblib.load('data/best-hyperparameters')

    # filter columns
    regex = pipeline_config[p_name]['col_filter']
    cols = extract_column_names(X, regex)
    data_tf = pipeline_config[p_name]['scaler_tf']
    X = X[cols]
    X_test = X_test[cols]

    # add transformer to pipeline definition
    steps = []
    if data_tf:
        data_tf_str = 'with-tf'
        scale_cols = extract_column_names(X, '^(?!{})'.format('fe2__'))
        steps.append(('data_tf',
                      ColumnTransformer(transformers=[
                        ('power_tf', PowerTransformer(method='yeo-johnson', standardize=False), scale_cols),
                        ('scaler_tf', StandardScaler(), scale_cols)
                      ], remainder='passthrough')))

    # add model to pipeline definition
    steps.append(('clf', pipeline_config[p_name]['model']))

    # create pipeline
    pipeline = Pipeline(steps)
    pipeline.set_params(**param_config[p_name]['best'])

    # generate preds
    preds = pipeline.fit(X, y).predict_proba(X_test)[:, 1]
    cols = ['Id', 'Choice']
    X_test['Choice'] = preds
    X_test['Id'] = X_test.index+1
    X_test.loc[:, cols].to_csv('data/submissions/round2- preds-{}-{}.csv'.format(p_name, 'with-tf' if data_tf else 'no-tf'), index=False)

print('done')

done


In [167]:
# kaggle authentication
api = KaggleApi()
api.authenticate()

# submist files in submission folder
for file_path in glob.glob("data/submissions/*.csv"):
    print('submitting {}...'.format(file_path))
    api.competition_submit(file_path,
                           'API Submission',
                           'predict-who-is-more-influential-in-a-social-network')
print('done')

submitting data/submissions/round2- preds-xgboost_3-no-tf.csv...


100%|██████████| 89.3k/89.3k [00:08<00:00, 10.5kB/s]


submitting data/submissions/round2- preds-lgbm_2-with-tf.csv...


100%|██████████| 140k/140k [00:07<00:00, 19.6kB/s] 


submitting data/submissions/round2- preds-xgboost_4-no-tf.csv...


100%|██████████| 140k/140k [00:06<00:00, 21.1kB/s] 


submitting data/submissions/round2- preds-lgbm_1-with-tf.csv...


100%|██████████| 140k/140k [00:06<00:00, 22.3kB/s] 


submitting data/submissions/round2- preds-xgboost_2-with-tf.csv...


100%|██████████| 140k/140k [00:05<00:00, 25.3kB/s] 


submitting data/submissions/round2- preds-lgbm_4-no-tf.csv...


100%|██████████| 140k/140k [00:08<00:00, 17.3kB/s] 


submitting data/submissions/round2- preds-lgbm_5-no-tf.csv...


100%|██████████| 140k/140k [00:06<00:00, 22.1kB/s] 


submitting data/submissions/round2- preds-xgboost_1-with-tf.csv...


100%|██████████| 90.1k/90.1k [00:05<00:00, 16.1kB/s]


done


## stacking only with best pipelines


In [168]:
test_mode = 0

df = pd.read_csv('data/train.csv').pipe(data_prep)
X, y = df.drop(columns=TARGET_COL), df[TARGET_COL]
if test_mode:
    X, y = sampling(X,y,SAMPLE_SIZE, SEED)
X_test = pd.read_csv('data/test.csv').pipe(data_prep)

In [173]:
# load pipeline config
pipeline_config = load_pipeline_config()
# restrict to best pipelines
# best_pipelines = ['lgbm_1', 'lgbm_2', 'xgboost_1', 'xgboost_2', 'extratree_1']
# best_pipelines = ['lgbm_2', 'xgboost_2']

best_pipelines = ['lgbm_1', 'lgbm_2', 'lgbm_5', 'lgbm_4',
                  'xgboost_1', 'xgboost_2', 'xgboost_3', 'xgboost_4']

best_pipelines = dict(filter(lambda x: x[0] in best_pipelines, pipeline_config.items()))

# load best parameters
param_config = joblib.load('data/best-hyperparameters')

res = stacking_predictions(X, y, X_test, best_pipelines, param_config)
res.to_csv('data/preds-stacking-with-best-pipelines.csv', index=False)


## kaggle submissions

In [174]:
# kaggle authentication
api = KaggleApi()
api.authenticate()

# submist files in submission folder
api.competition_submit('data/preds-stacking-with-best-pipelines.csv',
                       'API Submission',
                       'predict-who-is-more-influential-in-a-social-network')
print('done')

100%|██████████| 140k/140k [00:07<00:00, 19.4kB/s] 


done


## final score

&emsp;. private Score: <b>0.87103</b> (16th place)<br>
&emsp;. public Score: <b>0.87079</b> (10th place)<br><br>
