In [None]:
import time
from joblib import Parallel, delayed

import pandas as pd
import numpy as np
import category_encoders as ce

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import RandomizedSearchCV, GroupKFold
from sklearn.utils import shuffle
from sklearn import preprocessing, pipeline

from sklearn.metrics import accuracy_score, precision_score, recall_score, balanced_accuracy_score, f1_score

In [None]:
# Dataset selection: keep exactly ONE `pd.read_json` call active.
# Previously both files were read and the first result was discarded
# immediately, so the first read is disabled here; the dataset that is
# actually used is unchanged.

# Original dataset
# data = pd.read_json(r'original-dataset.json', orient='split')

# Original dataset without tenders with partial collusion
data = pd.read_json(r'original-dataset-without-partials.json', orient='split')

# Categorical columns handed to the category encoders via `cols=`.
cat_features = ['Site', 'Brazilian State']

In [None]:
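# Alternative configuration (disabled): enriched dataset.
# To use it, uncomment ONE of the `read_json` lines below together with the
# `cat_features` line; they rebind `data` and `cat_features`.

# Enriched dataset
# data = pd.read_json(r'enriched-dataset.json', orient='split')

# Enriched dataset without tenders with partial collusion
# data = pd.read_json(r'enriched-dataset-without-partials.json', orient='split')

# cat_features = ['Site', 'Brazilian State', 'natureza_juridica']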

In [None]:
# New column: 'Bid_value' minus 'Pre-Tender Estimate (PTE)', element-wise.
data['Difference Bid/PTE'] = data['Bid_value'].sub(data['Pre-Tender Estimate (PTE)'])

In [None]:
# Candidate categorical encoders offered to the hyper-parameter search.
# Every encoder targets `cat_features` and returns a DataFrame; all but
# HashingEncoder are additionally built with handle_unknown='value'.
# The list order is kept as-is because it influences seeded random sampling.
encoders = [
    encoder_cls(cols = cat_features, return_df = True, **extra_kwargs)
    for encoder_cls, extra_kwargs in (
        (ce.BinaryEncoder, {'handle_unknown': 'value'}),
        (ce.QuantileEncoder, {'handle_unknown': 'value'}),
        (ce.OneHotEncoder, {'handle_unknown': 'value'}),
        (ce.RankHotEncoder, {'handle_unknown': 'value'}),
        (ce.HashingEncoder, {}),
        (ce.CountEncoder, {'handle_unknown': 'value'}),
    )
]

In [None]:
# Candidate feature scalers / transformers offered to the hyper-parameter search.
# The list order is kept as-is because it influences seeded random sampling.
scalers = [
    preprocessing.StandardScaler(),
    preprocessing.MinMaxScaler(),
    preprocessing.MaxAbsScaler(),
    preprocessing.RobustScaler(quantile_range=(25, 75)),
    preprocessing.PowerTransformer(method='yeo-johnson'),
    # QuantileTransformer is tried with both supported output distributions.
    *(
        preprocessing.QuantileTransformer(output_distribution=distribution)
        for distribution in ('uniform', 'normal')
    ),
]

In [None]:
# Hyper-parameter search spaces, keyed by the (unfitted) estimator instance
# each space applies to. Keys prefixed with 'estimator__' address the
# 'estimator' step of the pipeline built in `model_fit_eval`; the 'encoder'
# and 'scaler' keys swap out whole pipeline steps.
models = {
    ExtraTreesClassifier(): {
        'estimator__n_estimators': np.linspace(50, 300, num = 10, dtype = int),
        # 'auto' was removed from this list: for classifiers it was an alias of
        # 'sqrt', it is deprecated since scikit-learn 1.1 and rejected from 1.3
        # on, so every candidate drawing it would fail to fit.
        'estimator__max_features': ['sqrt', 'log2', None],
        'estimator__criterion': ['gini', 'entropy', 'log_loss'],
        'estimator__min_samples_split': np.linspace(2, 20, num = 8, dtype = int),
        'encoder': encoders,
        'scaler': scalers
    },
    RandomForestClassifier(): {
        'estimator__n_estimators': np.linspace(50, 500, num = 10, dtype = int),
        'estimator__criterion': ['gini', 'entropy', 'log_loss'],
        'estimator__max_features': ['sqrt', 'log2', None],
        'estimator__min_samples_split': np.linspace(2, 20, num = 10, dtype = int),
        'encoder': encoders,
        'scaler': scalers
    },
    AdaBoostClassifier(): {
        # Nested 'estimator' is AdaBoost's base learner (parameter name used by
        # scikit-learn >= 1.2); trees of increasing depth are tried.
        'estimator__estimator': [DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=3), DecisionTreeClassifier(max_depth=6), DecisionTreeClassifier(max_depth=10)],
        'estimator__n_estimators': np.linspace(50, 500, num = 10, dtype = int),
        'estimator__learning_rate': np.linspace(0.01, 1.0, num = 10, dtype = float),
        'encoder': encoders,
        'scaler': scalers
    },
    GradientBoostingClassifier(): {
        'estimator__n_estimators': np.linspace(50, 200, num = 4, dtype = int),
        'estimator__learning_rate': np.linspace(0.01, 1.0, num = 10, dtype = float),
        'estimator__max_depth': np.linspace(3, 10, num = 4, dtype = int),
        'encoder': encoders,
        'scaler': scalers
    },
    MLPClassifier(): {
        # Single hidden layer of varying width.
        'estimator__hidden_layer_sizes': [(256,), (384,), (512,), (640,), (768,), (1024,)],
        'estimator__activation': ['identity', 'logistic', 'tanh', 'relu'],
        'estimator__solver': ['lbfgs', 'sgd', 'adam'],
        'estimator__max_iter': np.linspace(500, 1000, num = 15, dtype = int),
        'encoder': encoders,
        'scaler': scalers
    }
}

In [None]:
def model_fit_eval(df, model, param_grid):
    """Run a grouped randomized hyper-parameter search for ``model`` over 10 seeds.

    For every seed in 1..10 a ``RandomizedSearchCV`` (50 candidates, 5-fold
    ``GroupKFold``) is fitted on a pipeline ``encoder -> scaler -> estimator``
    and its ``cv_results_`` table is collected.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows. Must contain the target column ``'Collusive_competitor'``
        and the grouping column ``'Tender'``; every remaining column is used as
        a predictor. ``df`` itself is not modified.
    model : estimator
        Estimator placed in the ``'estimator'`` pipeline step. Its
        ``random_state`` attribute is overwritten with the current seed.
    param_grid : dict
        Parameter distributions passed to ``RandomizedSearchCV``. Entries named
        ``'encoder'`` / ``'scaler'`` replace the default pipeline steps.

    Returns
    -------
    pandas.DataFrame
        The ``cv_results_`` of all seeds stacked row-wise, with the extra
        columns ``'seed'``, ``'best_index'`` and ``'model'``.

    Notes
    -----
    Relies on the module-level ``cat_features`` list for the default encoder.
    """
    # Use the frame that was passed in (not the notebook-global one). The
    # shuffle uses a fixed random_state, so the row order is identical for every
    # seed and can be computed once up front; `shuffle` returns a new object,
    # so popping columns below leaves the caller's frame intact.
    predictors = shuffle(df, random_state = 0).reset_index(drop=True)
    targets = predictors.pop('Collusive_competitor')
    # Rows of the same tender must stay in the same fold.
    groups = predictors.pop('Tender')

    cv = GroupKFold(n_splits = 5)
    model_name = model.__class__.__name__.replace('Classifier', '')

    per_seed_results = []

    for seed in range(1, 11):

        model.random_state = seed

        # Default encoder/scaler; the search replaces them whenever
        # `param_grid` provides 'encoder' / 'scaler' candidates.
        steps = [
            ('encoder', ce.OneHotEncoder(cols = cat_features, handle_unknown = 'value', return_df = True)),
            ('scaler', preprocessing.StandardScaler()),
            ('estimator', model)
        ]

        pipe = pipeline.Pipeline(steps)

        search = RandomizedSearchCV(
            pipe,
            param_grid,
            cv = cv,
            n_iter = 50,
            scoring=['accuracy','balanced_accuracy', 'precision', 'recall', 'f1'],
            refit = 'balanced_accuracy',
            random_state = seed,
            n_jobs = -2
        )

        results = search.fit(predictors, targets, groups = groups)

        rf = pd.DataFrame(results.cv_results_)

        rf['seed'] = seed
        rf['best_index'] = results.best_index_
        rf['model'] = model_name

        per_seed_results.append(rf)

    # Single concat instead of growing a DataFrame inside the loop.
    return pd.concat(per_seed_results, ignore_index=True)

In [None]:
# Evaluate every model/search-space pair in parallel (all cores but one) and
# record the wall-clock duration in seconds.
start_time = time.time()
result = Parallel(n_jobs=-2)(
    delayed(model_fit_eval)(data, estimator, search_space)
    for estimator, search_space in models.items()
)
total = time.time() - start_time

In [None]:
# Elapsed time of the parallel run, converted from seconds to minutes.
total / 60

In [None]:
# Empty frame that will hold the combined evaluation results of all models.
data_eval_all = pd.DataFrame(data=[])

In [None]:
# Stack the per-model result tables in one concat. Rebuilding from `result`
# (rather than appending to the existing frame in a loop) keeps this cell
# idempotent: re-running it no longer duplicates rows.
# `pd.concat` raises on an empty list, so fall back to an empty frame.
data_eval_all = pd.concat(result, ignore_index=True) if result else pd.DataFrame([])