In [17]:
# import the libraries
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

In [18]:
def timer(start_time=None):
    """Two-phase stopwatch.

    Called with no (or a falsy) ``start_time`` it returns the current
    ``datetime`` so the caller can keep it as a start mark. Called with a
    start mark it prints the time elapsed since then, broken into hours,
    minutes and seconds, and returns ``None``.
    """
    if start_time:
        elapsed = (datetime.now() - start_time).total_seconds()
        hours, remainder = divmod(elapsed, 3600)
        minutes, seconds = divmod(remainder, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))
        return None
    # No start mark supplied: begin timing now.
    return datetime.now()

In [19]:
# Load the transformed commit data, using the parsed 'date' column as the index.
# `infer_datetime_format` is intentionally not passed: it is deprecated as of
# pandas 2.0, where format inference is already the default behaviour.
df = pd.read_csv(
    '../data/commits_transformed.csv',
    index_col='date',
    parse_dates=['date'],
)

In [20]:
# Columns removed before modelling. They are listed once and dropped in a
# single call, so the frame is copied once instead of once per column.
# A missing column still raises KeyError, exactly as the one-by-one drops did.
COLUMNS_TO_DROP = [
    "total_deletions",
    "total_additions",
    "total",
    "private",
    "fork",
    "size",
    "watchers_count",
    "has_issues",
    "has_downloads",
    "has_wiki",
    "forks_count",
    "open_issues_count",
    "forks",
    "open_issues",
    "watchers",
    "network_count",
    "type",
    "admin",
    "push",
    "pull",
    "total_files",
    # columns named language_*
    "language_C",
    "language_C#",
    "language_C++",
    "language_CSS",
    "language_CoffeeScript",
    "language_Go",
    "language_Java",
    "language_JavaScript",
    "language_PHP",
    "language_Perl",
    "language_Python",
    "language_R",
    "language_Ruby",
    "language_Scala",
    "language_Shell",
    "language_TypeScript",
]
df = df.drop(columns=COLUMNS_TO_DROP)

In [21]:
# Split the frame into a feature matrix X and a target vector Y.
# The split is purely positional: the first N_FEATURES columns are features and
# the column right after them is the target.
# NOTE(review): this relies on the column order left after the drops above;
# confirm that column index 37 really is the label.
N_FEATURES = 37
array = df.values
X, Y = array[:, :N_FEATURES], array[:, N_FEATURES]

In [22]:
# Candidate values for each XGBoost hyper-parameter explored by the search.
params = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
)

In [23]:
# Base estimator for the hyper-parameter search. Learning rate, tree count and
# objective are fixed here; the parameters listed in `params` are varied.
# NOTE(review): `silent` and `nthread` look like the legacy spellings of
# `verbosity` and `n_jobs` - confirm against the installed xgboost version.
xgb = XGBClassifier(
    objective='binary:logistic',
    n_estimators=600,
    learning_rate=0.02,
    nthread=1,
    silent=True,
)

In [24]:
# Search budget: every sampled candidate is fitted once per fold.
folds = 3
param_comb = 5

# shuffle=False keeps the rows in their existing order, so the splitter is
# deterministic without a seed. `random_state` only has an effect when
# shuffle=True, and scikit-learn >= 0.24 raises ValueError if it is set
# together with shuffle=False, so it is omitted here.
skf = StratifiedKFold(n_splits=folds, shuffle=False)

# Hand the splitter object to the search rather than the one-shot generator
# returned by skf.split(X, Y); the search calls split() on the data passed to
# fit(), which yields the same folds.
random_search = RandomizedSearchCV(
    xgb,
    param_distributions=params,
    n_iter=param_comb,
    scoring='roc_auc',
    n_jobs=4,
    cv=skf,
    verbose=3,
    random_state=1001,
)

# Time the whole search: the first timer() call returns the start mark, the
# second prints the elapsed time.
start_time = timer(None)
random_search.fit(X, Y)
timer(start_time)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed: 40.4min finished



 Time taken: 0 hours 49 minutes and 35.02 seconds.


In [25]:
# Report the outcome of the search: each heading is printed followed by its value.
# The "normalized gini" figure is computed from the best CV score as 2 * score - 1.
search_report = [
    ('\n All results:', random_search.cv_results_),
    ('\n Best estimator:', random_search.best_estimator_),
    ('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb),
     random_search.best_score_ * 2 - 1),
    ('\n Best hyperparameters:', random_search.best_params_),
]
for heading, value in search_report:
    print(heading)
    print(value)

# Persist the per-candidate CV results for later inspection.
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)


 All results:
{'mean_fit_time': array([395.63352784, 709.96507748, 720.31436189, 560.28063019,
       540.21017281]), 'std_fit_time': array([ 0.93342821, 46.72552618, 44.58463382, 13.97091798, 23.90768405]), 'mean_score_time': array([ 5.0712498 , 11.9163363 , 11.64624182,  9.7215627 ,  5.76317461]), 'std_score_time': array([0.09917679, 1.96742802, 1.05338611, 0.67214139, 0.8453187 ]), 'param_subsample': masked_array(data=[1.0, 0.6, 0.8, 1.0, 0.8],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_min_child_weight': masked_array(data=[5, 1, 5, 5, 1],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_max_depth': masked_array(data=[3, 5, 5, 5, 4],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_gamma': masked_array(data=[5, 1.5, 1, 5, 1],
             mask=[False, False, False, False, False],
       fil

In [26]:
# FIXME: `test` is not defined in any cell shown in this notebook, so this cell
# raises NameError (see the output below). The data to score has to be loaded
# and prepared before this call.
# NOTE(review): presumably it needs the same feature columns as X - confirm.
# Note: predict_proba returns predicted class probabilities, so despite its
# name `y_test` holds model output, not ground-truth labels.
y_test = random_search.predict_proba(test)

NameError: name 'test' is not defined