In [1]:
import pandas as pd
import numpy as np
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from scipy import stats

In [2]:
# Read the transformed commits table from CSV. The 'date' column is parsed
# as datetimes and becomes the DataFrame index.
df = pd.read_csv(
    '../data/commits_transformed.csv',
    parse_dates=['date'],
    index_col='date',
    infer_datetime_format=True,
)

In [3]:
# Separate the raw value matrix into inputs and target:
# columns 0..73 go to X, column 74 goes to Y.
array = df.values
X, Y = array[:, :74], array[:, 74]

In [4]:
# Hold out 20% of the rows as a test set; random_state is pinned to `seed`
# so the split is the same on every run.
test_size = 0.20
seed = 7
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

In [5]:
# Fit the model on the training data (200 trees, depth limit 5).
# The fit call is left as the last expression so the notebook displays
# the fitted estimator.
model = XGBClassifier(n_estimators=200, max_depth=5)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=5, min_child_weight=1, missing=None,
       n_estimators=200, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [6]:
# Predict on the held-out test rows, then apply round() to every
# predicted value and collect the results in a plain list.
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))

In [7]:
# Compare the predictions with the true test labels and report the
# accuracy as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy: %.2f%%" % (100.0 * accuracy,))

Accuracy: 79.28%


In [8]:
# Stratified 10-fold cross-validation of an XGBoost classifier, scored by
# ROC AUC. The cell is self-contained: it imports what it needs and reloads
# the data instead of relying on variables from earlier cells.
import xgboost
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
# load data ('date' is parsed as datetimes and used as the index)
df = pd.read_csv('../data/commits_transformed.csv', 
                 index_col='date', 
                 parse_dates=['date'],
                 infer_datetime_format=True)
# split data into X (columns 0..73) and Y (column 74)
array = df.values
X = array[:,0:74]
Y = array[:,74]
# CV model
model = xgboost.XGBClassifier(learning_rate=0.1, 
                              objective= 'binary:logistic', 
                              nthread=4, 
                              max_depth=5, 
                              n_estimators=500)
# random_state is intentionally not passed: StratifiedKFold only uses it when
# shuffle=True, and scikit-learn >= 0.24 raises ValueError if it is set while
# shuffle is False. Folds are therefore taken in row order, exactly as before.
# To shuffle instead, pass shuffle=True together with random_state.
kfold = StratifiedKFold(n_splits=10)
results = cross_val_score(model, X, Y, cv=kfold, scoring='roc_auc')
# scoring='roc_auc', so the reported numbers are the mean and standard
# deviation of the per-fold ROC AUC (scaled by 100), not accuracy.
print("ROC AUC: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 46.04% (9.27%)
