In [21]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb

# Run helper_functions.py inside this notebook's own namespace instead of
# importing it, so every name the file defines becomes a notebook global.
# NOTE(review): presumably the helpers also read notebook globals created in
# later cells -- confirm before swapping this for a regular import.
with open("helper_functions.py") as f:
    # Compile with the real filename so tracebacks point at the helper file.
    code = compile(f.read(), "helper_functions.py", 'exec')

# The file handle is already closed by the time the helper code executes.
exec(code)

In [22]:
# Seeded NumPy random state; nothing else in this cell references it.
rng = np.random.RandomState(0)

# Load the training rows and tag them with their origin.
print('Reading in train data..')
train = pd.read_csv('train.csv')
train['type'] = 'train'

# Load the test rows, tag them, and add empty outcome columns (same column
# order as before: OutcomeSubtype first, then OutcomeType).
print('Reading in test data..')
test = pd.read_csv('test.csv')
test['type'] = 'test'
test['OutcomeSubtype'] = test['OutcomeType'] = ''

# data_import / prep_data / col_check are not defined in this notebook's cells;
# presumably they come from helper_functions.py -- verify there.
df = data_import(train, test)

# Build the model inputs for each split from the same combined frame.
print('Running data preparation for train dataset')
X_train, y_train, le_train, X_train_cols = prep_data(df, 'train')

print('Running data preparation for test dataset')
X_test, y_test, le_test, X_test_cols = prep_data(df, 'test')

# Compare the train and test column lists returned by prep_data.
col_check(X_train_cols, X_test_cols)

Reading in train data..
Reading in test data..
Running feature extraction process..
Dropping unused variables..
Running data preparation for train dataset
Encoding labels of the outcome variable..
Using one hot encoding for predictor variables..
Running data preparation for test dataset
Encoding labels of the outcome variable..
Using one hot encoding for predictor variables..
Columns are the same!!


In [26]:
# Baseline gradient-boosted classifier with hand-picked hyperparameters.
gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05)
gbm.fit(X_train, y_train)

# NOTE: the score is computed on the same rows the model was fitted on, so it
# is in-sample accuracy rather than a held-out estimate.
print('Model score is %r' % gbm.score(X_train, y_train))

# predict_output is not defined in this notebook's cells; presumably it comes
# from helper_functions.py -- verify what it reads and writes there.
predict_output(gbm, 'xgboost')

Model score is 0.6711437015975158
Predicting outcomes..
Correct number of rows
Saving to CSV..
Done!


In [None]:
from sklearn.pipeline import Pipeline
try:
    # scikit-learn >= 0.18 ships GridSearchCV here; the legacy module below
    # was removed in 0.20.
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # Fallback for old scikit-learn releases that only have the legacy module.
    from sklearn.grid_search import GridSearchCV

# Grid search over XGBoost hyperparameters. The classifier is wrapped in a
# single-step pipeline so its parameters are addressed as `gbm__<name>`.
gbm = xgb.XGBClassifier()
pipe = Pipeline(steps=[('gbm', gbm)])

# Candidate values: 5 * 3 * 3 = 45 combinations, each cross-validated.
n_estimators = [20, 50, 100, 200, 500]
max_depth = [3, 4, 5]
learning_rate = [0.1, 0.05, 0.01]

estimator_gbm = GridSearchCV(pipe, dict(
                              gbm__n_estimators=n_estimators,
                              gbm__max_depth=max_depth,
                              gbm__learning_rate=learning_rate))

print ('Fitting the model to the training dataset..')
estimator_gbm.fit(X_train, y_train)

# Best mean cross-validated score and the pipeline that produced it.
best_score = estimator_gbm.best_score_
best_est = estimator_gbm.best_estimator_

print ('Best score is %r for GBM' % best_score)
print ('best estimators are as follows \n %r' % best_est)

# The fitted search object itself is handed to the helper.
# predict_output is presumably defined in helper_functions.py -- verify there.
predict_output(estimator_gbm, 'xgboost_tuned')

Fitting the model to the training dataset..


Best score is 0.67125593924202176 for GBM
best estimators are as follows 
 Pipeline(steps=[('gbm', XGBClassifier(base_score=0.5, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=5, min_child_weight=1, n_estimators=200,
       nthread=-1, objective='multi:softprob', seed=0, silent=True,
       subsample=1))])


In [27]:
from sklearn import linear_model, decomposition
from sklearn.pipeline import Pipeline
try:
    # scikit-learn >= 0.18 ships GridSearchCV here; the legacy module below
    # was removed in 0.20.
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # Fallback for old scikit-learn releases that only have the legacy module.
    from sklearn.grid_search import GridSearchCV

# The grid below tries both L1 and L2 penalties, so pin a solver that accepts
# both. liblinear does; the default solver of newer scikit-learn releases
# (lbfgs) rejects penalty='l1'.
logistic = linear_model.LogisticRegression(solver='liblinear')

pca = decomposition.PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

# Search every PCA size from 1 up to the number of columns in X_train, three
# regularisation strengths (1e-4, 1, 1e4) and both penalty types.
n_comp = list(range(1, X_train.shape[1] + 1))
Cs = np.logspace(-4, 4, 3)
penalty = ['l1', 'l2']

print ('Estimating pipeline of PCA and Logistic Regression')

estimator_lr = GridSearchCV(pipe,
                         dict(pca__n_components=n_comp,
                              logistic__C=Cs, logistic__penalty=penalty), n_jobs=-1)

print ('Fitting estimator on train dataset..')
estimator_lr.fit(X_train, y_train)

# Best mean cross-validated score and the pipeline that produced it.
best_score = estimator_lr.best_score_
best_est = estimator_lr.best_estimator_

print ('Best score is %r for Logistic Regression' % best_score)
print ('best estimators are as follows \n %r' % best_est)

# The fitted search object itself is handed to the helper.
# predict_output is presumably defined in helper_functions.py -- verify there.
predict_output(estimator_lr, 'pca_logit')

Estimating pipeline of PCA and Logistic Regression
Fitting estimator on train dataset..
Best results for Logistic Regression
0.630064723708
Pipeline(steps=[('pca', PCA(copy=True, n_components=44, whiten=False)), ('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])


In [None]:
# RandomForestClassifier is not imported by any other cell of this notebook,
# so bring it into scope here to keep the cell runnable on a fresh kernel.
from sklearn.ensemble import RandomForestClassifier

# PCA followed by a random forest. `pca`, `Pipeline` and `GridSearchCV` are
# reused from the logistic-regression cell, which must have been run first.
rf = RandomForestClassifier()
pipe = Pipeline(steps=[('pca', pca), ('rf', rf)])

n_estimators = [20, 50, 100]
max_depth = [3, None]
# An integer min_samples_split must be at least 2: scikit-learn >= 0.18
# rejects 1, and a node needs two samples to be split anyway.
min_samples_split = [2, 3, 10]
min_samples_leaf = [1, 3, 10]
bootstrap = [True, False]
criterion = ["gini", "entropy"]

n_comp = list(range(1, X_train.shape[1] + 1))

# NOTE: this grid has 3 * 2 * 3 * 3 * 2 * 2 = 216 forest settings for every
# PCA size in n_comp, each of them cross-validated -- expect a very long run.
estimator_rf = GridSearchCV(pipe,
                         dict(pca__n_components=n_comp,
                              rf__n_estimators=n_estimators,
                              rf__max_depth=max_depth,
                              rf__min_samples_split=min_samples_split,
                              rf__min_samples_leaf=min_samples_leaf,
                              rf__bootstrap=bootstrap,
                              rf__criterion=criterion), n_jobs=-1)

print ('Fitting the model to the training dataset..')
estimator_rf.fit(X_train, y_train)

# Best mean cross-validated score and the pipeline that produced it.
best_score = estimator_rf.best_score_
best_est = estimator_rf.best_estimator_

print ('Best results for Random Forest')
print (best_score)
print (best_est)

Fitting the model to the training dataset..
