## Test Out Different models - XGBOOST and Logistic Regression

In [257]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import log_loss,precision_recall_fscore_support

#### Read in train and validataion set

In [19]:
train = pd.read_csv("../data/train.csv")
val = pd.read_csv("../data/val.csv")

In [20]:
train=train.drop('Unnamed: 0', axis =1)
val=val.drop('Unnamed: 0', axis =1)

In [22]:
def split_response(df, response):
    y = df[response].values
    df.drop([response],axis=1,inplace=True)
    return df,y

In [23]:
X_train,y_train = split_response(train,'any_spot')
X_val,y_val = split_response(val,'any_spot')

In [266]:
def f05_score_hard(labels, preds):
    tp = np.sum((labels==preds) & (labels==1))
    tn = np.sum((labels==preds) & (labels==0))
    fp = np.sum(preds==1)-tp
    fn = np.sum(preds==0)-tn
    #print(tp, tn, fp, fn)
    p = tp*1.0/(tp+fp)
    r = tp*1.0/(tp+fn)
    print (p,r)
    score = 1.25*p*r/(0.25*p+r)
    return score

### XGBOOST

In [27]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval= xgb.DMatrix(X_val, label=y_val)

In [251]:
params = {
    'eta': 0.1,
    'max_depth':5,
    'min_child_weight': 1,
    'subsample': 0.8,
    'objective':'binary:logistic',
    'colsample_bytree':1,
    "eval_metric" : ['logloss']
}

In [252]:
watchlist = [(dtrain, 'train'), (dval, 'val')]
bst_bow = xgb.train(params, dtrain, 500, watchlist, early_stopping_rounds=150, verbose_eval=50)

[0]	train-logloss:0.660003	val-logloss:0.672735
Multiple eval metrics have been passed: 'val-logloss' will be used for early stopping.

Will train until val-logloss hasn't improved in 150 rounds.
[50]	train-logloss:0.249943	val-logloss:0.58587
[100]	train-logloss:0.148886	val-logloss:0.648194
[150]	train-logloss:0.099555	val-logloss:0.701489
[200]	train-logloss:0.071842	val-logloss:0.748835
Stopping. Best iteration:
[51]	train-logloss:0.247511	val-logloss:0.584673



In [253]:
clf = xgb.train(params, dtrain, num_boost_round=bst_bow.best_ntree_limit-1)

In [259]:
out = clf.predict(dval)
pred = out>0.5

In [261]:
log_loss(y_val, pred)

9.7417574189126395

In [267]:
f05_score_hard(y_val, pred)

0.642857142857 0.346153846154


0.54878048780487809

## Logistic Regression

In [268]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [269]:
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [270]:
X_train_n = scaler.transform(X_train)

In [271]:
X_val_n=scaler.transform(X_val)

In [307]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(C=1e8)

In [308]:
log_reg.fit(X_train_n,y_train)

LogisticRegression(C=100000000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [309]:
lrpredict = log_reg.predict(X_val_n)

In [310]:
log_loss(y_val, lrpredict)

11.513017726212173

In [311]:
f05_score_hard(y_val, lrpredict)

0.5 0.346153846154


0.45918367346938777