## Test Out Different models - XGBOOST and Logistic Regression

In [10]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import log_loss,precision_recall_fscore_support

#### Read in train and validataion set

In [11]:
train = pd.read_csv("../data/train.csv")
val = pd.read_csv("../data/val.csv")

In [12]:
train.head()

Unnamed: 0.1,Unnamed: 0,Street,From,To,Street.Length,any_spot,lat,lng,street_cluster,Minute,...,GMP_VACANT_TIME_pct,GMP_UNKNOWN_TIME_pct,COMM_OCCUPIED_TIME_pct,COMM_VACANT_TIME_pct,COMM_UNKNOWN_TIME_pct,TOTAL_TIME_spots,OP_TIME_spots,NONOP_TIME_spots,GMP_TIME_spots,COMM_TIME_spots
0,0,15,1,1,179.13297,1,37.750648,-122.418316,2,19,...,0.233428,0.005768,0.490227,0.500646,0.009127,27,20,7,16,3
1,1,18,16,42,52.74021,0,37.78404,-122.419381,6,42,...,0.0,0.0,0.0,0.0,0.0,10,0,10,0,0
2,2,24,8,21,52.405315,0,37.788499,-122.421892,6,38,...,0.0,0.0,0.0,0.0,0.0,13,0,13,0,0
3,3,24,13,46,52.191193,0,37.787084,-122.421757,6,38,...,0.0,0.0,0.0,0.0,0.0,13,0,13,0,0
4,4,12,10,46,52.32425,0,37.786592,-122.418214,6,20,...,0.0,0.0,0.0,0.0,0.0,14,0,14,0,0


In [13]:
train=train.drop('Unnamed: 0', axis =1)
val=val.drop('Unnamed: 0', axis =1)

In [14]:
def split_response(df, response):
    y = df[response].values
    df.drop([response],axis=1,inplace=True)
    return df,y

In [15]:
X_train,y_train = split_response(train,'any_spot')
X_val,y_val = split_response(val,'any_spot')

In [16]:
def f05_score_hard(labels, preds):
    tp = np.sum((labels==preds) & (labels==1))
    tn = np.sum((labels==preds) & (labels==0))
    fp = np.sum(preds==1)-tp
    fn = np.sum(preds==0)-tn
    #print(tp, tn, fp, fn)
    p = tp*1.0/(tp+fp)
    r = tp*1.0/(tp+fn)
    print (p,r)
    score = 1.25*p*r/(0.25*p+r)
    return score

### XGBOOST

In [17]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval= xgb.DMatrix(X_val, label=y_val)

In [75]:
params = {
    'eta': 0.08,
    'max_depth':8,
    'min_child_weight': 1,
    'subsample': 0.8,
    'objective':'binary:logistic',
    'colsample_bytree':0.4,
    "eval_metric" : ['logloss','auc']
}

In [76]:
watchlist = [(dtrain, 'train'), (dval, 'val')]
bst_bow = xgb.train(params, dtrain, 1000, watchlist, early_stopping_rounds=150, verbose_eval=50)
clf = xgb.train(params, dtrain, num_boost_round=bst_bow.best_ntree_limit-1)

out = clf.predict(dval)
pred = out>0.5
f05_score_hard(y_val, pred)

[0]	train-logloss:0.657444	train-auc:0.873581	val-logloss:0.671838	val-auc:0.691476
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 150 rounds.
[50]	train-logloss:0.204736	train-auc:0.998912	val-logloss:0.599205	val-auc:0.699704
[100]	train-logloss:0.118193	train-auc:0.999971	val-logloss:0.652036	val-auc:0.698595
[150]	train-logloss:0.079078	train-auc:1	val-logloss:0.695399	val-auc:0.691383
Stopping. Best iteration:
[20]	train-logloss:0.348283	train-auc:0.985575	val-logloss:0.564481	val-auc:0.726609

0.566666666667 0.326923076923


0.49418604651162784

In [253]:
clf = xgb.train(params, dtrain, num_boost_round=bst_bow.best_ntree_limit-1)

In [259]:
out = clf.predict(dval)
pred = out>0.5

In [261]:
log_loss(y_val, pred)

9.7417574189126395

In [267]:
f05_score_hard(y_val, pred)

0.642857142857 0.346153846154


0.54878048780487809

## Logistic Regression

In [268]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [269]:
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [270]:
X_train_n = scaler.transform(X_train)

In [271]:
X_val_n=scaler.transform(X_val)

In [307]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(C=1e8)

In [308]:
log_reg.fit(X_train_n,y_train)

LogisticRegression(C=100000000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [309]:
lrpredict = log_reg.predict(X_val_n)

In [310]:
log_loss(y_val, lrpredict)

11.513017726212173

In [311]:
f05_score_hard(y_val, lrpredict)

0.5 0.346153846154


0.45918367346938777