In [23]:
import pandas as pd
import numpy as np

data = pd.read_csv("OutliersRemoved2.csv")
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB # Gaussian Naive Bays
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier #stochastic gradient descent
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

In [30]:
import xgboost as xgb
from sklearn import cross_validation, metrics
from xgboost.sklearn import XGBClassifier
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4


In [11]:
# drop all non_numericals and year except for province
data = data.drop(data.columns[[0,1,3,4,5,6]],axis = 1)

In [12]:
# shuffle
def create_frame(d):
    df = pd.DataFrame.copy(d)
    df = shuffle(df)
    features = df.describe()
    # imputation by subbing with mean on copy of data
    for i in list(df):
        if(i != 'Province'):
            df[i] = df[i].fillna(features[i].loc["mean"])
    
    return df

In [15]:
# For this notebook we will first try to feature scale and then see if our logreg improves
df1 = create_frame(data)
X = df1.drop(['Province'], axis=1)
Y = df1['Province']
X = pd.DataFrame(data = MinMaxScaler().fit_transform(X), columns = X.columns)

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15)

In [17]:
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
print("logistic regression training accuracy: ",logreg.score(X_train,Y_train))
print("logistic regression testing accuracy: ",logreg.score(X_test,Y_test) )

logistic regression training accuracy:  0.44347146208
logistic regression testing accuracy:  0.418069087688


In [26]:
# improvement wasn't sufficient, moving on to tuning

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }
clf = GridSearchCV(LogisticRegression(penalty='l2'), param_grid)
grid_search = GridSearchCV(cv=None,
             estimator=LogisticRegression(C=1.0, intercept_scaling=1,   
               dual=False, fit_intercept=True, penalty='l2', tol=0.0001),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]})
grid_search.fit(X_train, Y_train)
print(grid_search.best_params_)


{'C': 1000}


In [27]:
# tuning also wasn't sufficient
logreg = LogisticRegression(C=1000)
logreg.fit(X_train, Y_train)
print("logistic regression training accuracy: ",logreg.score(X_train,Y_train))
print("logistic regression testing accuracy: ",logreg.score(X_test,Y_test) )

logistic regression training accuracy:  0.55949960907
logistic regression testing accuracy:  0.504871567759


In [21]:
# now try tuning xgboost

gradboost = xgb.XGBClassifier(n_estimators=1000)
gradboost.fit(X_train, Y_train)
Y_pred = gradboost.predict(X_test)
acc_perceptron = round(gradboost.score(X_train, Y_train) * 100, 2)
print("xgboost training accuracy: ",round(gradboost.score(X_train, Y_train), 4))
print("xgboost testing accuracy: ",round(gradboost.score(X_test, Y_test), 4))

print("tune xgboost using GridSearchCV")
print(gradboost.get_params())

xgboost training accuracy:  0.8327
xgboost testing accuracy:  0.775
tune xgboost using GridSearchCV
{'base_score': 0.5, 'colsample_bylevel': 1, 'colsample_bytree': 1, 'gamma': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 1, 'missing': None, 'n_estimators': 1000, 'nthread': -1, 'objective': 'multi:softprob', 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': 0, 'silent': True, 'subsample': 1}


In [33]:
# create xg models, does cross validation
def modelfit(alg,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train.values, label=Y_train.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds, show_progress=False)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(X_train, Y_train,eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(X_train)
    dtrain_predprob = alg.predict_proba(X_train)[:,1]
        
    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(Y_train.values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(Y_train, dtrain_predprob))
                    
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

In [34]:
#tune boosting params
from xgboost.sklearn import XGBClassifier
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
modelfit(xgb1, X_train)

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().