# Model Selection

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3 as sql3
import random

import os

# Seed both the stdlib and NumPy global generators so stochastic steps repeat.
random.seed(1000)
np.random.seed(1000)

sns.set(style="darkgrid")

# Project root. The PROJECTML_ROOT environment variable overrides it so the
# notebook can run on another machine; without it the original path is used.
ROOT_PATH = os.environ.get("PROJECTML_ROOT", "C:/Users/moshe/Documents/projectML/p1")
DATA_PATH = "%s/data/raw" % ROOT_PATH
DB_FILE = "%s/hairsalon.db" % DATA_PATH
def readSQL(query, db_file=None):
    """Run ``query`` against the SQLite database and return the rows as a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement handed to ``pandas.read_sql_query``.
    db_file : str, optional
        Path of the SQLite file to open. When omitted, the notebook-level
        ``DB_FILE`` constant is used.

    Returns
    -------
    pandas.DataFrame
        The result set of ``query``.

    Raises
    ------
    Whatever ``pandas.read_sql_query`` raises for a failing statement; the
    connection is closed before the exception propagates.
    """
    db = sql3.connect(DB_FILE if db_file is None else db_file)
    try:
        return pd.read_sql_query(query, db)
    finally:
        # Always release the connection, including when the query fails
        # (for example on a missing table).
        db.close()

# NOTE(review): this module-level connection is not used or closed by any cell
# visible here; readSQL opens and closes its own connection. Confirm nothing
# else needs `db` before removing it.
db=sql3.connect(DB_FILE)

In [3]:
def readXy(query, target='noshowflag', category_threshold=14):
    """Load a table through ``readSQL`` and split it into features and target.

    Parameters
    ----------
    query : str
        SQL statement forwarded to ``readSQL``.
    target : str, optional
        Name of the label column (default ``'noshowflag'``).
    category_threshold : int, optional
        Columns with fewer than this many distinct values are cast to the
        pandas ``category`` dtype (default 14). The target column is included
        in this cast when it qualifies.

    Returns
    -------
    (X, y)
        ``X`` holds every column after the first one, minus ``target``;
        ``y`` is the ``target`` column.
    """
    df = readSQL(query)

    low_cardinality = [c for c in df.columns if df[c].nunique() < category_threshold]
    for c in low_cardinality:
        df[c] = df[c].astype('category')

    # The first column is dropped as before (presumably the target or an id
    # column -- confirm against the table schema). The target is also excluded
    # explicitly so it cannot leak into X when it is not the first column.
    feature_columns = [c for c in df.columns[1:] if c != target]
    X = df.loc[:, feature_columns]
    y = df[target]
    return X, y

# Load each split as an (X, y) pair, in the same order as before.
# NOTE(review): the recorded output of this cell reports that the table
# 'fulltrain' does not exist, which stops the cell before train/dev are read.

# Held-out test split.
query = "select * from test"
Xtest, ytest = readXy(query)

# Full training split.
query = "select * from fulltrain"
Xftrain, yftrain = readXy(query)

# Training split used to fit the models below.
query = "select * from train"
Xtrain, ytrain = readXy(query)

# Development split used to compare the models below.
query = "select * from dev"
Xdev, ydev = readXy(query)

DatabaseError: Execution failed on sql 'select * from fulltrain': no such table: fulltrain

## Applying Lasso and Ridge Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# The L1 (lasso) penalty needs a solver that supports it. scikit-learn's
# current default solver ('lbfgs') rejects penalty='l1', so 'liblinear' is
# requested explicitly.
modellasso = LogisticRegression(penalty='l1', solver='liblinear', random_state=10)
modellasso.fit(Xtrain, ytrain)

In [None]:
# Accuracy of the L1 logistic regression on the train and dev splits.
lasso_train_acc = modellasso.score(Xtrain, ytrain)
lasso_dev_acc = modellasso.score(Xdev, ydev)
print("Accuracy - Train: %s, Dev: %s" % (lasso_train_acc, lasso_dev_acc))

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the L2 (ridge) penalty, fitted on the train split.
modelridge = LogisticRegression(
    penalty='l2',
    random_state=10,
)
modelridge.fit(Xtrain, ytrain)

In [None]:
# Accuracy of the L2 logistic regression on the train and dev splits.
ridge_train_acc = modelridge.score(Xtrain, ytrain)
ridge_dev_acc = modelridge.score(Xdev, ydev)
print("Accuracy - Train: %s, Dev: %s" % (ridge_train_acc, ridge_dev_acc))

## Applying Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): `rfc` is configured here but is not fitted or used by any
# visible cell. It is kept in case other code refers to it; decide whether
# these hyper-parameters were meant for `rfmod`.
rfc = RandomForestClassifier(random_state=10, n_estimators=100, max_depth=2)

# The forest that is evaluated below keeps the library's default
# hyper-parameters, as before, but is now seeded so that its accuracy and ROC
# curve are reproducible from run to run.
rfmod = RandomForestClassifier(random_state=10).fit(Xtrain, ytrain)


In [None]:
# Accuracy of the random forest on the train and dev splits.
rf_train_acc = rfmod.score(Xtrain, ytrain)
rf_dev_acc = rfmod.score(Xdev, ydev)
print("Accuracy - Train: %s, Dev: %s" % (rf_train_acc, rf_dev_acc))

## Applying XGBoost

In [None]:
import xgboost as xgb

classifier = xgb.XGBClassifier(random_state=100)

# Train split: work on a copy and cast every categorical column, and the
# label, to int before fitting.
Xtrainnum = Xtrain.copy()
columnstoconvert = Xtrainnum.select_dtypes(include='category').columns
Xtrainnum[columnstoconvert] = Xtrainnum[columnstoconvert].astype(int)
ytrainnum = ytrain.astype(int)

xgbmod = classifier.fit(Xtrainnum, ytrainnum)

# Dev split: same conversion, based on the categorical columns found in Xdev.
Xdevnum = Xdev.copy()
columnstoconvert = Xdevnum.select_dtypes(include='category').columns
Xdevnum[columnstoconvert] = Xdevnum[columnstoconvert].astype(int)
ydevnum = ydev.astype(int)

ypred = xgbmod.predict(Xdevnum)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the xgboost model on the integer-cast train and dev splits.
xgb_train_acc = accuracy_score(ytrainnum, xgbmod.predict(Xtrainnum))
xgb_dev_acc = accuracy_score(ydevnum, ypred)
print("Accuracy - Train: %s, Dev: %s" % (xgb_train_acc, xgb_dev_acc))

## Applying SVM Classifier

In [None]:
from sklearn.svm import SVC

# Support-vector classifier fitted on the train split. probability=True is
# set because the ROC comparison later calls svmmod.predict_proba.
svmmod = SVC(
    C=0.01,
    gamma='auto',
    probability=True,
    random_state=100,
).fit(Xtrain, ytrain)

In [None]:
# Accuracy of the SVM on the train and dev splits.
svm_train_acc = svmmod.score(Xtrain, ytrain)
svm_dev_acc = svmmod.score(Xdev, ydev)
print("Accuracy - Train: %s, Dev: %s" % (svm_train_acc, svm_dev_acc))

## Applying AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with library defaults apart from the seed, fitted on the train split.
adamod = AdaBoostClassifier(
    random_state=1000,
).fit(Xtrain, ytrain)


In [None]:
# Accuracy of AdaBoost on the train and dev splits.
ada_train_acc = adamod.score(Xtrain, ytrain)
ada_dev_acc = adamod.score(Xdev, ydev)
print("Accuracy - Train: %s, Dev: %s" % (ada_train_acc, ada_dev_acc))

## Applying GradientBoosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with library defaults apart from the seed, fitted on the
# train split.
gbmod = GradientBoostingClassifier(
    random_state=1000,
).fit(Xtrain, ytrain)


In [None]:
# Accuracy of gradient boosting on the train and dev splits.
gb_train_acc = gbmod.score(Xtrain, ytrain)
gb_dev_acc = gbmod.score(Xdev, ydev)
print("Accuracy - Train: %s, Dev: %s" % (gb_train_acc, gb_dev_acc))

from sklearn.metrics import roc_curve, auc

# ROC points on the dev split for every fitted model. Column 1 of
# predict_proba is used as the positive-class score. Each model is scored on
# the same kind of frame it was fitted on: Xdev for the scikit-learn models
# (fitted on Xtrain) and Xdevnum for xgboost (fitted on Xtrainnum).
# `thresholds` is overwritten by each call and only holds the last model's.

# logistic regression, L1 penalty
pred1 = modellasso.predict_proba(Xdev)
fpr1, tpr1, thresholds = roc_curve(ydev.values, pred1[:,1],pos_label='1')

# logistic regression, L2 penalty
pred2 = modelridge.predict_proba(Xdev)
fpr2, tpr2, thresholds = roc_curve(ydev.values, pred2[:,1],pos_label='1')

#random forest
pred3=rfmod.predict_proba(Xdev)
fpr3, tpr3, thresholds = roc_curve(ydev.values, pred3[:,1],pos_label='1')

#xgb (integer-cast features and labels, hence the integer pos_label)
pred4=xgbmod.predict_proba(Xdevnum)
fpr4, tpr4, thresholds = roc_curve(ydevnum.values, pred4[:,1],pos_label=1)

#SVM
pred5=svmmod.predict_proba(Xdev)
fpr5, tpr5, thresholds = roc_curve(ydev.values, pred5[:,1],pos_label='1')

#adaboost
pred6=adamod.predict_proba(Xdev)
fpr6, tpr6, thresholds = roc_curve(ydev.values, pred6[:,1],pos_label='1')

#gradient boosting
pred7=gbmod.predict_proba(Xdev)
fpr7, tpr7, thresholds = roc_curve(ydev.values, pred7[:,1],pos_label='1')


import matplotlib.pyplot as plt

# Area under each ROC curve (1-2: logistic L1/L2, 3: random forest, 4: xgb,
# 5: SVM, 6: adaboost, 7: gradient boosting).
roc_auc1 = auc(fpr1, tpr1)
roc_auc2 = auc(fpr2, tpr2)
roc_auc3 = auc(fpr3, tpr3)
roc_auc4 = auc(fpr4, tpr4)
roc_auc5 = auc(fpr5, tpr5)
roc_auc6 = auc(fpr6, tpr6)
roc_auc7 = auc(fpr7, tpr7)

# Diagonal reference line (TPR == FPR).
x=np.linspace(0,1)
y=x

# Print all seven AUCs; previously only the first four were shown.
print(roc_auc1, roc_auc2, roc_auc3, roc_auc4, roc_auc5, roc_auc6, roc_auc7)

fig, ax = plt.subplots(figsize=(8, 8))

ax.set_title("ROC curve - model comparison", fontsize='xx-large')
ax.set_xlabel("FPR", fontsize='x-large')
ax.set_ylabel("TPR", rotation=0, fontsize='x-large')
ax.plot(x, y, linestyle='--')

# (fpr, tpr, line width, legend label) in the original drawing order; the
# random forest curve is drawn thicker than the others.
roc_series = [
    (fpr3, tpr3, 2, 'ROC curve Random Forest (AUC = %0.3f)' % roc_auc3),
    (fpr7, tpr7, 1, 'ROC curve gradient boosting (AUC = %0.3f)' % roc_auc7),
    (fpr4, tpr4, 1, 'ROC curve Xgb (AUC = %0.3f)' % roc_auc4),
    (fpr6, tpr6, 1, 'ROC curve adaboost (AUC = %0.3f)' % roc_auc6),
    (fpr1, tpr1, 1, 'ROC curve Logistic Regression l1 (AUC = %0.3f)' % roc_auc1),
    (fpr2, tpr2, 1, 'ROC curve Logistic Regression l2 (AUC = %0.3f)' % roc_auc2),
    (fpr5, tpr5, 1, 'ROC curve SVM Classifier (AUC = %0.3f)' % roc_auc5),
]
for series_fpr, series_tpr, series_width, series_label in roc_series:
    ax.plot(series_fpr, series_tpr, lw=series_width, label=series_label)

ax.legend(loc='best', fontsize='x-large', framealpha=1, fancybox=True,
          edgecolor='Black', facecolor='White')
plt.show()