# Chapter 5 

In [None]:
# Listing 1-1

%matplotlib inline

import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt
from IPython.display import Image
from matplotlib.pylab import rcParams

from sklearn import metrics
from sklearn.cross_validation import train_test_split

from sklearn.decomposition import PCA
from sklearn import kernel_approximation
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.kernel_approximation import (RBFSampler,Nystroem)
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

# Default size (width, height in inches) for every figure in this notebook.
rcParams.update({'figure.figsize': (15, 5)})

In [None]:
# Listing 1-2

# Read the appointment records into a DataFrame and preview the first rows.
data = pd.read_csv(filepath_or_buffer='examples/No-show-Issue-Comma-300k.csv')
data.head(n=5)

In [None]:
# Listing 1-3

# Number of rows loaded. Function-call form of print works on Python 2 and 3.
print(len(data))

In [None]:
# Listing 1-4

# Print each column name (padded to 25 characters) with its number of
# distinct values. print(...) with one argument behaves the same on
# Python 2 and Python 3.
for column in data.columns:
    print("{0:25} {1}".format(column, data[column].nunique()))

In [None]:
# Listing 1-5

def features_plots(discrete_vars):
    """Plot the distribution of every feature of the global ``data`` frame.

    Histograms of 'Age' and 'AwaitingTime' are drawn first (one bin per
    distinct value), followed by one bar chart of value counts for each
    column named in ``discrete_vars``. All plots share a two-column grid.

    Parameters
    ----------
    discrete_vars : sequence of str
        Column names of ``data`` to draw as value-count bar charts.

    Notes
    -----
    The grid has at least 7 rows (the original fixed layout) and grows
    when more plots are requested, instead of failing once the 14 slots
    of a 7x2 grid are exhausted. Figure height scales at 3.5 inches per
    row, which reproduces the original 15 x 24.5 figure for 7 rows.
    """
    continuous_vars = ['Age', 'AwaitingTime']
    n_cols = 2
    n_plots = len(continuous_vars) + len(discrete_vars)
    # Ceiling division without importing math; never fewer than 7 rows so
    # existing calls render exactly as before.
    n_rows = max(7, (n_plots + n_cols - 1) // n_cols)

    plt.figure(figsize=(15, 3.5 * n_rows))

    for i, cv in enumerate(continuous_vars):
        plt.subplot(n_rows, n_cols, i + 1)
        plt.hist(data[cv], bins=len(data[cv].unique()))
        plt.title(cv)
        plt.ylabel('Frequency')

    # Discrete plots continue right after the continuous ones.
    first_discrete = len(continuous_vars) + 1
    for i, dv in enumerate(discrete_vars):
        plt.subplot(n_rows, n_cols, i + first_discrete)
        data[dv].value_counts().plot(kind='bar', title=dv)
        plt.ylabel('Frequency')

In [None]:
# Listing 1-6

# Columns drawn as value-count bar charts by features_plots.
discrete_vars = [
    'Gender', 'DayOfTheWeek', 'Status', 'Diabetes',
    'Alcoolism', 'HiperTension', 'Handcap', 'Smokes',
    'Scholarship', 'Tuberculosis', 'Sms_Reminder',
]

features_plots(discrete_vars=discrete_vars)

In [None]:
# Listing 1-7

# Number of rows whose Age is negative.
(data['Age'] < 0).sum()

In [None]:
# Listing 1-8

# Keep only rows with a non-negative Age. The explicit copy makes the result
# an independent frame, so the column assignments in later cells do not
# raise pandas' SettingWithCopyWarning.
data = data[data['Age'] >= 0].copy()

In [None]:
# Listing 1-9

# Remove the 'Handcap' column. The membership check lets the cell be re-run
# without a KeyError once the column is already gone.
if 'Handcap' in data.columns:
    del data['Handcap']

In [None]:
# Listing 1-10

# Replace AwaitingTime with its absolute value, using the vectorised
# Series.abs() rather than a per-element Python lambda.
data['AwaitingTime'] = data['AwaitingTime'].abs()

In [None]:
# Listing 1-11

# Encode weekday names as integers, Monday = 0 through Sunday = 6.
# Values missing from the mapping become NaN (Series.map behaviour).
weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dow_mapping = {day: number for number, day in enumerate(weekdays)}
data['DayOfTheWeek'] = data['DayOfTheWeek'].map(dow_mapping)

In [None]:
# Listing 1-12

# Replace the 'Gender' and 'Status' values with their integer category codes.
# pd.Categorical(...) is the supported constructor; Categorical.from_array
# was deprecated and later removed from pandas.
for field in ['Gender', 'Status']:
    data[field] = pd.Categorical(data[field]).codes

In [None]:
# Listing 1-13

# Same list as before minus 'Handcap', which was deleted from the frame.
discrete_vars = [
    'Gender', 'DayOfTheWeek', 'Status', 'Diabetes',
    'Alcoolism', 'HiperTension', 'Smokes',
    'Scholarship', 'Tuberculosis', 'Sms_Reminder',
]

features_plots(discrete_vars=discrete_vars)

In [None]:
# Listing 1-14

# Scatter of Age against AwaitingTime, both axes clipped to [0, 120].
age_values = data['Age']
awaiting_values = data['AwaitingTime']
plt.scatter(age_values, awaiting_values, s=0.5)
plt.xlabel('Age')
plt.ylabel('Awaiting Time')
plt.title('Scatter plot of Age and Awaiting Time')
plt.xlim(0, 120)
plt.ylim(0, 120)

In [None]:
# Listing 1-15

# Display settings for the printed table. The precision option is addressed
# by its full key: the bare name 'precision' matches more than one option in
# newer pandas releases and is rejected there.
pd.set_option('display.width', 100)
pd.set_option('display.precision', 3)

# Pearson correlation matrix between Age and AwaitingTime.
correlations = data[['Age', 'AwaitingTime']].corr(method='pearson')
print(correlations)

In [None]:
# Listing 1-16

# Row counts per (Sms_Reminder, Status) pair, pivoted so that each Status code
# becomes a column; combinations that never occur are filled with 0.
data_dow_status = (
    data.groupby(['Sms_Reminder', 'Status'])['Sms_Reminder']
        .count()
        .unstack('Status')
        .fillna(0)
)

# Stacked bars: one bar per reminder count, split by Status code 0 and 1.
data_dow_status[[0, 1]].plot(kind='bar', stacked=True)
plt.xlabel('Number of SMS reminders')
plt.title('Frequency of people showing up and not showing up by number of SMS reminders sent')
plt.ylabel('Frequency')

In [None]:
# Listing 1-17

# Row counts per (DayOfTheWeek, Status) pair, pivoted so that each Status code
# becomes a column; combinations that never occur are filled with 0.
data_dow_status = (
    data.groupby(['DayOfTheWeek', 'Status'])['DayOfTheWeek']
        .count()
        .unstack('Status')
        .fillna(0)
)

# Stacked bars: one bar per weekday code, split by Status code 0 and 1.
data_dow_status[[0, 1]].plot(kind='bar', stacked=True)
plt.xlabel('Day of the week')
plt.title('Frequency of people showing up and not showing up by Day of the week')
plt.ylabel('Frequency')

In [None]:
# Listing 1-18

# Box plot of Age, one box per Status code.
data.boxplot(by='Status', column=['Age'], return_type='axes')
plt.show()

In [None]:
# Listing 1-19

# Two side-by-side panels (Status code 0, then 1). Each panel draws one line
# per Gender code showing how many patients there are at every age.
plt.figure(figsize=(15, 3.5))

status_labels = ['no show ups', 'show ups']
for status_code, status_label in enumerate(status_labels):

    plt.subplot(1, 2, status_code + 1)
    patients = data[data['Status'] == status_code]

    for gender_code in (0, 1):
        ages = patients[patients['Gender'] == gender_code]['Age']
        # Sort by age so the line runs left to right along the x axis.
        ages.value_counts().sort_index().plot()

    plt.title('Age wise frequency of patient %s for both genders' % status_label)
    plt.xlabel('Age')
    plt.ylabel('Frequency')
    # Legend order follows the gender loop: code 0 first, then code 1.
    plt.legend(['Female', 'Male'], loc='upper left')

In [None]:
# Listing 1-20

# Box plot of AwaitingTime, one box per Status code.
data.boxplot(by='Status', column=['AwaitingTime'], return_type='axes')
plt.show()

In [None]:
# Listing 1-21

# Split both timestamp columns into integer <col>_year, <col>_month and
# <col>_day columns. Only the text before 'T' is used, split on '-'.
# NOTE(review): presumably the values look like 'YYYY-MM-DDThh:mm:ssZ' - confirm.
for col in ['AppointmentRegistration', 'ApointmentData']:
    date_parts = data[col].apply(lambda stamp: stamp.split('T')[0].split('-'))
    for index, component in enumerate(['year', 'month', 'day']):
        new_column = '%s_%s' % (col, component)
        data[new_column] = date_parts.apply(lambda parts: int(parts[index]))

In [None]:
# Listing 1-22

# Add integer AppointmentRegistration_hour / _min / _sec columns. The text
# after 'T' is taken, its final character dropped, and the rest split on ':'.
time_parts = data['AppointmentRegistration'].apply(
    lambda stamp: stamp.split('T')[1][:-1].split(':'))

for index, component in enumerate(['hour', 'min', 'sec']):
    new_column = '%s_%s' % ('AppointmentRegistration', component)
    data[new_column] = time_parts.apply(lambda parts: int(parts[index]))

In [None]:
# Listing 1-23

# Preview the frame, now including the derived date and time columns.
data.head(n=5)

In [None]:
# Listing 1-24

def model_performance(model_name, X_train, y_train, y_test, Y_pred, model=None):
    """Print evaluation metrics for a fitted classifier and plot its ROC curve.

    Parameters
    ----------
    model_name : str
        Label printed at the top of the report.
    X_train, y_train
        Training features and labels, used only for the train accuracy.
    y_test
        True labels of the test set.
    Y_pred
        Predictions for the test set. The same values feed the accuracy,
        ROC and precision-recall computations.
    model : fitted estimator, optional
        Estimator whose ``score`` method gives the train accuracy. When
        omitted, the notebook-global ``clf`` is used so that existing
        five-argument calls keep working.

    Notes
    -----
    Prints accuracy, ROC AUC, train accuracy and the area under the
    precision-recall curve, then shows the ROC plot. Returns nothing.
    """
    if model is None:
        # Backward compatibility: earlier callers relied on the global `clf`.
        model = clf

    print('Model name: %s' % model_name)
    print('Test accuracy (Accuracy Score): %f' % metrics.accuracy_score(y_test, Y_pred))
    print('Test accuracy (ROC AUC Score): %f' % metrics.roc_auc_score(y_test, Y_pred))
    print('Train accuracy: %f' % model.score(X_train, y_train))

    # precision_recall_curve returns (precision, recall, thresholds). The area
    # under the PR curve integrates precision over recall, so recall is x.
    precision, recall, _ = metrics.precision_recall_curve(y_test, Y_pred)
    print('Area Under the Precision-Recall Curve: %f' % metrics.auc(recall, precision))

    false_positive_rate, true_positive_rate, _ = metrics.roc_curve(y_test, Y_pred)
    roc_auc = metrics.auc(false_positive_rate, true_positive_rate)

    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate, true_positive_rate, 'b',
             label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    # Dashed diagonal from (0, 0) to (1, 1) for visual reference.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

In [None]:
# Listing 1-25

# Model inputs, in the column order used for the feature matrix.
features_of_choice = [
    u'Age',
    u'Gender',
    'DayOfTheWeek',
    'Diabetes',
    'Alcoolism',
    'HiperTension',
    'Smokes',
    'Scholarship',
    'Tuberculosis',
    'Sms_Reminder',
    'AwaitingTime',
    'AppointmentRegistration_year',
    'AppointmentRegistration_month',
    'AppointmentRegistration_day',
    'ApointmentData_year',
    'ApointmentData_month',
    'ApointmentData_day',
    'AppointmentRegistration_hour',
    'AppointmentRegistration_min',
    'AppointmentRegistration_sec',
]

# Feature matrix and target vector ('Status') as NumPy arrays.
x = np.array(data[features_of_choice])
y = np.array(data['Status'])

# Hold out 30% of the rows for testing; the split is seeded.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1)

In [None]:
# Listing 1-26

# Fit a decision tree on the training split. The seed makes the fitted tree
# reproducible across runs, consistent with the seeded train/test split.
clf = DecisionTreeClassifier(random_state=1)
clf.fit(x_train, y_train)

In [None]:
# Listing 1-27

# Predict on the held-out rows and report the decision tree's metrics.
y_pred = clf.predict(x_test)
model_performance(model_name='Decision tree classifier',
                  X_train=x_train, y_train=y_train,
                  y_test=y_test, Y_pred=y_pred)

In [None]:
# Listing 1-28

# Map the training features through a seeded RBF kernel approximation.
rbf_feature = kernel_approximation.RBFSampler(gamma=1, random_state=1)
X_train = rbf_feature.fit_transform(x_train)

# Linear classifier trained by SGD on the transformed features. Seeded so
# that repeated runs give the same model, like the sampler above.
clf = SGDClassifier(random_state=1)
clf.fit(X_train, y_train)

In [None]:
# Listing 1-29

# Apply the sampler that was fitted on the training data. Only transform()
# is called here: refitting on the test set would replace the fitted
# transformer instead of reusing it.
X_test = rbf_feature.transform(x_test)
Y_pred = clf.predict(X_test)
model_performance('Kernel approximation', X_train, y_train, y_test, Y_pred)

In [None]:
# Listing 1-30

# Fit a random forest on the training split. Seeded so the ensemble, and
# therefore the reported metrics, are reproducible across runs.
clf = RandomForestClassifier(random_state=1)
clf.fit(x_train, y_train)

In [None]:
# Listing 1-31

# Predict on the held-out rows and report the random forest's metrics.
y_pred = clf.predict(x_test)
model_performance(model_name='Random Forest',
                  X_train=x_train, y_train=y_train,
                  y_test=y_test, Y_pred=y_pred)

In [None]:
# Listing 1-32

# Seeded gradient boosting model; fit on the training split, then predict
# labels for the held-out rows.
clf = GradientBoostingClassifier(
    random_state=10,
    learning_rate=0.1,
    n_estimators=200,
    max_depth=5,
    max_features=10,
)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

In [None]:
# Listing 1-33

# Report the gradient boosting model's metrics.
model_performance(model_name='Gradient Boosting',
                  X_train=x_train, y_train=y_train,
                  y_test=y_test, Y_pred=y_pred)