# Evaluation and Validation Lectures

In [11]:
import numpy as np
import pandas as pd

## Loading data and some basic data management

In [2]:
# Read the Titanic records from the CSV file in the working directory
X = pd.read_csv('titanic_data.csv')
# Keep only the numeric columns
X = X._get_numeric_data()
# Pull the label column out of the inputs; pop() returns it and removes it from X
y = X.pop('Survived')
# Age is dropped from the inputs as well, due to missing data
del X['Age']





## Importing proper sklearn modules and splitting our data into training and test sets

In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

# Split the data into training and testing sets, using the standard settings
# for train_test_split. The classifiers below are then trained and tested on
# the split data instead of on the full X and y.
# No random_state is passed, so the partition (and every score computed from
# it) changes from run to run.

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Try the current
# location first and fall back so the cell runs on either version.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)


### 1. Accuracy when predicting on the training data itself

In [6]:
# The decision tree classifier
clf1 = DecisionTreeClassifier()
clf1.fit(X,y)
print "Decision Tree has accuracy: ",accuracy_score(y, clf1.predict(X))
# The naive Bayes classifier

clf2 = GaussianNB()
clf2.fit(X,y)
print "GaussianNB has accuracy: ",accuracy_score(y, clf2.predict(X))

answer = { 
 "Naive Bayes Score": 0, 
 "Decision Tree Score": 0
}

 Decision Tree has accuracy:  1.0
GaussianNB has accuracy:  0.677890011223


### 2. Accuracy of training set models on test set predictions

In [7]:
# The decision tree classifier
clf1 = DecisionTreeClassifier()
clf1.fit(X_train,y_train)
print "Decision Tree has accuracy: ",accuracy_score(y_test, clf1.predict(X_test))
# The naive Bayes classifier

clf2 = GaussianNB()
clf2.fit(X_train,y_train)
print "GaussianNB has accuracy: ",accuracy_score(y_test, clf2.predict(X))

answer = { 
 "Naive Bayes Score": 0, 
 "Decision Tree Score": 0
}

Decision Tree has accuracy:  0.647058823529
GaussianNB has accuracy:  0.672268907563


### 3. Confusion matrix of training set models on test set predictions

In [10]:
from sklearn.metrics import confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(X, y)

# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.

clf1 = DecisionTreeClassifier()
clf1.fit(X_train,y_train)
dt_cm = confusion_matrix(y_test,clf1.predict(X_test))
print "Confusion matrix for this Decision Tree:\n",dt_cm

clf2 = GaussianNB()
clf2.fit(X_train,y_train)
nb_cm = confusion_matrix(y_test,clf1.predict(X_test))
print "GaussianNB confusion matrix:\n",nb_cm 

#TODO: store the confusion matrices on the test sets below

confusions = {
 "Naive Bayes": nb_cm ,
 "Decision Tree": dt_cm
}

Confusion matrix for this Decision Tree:
[[163  58]
 [ 66  70]]
GaussianNB confusion matrix:
[[163  58]
 [ 66  70]]


### 4. Precision and Recall of training set models on test set predictions

In [21]:
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision

clf1 = DecisionTreeClassifier()
clf1.fit(X_train, y_train)
clf1_recall = recall(y_test,clf1.predict(X_test))
clf1_precision = precision(y_test,clf1.predict(X_test))
print "Decision Tree recall: {:.2f} and precision: {:.2f}".format(clf1_recall,clf1_precision)

clf2 = GaussianNB()
clf2.fit(X_train, y_train)
clf2_recall = recall(y_test,clf2.predict(X_test))
clf2_precision = precision(y_test,clf2.predict(X_test))
print "GaussianNB recall: {:.2f} and precision: {:.2f}".format(clf2_recall,clf2_precision)

results = {
  "Naive Bayes Recall": clf2_recall,
  "Naive Bayes Precision": clf2_precision,
  "Decision Tree Recall": clf1_recall,
  "Decision Tree Precision": clf1_recall
}

Decision Tree recall: 0.49 and precision: 0.56
GaussianNB recall: 0.38 and precision: 0.61


### 5. F1-score of training set models on test set predictions

In [24]:
from sklearn.metrics import f1_score

clf1 = DecisionTreeClassifier()
clf1.fit(X_train, y_train)
clf1_f1 = f1_score(y_test, clf1.predict(X_test))
print "Decision Tree F1 score: {:.2f}".format(clf1_f1)

clf2 = GaussianNB()
clf2.fit(X_train, y_train)
clf2_f1 = f1_score(y_test, clf2.predict(X_test))
print "GaussianNB F1 score: {:.2f}".format(clf2_f1)

F1_scores = {
 "Naive Bayes": clf2_f1,
 "Decision Tree": clf2_f1
}


Decision Tree F1 score: 0.48
GaussianNB F1 score: 0.45


### 6. New imported dataset from sklearn for studying errors in linear regression. Finding the mean absolute error of predictions using a train-test split.

In [25]:
# Load the dataset
from sklearn.datasets import load_linnerud

# NOTE: X and y are rebound here, replacing the Titanic inputs and labels
# used by the earlier cells.
linnerud_data = load_linnerud()
X = linnerud_data.data
y = linnerud_data.target

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error as mae
from sklearn.linear_model import LinearRegression

# Split the data into training and testing sets, using the standard settings
# for train_test_split, then train and test the regressors on the split data
# instead of on X and y.
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Try the current
# location first and fall back so the cell runs on either version.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)


# Decision tree regressor: fit on the training split, score on the test split.
reg1 = DecisionTreeRegressor()
reg1.fit(X_train, y_train)
dt_mae = mae(y_test,reg1.predict(X_test))
print "Decision Tree mean absolute error: {:.2f}".format(dt_mae)

# Linear regression: same procedure.
reg2 = LinearRegression()
reg2.fit(X_train, y_train)
lr_mae= mae(y_test,reg2.predict(X_test))
print "Linear regression mean absolute error: {:.2f}".format(lr_mae)

# NOTE: this rebinds `results`, replacing the precision/recall dict from the
# earlier classification cell.
results = {
 "Linear Regression": lr_mae,
 "Decision Tree": dt_mae
}

Decision Tree mean absolute error: 13.93
Linear regression mean absolute error: 7.79


### 7. Mean squared error

In [26]:
from sklearn.metrics import mean_squared_error as mse

reg1 = DecisionTreeRegressor()
reg1.fit(X_train, y_train)
dt_mse = mse(y_test, reg1.predict(X_test))
print "Decision Tree mean absolute error: {:.2f}".format(dt_mse)

reg2 = LinearRegression()
reg2.fit(X_train, y_train)
lr_mse = mse(y_test, reg2.predict(X_test))
print "Linear regression mean absolute error: {:.2f}".format(lr_mse)

results = {
 "Linear Regression": lr_mse,
 "Decision Tree": dt_mse
}

Decision Tree mean absolute error: 335.13
Linear regression mean absolute error: 160.63
