In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from sklearn import svm
from sklearn import linear_model, datasets 
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import math
import operator 
from collections import Counter

#### Importing the Titanic dataset

In [None]:
# Read the passenger table. header=None tells pandas the file has no header
# row, so the columns are numbered 0..N-1.
# NOTE(review): assumes train.csv is a header-less, already-numeric export — confirm.
data = pd.read_csv("train.csv", header=None)

## Problem 1 (Logistic Regression, Default)

In [None]:
# Pick the target (column 1) and the class, gender, age features
# (columns 2, 4, 5) by column position.
# `.iloc` is used instead of `.ix`: `.ix` was deprecated in pandas 0.20 and
# removed in pandas 1.0, so the old spelling fails on current releases.
y = data.iloc[:, 1].values
X = data.iloc[:, [2, 4, 5]].values

In [None]:
# Hold out 30% of the rows, then cut that hold-out in half to obtain the
# test and dev partitions: 70 / 15 / 15 overall.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.30, random_state=42
)
X_test, X_dev, y_test, y_dev = train_test_split(
    X_holdout, y_holdout, test_size=0.50, random_state=42
)

In [None]:
# Train a default logistic regression on the training split, then report
# accuracy followed by binary F1 for its predictions on the dev split.
model = LogisticRegression().fit(X_train, y_train)
IsDeadOrNah = model.predict(X_dev)
dev_accuracy = accuracy_score(y_dev, IsDeadOrNah)
print(dev_accuracy)
dev_f1 = f1_score(y_dev, IsDeadOrNah, average="binary")
print(dev_f1)

0.792307692308
0.732673267327


## Problem 2 (Logistic Regression, Tweaked)

In [None]:
# Pick the target (column 1) and the class, gender, age features
# (columns 2, 4, 5) by column position.
# `.iloc` is used instead of `.ix`: `.ix` was deprecated in pandas 0.20 and
# removed in pandas 1.0, so the old spelling fails on current releases.
y = data.iloc[:, 1].values
X = data.iloc[:, [2, 4, 5]].values

In [None]:
# Hold out 30% of the rows, then cut that hold-out in half to obtain the
# test and dev partitions: 70 / 15 / 15 overall.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.30, random_state=42
)
X_test, X_dev, y_test, y_dev = train_test_split(
    X_holdout, y_holdout, test_size=0.50, random_state=42
)

In [None]:
# Fit an L1-regularised, class-balanced logistic regression on the training
# split and print the accuracy and binary F1 score of its dev-set predictions.
# solver='liblinear' is spelled out because the L1 penalty is not supported by
# 'lbfgs', which newer scikit-learn releases use as the default solver; with
# the solver left implicit this cell raises on those releases.
model = LogisticRegression(
    C=0.5,
    penalty='l1',
    class_weight='balanced',
    solver='liblinear',
)
model.fit(X_train, y_train)
IsDeadOrNah = model.predict(X_dev)
print(accuracy_score(y_dev, IsDeadOrNah))
print(f1_score(y_dev, IsDeadOrNah, average='binary'))

0.761538461538
0.730434782609


## Problem 3 (kNN)

In [None]:
# Pick the target (column 1) and the class, gender, age features
# (columns 2, 4, 5) by column position.
# The label column is also appended as the LAST column of X so every row
# carries its own class; the kNN distance skips that last column and the
# voting step reads it.
# `.iloc` is used instead of `.ix`: `.ix` was deprecated in pandas 0.20 and
# removed in pandas 1.0, so the old spelling fails on current releases.
y = data.iloc[:, 1].values
X = data.iloc[:, [2, 4, 5, 1]].values

In [None]:
# Hold out 30% of the rows, then cut that hold-out in half to obtain the
# test and dev partitions: 70 / 15 / 15 overall.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.30, random_state=42
)
X_test, X_dev, y_test, y_dev = train_test_split(
    X_holdout, y_holdout, test_size=0.50, random_state=42
)

In [None]:
def Distance(point1, point2, length):
    """Return the Euclidean distance between two points.

    Only the first ``length`` coordinates are compared, which lets callers
    leave trailing elements (such as an appended class label) out of the
    measurement.

    The previous version took the square root of each squared difference
    inside the loop, which is just the absolute difference, so it summed
    absolute differences instead of computing a Euclidean distance. The
    square root is now taken once, over the sum of squares.

    Args:
        point1: Indexable sequence of numbers with at least ``length`` items.
        point2: Indexable sequence of numbers with at least ``length`` items.
        length: Number of leading coordinates to include.

    Returns:
        The distance as a float (0.0 when ``length`` is 0).
    """
    squared_sum = 0
    for i in range(length):
        diff = point1[i] - point2[i]
        squared_sum += diff * diff
    return math.sqrt(squared_sum)

def kNN(training, test, k):
    """Return the ``k`` rows of ``training`` that lie closest to ``test``.

    Distances come from ``Distance`` and are measured over every element of
    ``test`` except the last one (``len(test) - 1`` coordinates). Rows are
    ranked by ascending distance; rows at equal distance keep their original
    relative order.

    Args:
        training: Sequence of rows to search.
        test: The query row.
        k: How many neighbours to return.

    Returns:
        A list with the ``k`` nearest rows, nearest first.

    Raises:
        IndexError: if ``k`` is larger than the number of rows in ``training``.
    """
    n_coords = len(test) - 1
    scored = [(row, Distance(test, row, n_coords)) for row in training]
    ranked = sorted(scored, key=lambda pair: pair[1])
    return [ranked[i][0] for i in range(k)]

def Vote(neighbors):
    """Return the most frequent class label among ``neighbors``.

    The label of each neighbour is read from its last element. When several
    labels share the highest count, the one that appeared first in
    ``neighbors`` is returned.

    Args:
        neighbors: Non-empty sequence of rows whose last element is the label.

    Returns:
        The winning label.

    Raises:
        IndexError: if ``neighbors`` is empty.
    """
    tally = Counter(row[-1] for row in neighbors)
    return tally.most_common(1)[0][0]

def getAccuracy(testSet, predictions):
    """Return the fraction of rows whose label matches the prediction.

    The true label of each row is its last element; ``predictions[i]`` is
    compared against the label of ``testSet[i]``.

    Args:
        testSet: Sequence of rows, each ending with its true label.
        predictions: Predicted labels, one per row of ``testSet``.

    Returns:
        Accuracy as a float between 0.0 and 1.0.

    Raises:
        ZeroDivisionError: if ``testSet`` is empty.
    """
    total = len(testSet)
    hits = sum(1 for i in range(total) if testSet[i][-1] == predictions[i])
    return hits / float(total)


In [None]:
# Sweep the odd neighbourhood sizes k = 1, 3, ..., 39. For each one, classify
# every dev row against the training split and record the dev-set accuracy.
kResult = []
for n in range(1, 40, 2):
    predictions = [Vote(kNN(X_train, X_dev[x], n)) for x in range(len(X_dev))]
    # The dev rows carry their true label as the last element, which is
    # what getAccuracy compares the predictions against.
    acc = getAccuracy(X_dev, predictions)
    kResult.append([n, acc])

# Plot accuracy against k to see where performance peaks.
kResult = pd.DataFrame(kResult, columns=["n", "acc"])
plt.plot(kResult.n, kResult.acc)
plt.title("Accuracy with K")
plt.show()

## Problem 4 (kNN vs Logistic Regression)

In [None]:
# Pick the target (column 1) and the class, gender, age features
# (columns 2, 4, 5) by column position.
# `.iloc` is used instead of `.ix`: `.ix` was deprecated in pandas 0.20 and
# removed in pandas 1.0, so the old spelling fails on current releases.
y = data.iloc[:, 1].values
X = data.iloc[:, [2, 4, 5]].values

# Splitting the data set 70, 15, 15
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
X_test, X_dev, y_test, y_dev = train_test_split(X_test, y_test, test_size=0.50, random_state=42)

In [None]:
# Refit the default logistic regression on the training split and score it
# on the held-out TEST split (accuracy first, then binary F1).
model = LogisticRegression().fit(X_train, y_train)
IsDeadOrNah = model.predict(X_test)
test_accuracy = accuracy_score(y_test, IsDeadOrNah)
print(test_accuracy)
test_f1 = f1_score(y_test, IsDeadOrNah, average="binary")
print(test_f1)

In [None]:
# Pick the target (column 1) and the class, gender, age features
# (columns 2, 4, 5) by column position. The label column is appended as the
# last column of X because the kNN helpers read each row's class from there.
# `.iloc` is used instead of `.ix`: `.ix` was deprecated in pandas 0.20 and
# removed in pandas 1.0, so the old spelling fails on current releases.
y = data.iloc[:, 1].values
X = data.iloc[:, [2, 4, 5, 1]].values

# Splitting the data set 70, 15, 15
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
X_test, X_dev, y_test, y_dev = train_test_split(X_test, y_test, test_size=0.50, random_state=42)

In [None]:
# Classify every test-set row with kNN and report accuracy and binary F1.
# Neighbours are drawn from the TRAINING split. Searching X_test itself (as
# this cell used to) lets each row find itself at distance 0 and vote with
# its own label, which inflates the reported scores.
# NOTE(review): k = 9 is presumably the value picked from the dev-set sweep — confirm.
k = 9
predictions = []
for x in range(len(X_test)):
    neighbors = kNN(X_train, X_test[x], k)
    predictions.append(Vote(neighbors))

print(accuracy_score(y_test, predictions))
print(f1_score(y_test, predictions, average='binary'))