In [14]:
import numpy as np
import pandas as pd
from collections import Counter

In [15]:
# Read the Titanic train and test splits (paths are relative to the notebook's working directory).
trainSet, testSet = (
    pd.read_csv(csv_path)
    for csv_path in ("titanic/train.csv", "titanic/test.csv")
)

In [16]:
# Peek at the first rows to see the raw columns before any preprocessing.
trainSet.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [17]:
# Per-column count of missing values in the training split.
trainSet.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

From the dataset we first remove the features that we do not use in our classification model. From straightforward observation we judge the features *PassengerId*, *Name*, *SibSp*, *Parch* and *Ticket* to be irrelevant to our model, hence we remove them.
We also remove the *Cabin* feature, as most of its values are null (687 of 891 rows) and we have no reasonable way of assigning a global or local value to it.

In [18]:
# Columns excluded from the model; the same list is removed from both splits.
unused_columns = ['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin']
trainSet = trainSet.drop(columns=unused_columns)
testSet = testSet.drop(columns=unused_columns)

# Show the remaining columns with their dtypes and non-null counts.
trainSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
Fare        891 non-null float64
Embarked    889 non-null object
dtypes: float64(2), int64(2), object(2)
memory usage: 41.8+ KB


Next, we convert the *Sex* attribute into a numerical attribute so that it can be fed into the model. The *Embarked* attribute takes 3 different values as per the dataset description. These are categorical and hence need to be converted to a one-hot encoding.

In [19]:
# Encode Sex as an integer flag (male -> 0, female -> 1) in both splits.
sex_codes = {'male': 0, 'female': 1}
for frame in (trainSet, testSet):
    frame['Sex'] = frame['Sex'].map(sex_codes).astype(int)

In [20]:
# One-hot encode the remaining categorical column (Embarked).
# The indicator dtype is pinned because the default depends on the pandas
# version (newer releases emit bool); mixing bool with float columns turns the
# feature matrix into an object array that the NumPy-based models cannot use.
trainSet = pd.get_dummies(trainSet, dtype=np.uint8)

# Encode the test split, then align it to the training feature columns so both
# matrices have the same columns in the same order even when a category is
# missing from one split. Indicator columns absent from the test split are
# created and filled with 0; columns unknown to the training split are dropped;
# NaNs already present in the data are left untouched for the imputation step.
feature_columns = trainSet.columns.drop('Survived', errors='ignore')
testSet = pd.get_dummies(testSet, dtype=np.uint8).reindex(columns=feature_columns, fill_value=0)

We also need to fill in missing values for the *Age* column. (We replace the NaN values with the mean value of the *Age* column.)

In [21]:
# Impute missing values in the training split with its own per-column means.
trainSet = trainSet.fillna(value=trainSet.mean())
# The test split is filled with the *training* means as well (taken from the
# already-imputed training frame), not with its own statistics.
train_column_means = trainSet.mean()
testSet = testSet.fillna(value=train_column_means)

Next, we split the data into features (X) and target (y) for the training set, and features (X) for the test set.

In [22]:
# Target vector is the Survived column; every other training column is a feature.
y_train = trainSet['Survived']
X_train = trainSet.drop(columns=['Survived'])
# The test frame is used unchanged as the test feature matrix.
X_test = testSet

## Logistic Regression

In [23]:
class LogisticRegression: # we try using an object oriented approach for this similar to what sklearn does
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    alpha : float
        Learning rate (step size of every gradient-descent update).
    num_iter : int
        Number of gradient-descent iterations performed by ``fit``.

    Attributes
    ----------
    theta : numpy.ndarray
        Learned weights with the intercept first. Only set once ``fit`` has run.
    """

    def __init__(self, alpha=0.01, num_iter=100000):
        self.alpha = alpha
        self.num_iter = num_iter

    def add_intercept(self, X):
        """Return ``X`` as a float array with a leading column of ones (bias term)."""
        # Coerce to float so DataFrames and bool/object columns become a plain
        # numeric matrix; otherwise np.exp fails later on object arrays.
        X = np.asarray(X, dtype=float)
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)

    def sigmoid(self, z): # helper function to evaluate sigmoid
        """Numerically stable logistic function ``1 / (1 + exp(-z))``."""
        # exp(-|z|) lies in (0, 1] and therefore never overflows, unlike
        # exp(-z) for large negative z.
        #   z >= 0: 1 / (1 + exp(-z))
        #   z <  0: exp(z) / (1 + exp(z))   (algebraically the same value)
        decay = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0, decay) / (1.0 + decay)

    def predict_prob(self, X):
        """Return the predicted probability of the positive class for each row of ``X``."""
        X = self.add_intercept(X)

        return self.sigmoid(np.dot(X, self.theta))

    def fit(self, X, y): # fits the model to the data
        """Learn ``theta`` from features ``X`` and 0/1 labels ``y``.

        ``y`` is flattened to 1-D so a column vector does not broadcast
        ``h - y`` into an (n, n) matrix.
        """
        X = self.add_intercept(X)
        y = np.asarray(y, dtype=float).ravel()

        # initializing weights
        self.theta = np.zeros(X.shape[1])

        for _ in range(self.num_iter):
            h = self.sigmoid(np.dot(X, self.theta))
            # gradient of the mean log-loss with respect to theta
            gradient = np.dot(X.T, (h - y)) / y.size
            self.theta -= self.alpha * gradient

    def predict(self, X, threshold=0.1): # predicts y using the fit model
        """Print "Survived" / "Not Survived" for each row of ``X``.

        A row is reported as survived when its predicted probability is at
        least ``threshold``. Nothing is returned.

        NOTE(review): the default threshold of 0.1 is well below the
        conventional 0.5; it is kept unchanged for backward compatibility.
        """
        state = self.predict_prob(X) >= threshold
        for survived in state:
            if survived: # for generalisation this can be replaced by 1 and 0
                print("Survived")
            else:
                print("Not Survived")

In [24]:
# Train with the class defaults (alpha=0.01, num_iter=100000 full-batch updates).
lr = LogisticRegression()
lr.fit(X_train,y_train)

In [25]:
# Prints one "Survived"/"Not Survived" line per test row, using predict's default threshold of 0.1.
lr.predict(X_test)

Not Survived
Not Survived
Not Survived
Not Survived
Survived
Not Survived
Survived
Not Survived
Survived
Not Survived
Not Survived
Not Survived
Survived
Not Survived
Survived
Survived
Not Survived
Not Survived
Survived
Survived
Not Survived
Not Survived
Survived
Survived
Survived
Not Survived
Survived
Not Survived
Not Survived
Not Survived
Not Survived
Not Survived
Survived
Survived
Survived
Not Survived
Survived
Survived
Not Survived
Not Survived
Not Survived
Not Survived
Not Survived
Survived
Survived
Not Survived
Not Survived
Not Survived
Survived
Survived
Not Survived
Not Survived
Survived
Survived
Not Survived
Not Survived
Not Survived
Not Survived
Not Survived
Survived
Not Survived
Not Survived
Not Survived
Survived
Survived
Survived
Survived
Not Survived
Survived
Not Survived
Survived
Not Survived
Survived
Survived
Survived
Not Survived
Not Survived
Survived
Not Survived
Survived
Not Survived
Not Survived
Not Survived
Not Survived
Not Survived
Not Survived
Survived
Survived
Surv

## SVM

In [26]:
# Convert the pandas objects to NumPy arrays for the hand-written SVM and kNN code.
# np.asarray (unlike `.values`) also accepts inputs that are already arrays, so
# re-running this cell does not raise AttributeError.
# dtype=float guarantees a numeric feature matrix even if indicator columns are bool.
X_train = np.asarray(X_train, dtype=float)
y_train = np.asarray(y_train)
X_test = np.asarray(X_test, dtype=float)

In [27]:
# Linear SVM (no bias term) trained by sub-gradient descent on the regularised hinge loss.
#
# A single weight vector is kept. Allocating one weight row per training sample
# (np.zeros(X_train.shape)) is unnecessary: w starts at zero and every update is
# broadcast to all rows, so the rows would always be identical. One vector gives
# the same numbers with far less work per update. Code that slices
# w[:len(X_test)] and multiplies by X_test keeps working as long as the test set
# has at least as many rows as there are features.
#
# NOTE(review): y_train holds 0/1 labels, while the hinge-loss margin
# y * (w . x) assumes labels in {-1, +1}. With label 0 the margin is always 0
# and the data term X_train[index] * y_train[index] vanishes, so class-0 rows
# only ever apply weight decay. Mapping the labels to -1/+1 would fix this, but
# it has to be done together with the decision threshold used at prediction
# time, so it is flagged here rather than changed.
w = np.zeros(X_train.shape[1])
epochs = 1
alpha = 0.0001 # learning rate

while(epochs < 10000): # one epoch is one iteration through the complete dataset
    # Margins are computed once per epoch from the weights at the start of the epoch.
    scores = np.sum(w * X_train, axis = 1)
    margins = scores * y_train
    if(epochs % 1000 == 0):
        print(epochs)
    decay = 2 * 1/epochs # regularisation strength shrinks as the epoch count grows
    for index, margin in enumerate(margins):
        if(margin >= 1):
            # margin satisfied: only weight decay is applied
            w = w - alpha * (decay * w) # updating the parameters
        else:
            # margin violated: step towards y * x, plus weight decay
            w = w + alpha * (X_train[index] * y_train[index] - decay * w)
    epochs += 1

1000
2000
3000
4000
5000
6000
7000
8000
9000


In [28]:
## Predicting test values
# When `w` is 2-D (one row per training sample), every row holds the same
# vector: it starts at zero and each training update is broadcast to all rows.
# Taking a single row avoids slicing w[:len(X_test)], which only works when the
# test set has no more rows than the training set. A 1-D `w` is used as-is.
weights = w[0] if np.ndim(w) == 2 else w
y_pred = np.sum(weights * X_test, axis=1)

# NOTE(review): 10 is a hard-coded cut-off on the raw score, not the usual SVM
# decision boundary of 0 - revisit together with the label encoding used in training.
SCORE_THRESHOLD = 10

predictions = []  # 1 = survived, 0 = not survived, in X_test row order
for val in y_pred:
    survived = val > SCORE_THRESHOLD
    predictions.append(int(survived))
    if(survived):
        print("Survived")  # label was previously misspelled
    else:
        print("Not Survived")

Not Survived
Suvived
Suvived
Not Survived
Not Survived
Not Survived
Not Survived
Suvived
Not Survived
Suvived
Not Survived
Suvived
Suvived
Suvived
Suvived
Suvived
Suvived
Not Survived
Not Survived
Suvived
Suvived
Not Survived
Suvived
Suvived
Suvived
Suvived
Suvived
Not Survived
Suvived
Suvived
Suvived
Suvived
Suvived
Suvived
Suvived
Not Survived
Not Survived
Not Survived
Not Survived
Suvived
Suvived
Suvived
Suvived
Not Survived
Suvived
Not Survived
Suvived
Not Survived
Suvived
Suvived
Suvived
Not Survived
Suvived
Suvived
Suvived
Suvived
Not Survived
Not Survived
Suvived
Suvived
Not Survived
Suvived
Not Survived
Not Survived
Suvived
Suvived
Not Survived
Suvived
Suvived
Suvived
Not Survived
Not Survived
Not Survived
Suvived
Suvived
Suvived
Not Survived
Suvived
Not Survived
Not Survived
Not Survived
Suvived
Suvived
Not Survived
Not Survived
Suvived
Not Survived
Not Survived
Not Survived
Not Survived
Not Survived
Not Survived
Suvived
Not Survived
Suvived
Not Survived
Suvived
Not Survived
N

## kNN

In [19]:
def kNearestNeighbors(X_train, y_train, X_test, k=5):
    """Classify one query point by majority vote among its k nearest training points.

    Parameters
    ----------
    X_train : array-like, shape (n_samples, n_features)
        Training feature rows (NumPy array, list of rows or DataFrame).
    y_train : array-like, shape (n_samples,)
        Label of each training row.
    X_test : array-like, shape (n_features,)
        A single query point.
    k : int
        Number of neighbours that vote. If k exceeds n_samples, every training
        point votes.

    Returns
    -------
    The most common label among the k nearest neighbours (Euclidean distance).
    Equal distances are ordered by label; equal vote counts go to the label
    that appears first in that ordering.

    Raises
    ------
    ValueError
        If k is smaller than 1 or the training set is empty.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))

    points = np.asarray(X_train, dtype=float)
    labels = np.asarray(y_train)
    query = np.asarray(X_test, dtype=float)
    if len(labels) == 0:
        raise ValueError("cannot classify with an empty training set")

    # Euclidean distance from the query to every training row in one vectorised call
    distances = np.linalg.norm(points - query, axis=1)

    # sort (distance, label) pairs so ties on distance fall back to label order,
    # then keep the k closest
    nearest = sorted(zip(distances, labels))[:k]
    votes = [label for _, label in nearest]
    vote_result = Counter(votes).most_common(1)[0][0] # finding the most common class
    return vote_result

In [21]:
# Classify every test row with kNN over the training set and print the outcome.
# X_train / y_train / X_test are expected to be the NumPy arrays created in the SVM section.
for x in X_test:
    label = kNearestNeighbors(X_train, y_train, x)
    print("Not Survived" if label == 0 else "Survived")

Not Survived
Not Survived
Not Survived
Not Survived
Not Survived
Not Survived
Not Survived
Survived
Not Survived
Survived
Not Survived
Not Survived
Survived
Not Survived
Survived
Survived
Not Survived
Not Survived
Survived
Not Survived
Not Survived
Survived
Survived
Survived
Survived
Not Survived
Survived
Not Survived
Not Survived
Not Survived
Survived
Survived
Not Survived
Not Survived
Survived
Not Survived
Not Survived
Not Survived
Not Survived
Survived
Not Survived
Not Survived
Not Survived
Not Survived
Survived
Not Survived
Not Survived
Not Survived
Survived
Survived
Survived
Not Survived
Survived
Survived
Not Survived
Not Survived
Not Survived
Not Survived
Not Survived
Survived
Not Survived
Survived
Not Survived
Survived
Survived
Not Survived
Not Survived
Not Survived
Not Survived
Survived
Not Survived
Not Survived
Not Survived
Not Survived
Survived
Not Survived
Not Survived
Not Survived
Not Survived
Not Survived
Survived
Survived
Survived
Not Survived
Not Survived
Not Survived
Su