In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
def sigmoid(z):
    return 1/(1+np.exp(-z))

In [3]:
def compute_cost(x, y, w, b):
    m, n = x.shape
    
    z = np.dot(x,w) + b
    f_wb = sigmoid(z)
    
    cost = -(1.0 / m) * np.sum((y * np.log(f_wb)) + (1 - y) * np.log(1 - f_wb))
    return cost

In [4]:
def compute_gradient_descent(x, y, w, b):
    m, n = x.shape
    
    dw = np.zeros((n, ))
    db = 0.0
    for i in range(m):
        f_wb_i = sigmoid(np.dot(x[i], w) + b)
        err = f_wb_i - y[i]
        for j in range(n):
            dw[j] = dw[j] + err * x[i, j]
        db = db + err
    dw = dw / m
    db = db / m
    return dw, db

In [5]:
def gradient_descent(x, y):
    m, n = x.shape
    
    w = np.zeros((n ,))
    b = 0.0
    alpha = 1e-2
    max_iter = 10000
    
    losses = []
    for i in range(max_iter):
        cost = compute_cost(x, y, w, b)
        losses.append(cost)
        
        if i%50 == 0:
            print(f"Iteration: {i}, Cost: {losses[i]}")
        dw, db = compute_gradient_descent(x, y, w, b)
        
        w = w - (alpha * dw)
        b = b - (alpha * db)
    return w, b

In [6]:
df = pd.read_csv('train.csv')

In [7]:
df['Sex'].replace({
    'male' : 1,
    'female' : 0
}, inplace=True)

In [8]:
df['Age'].replace({
    np.nan : np.sum(df['Age'])/ df.shape[0]
}, inplace=True)

In [9]:
df['Age'].unique()

array([22.        , 38.        , 26.        , 35.        , 23.79929293,
       54.        ,  2.        , 27.        , 14.        ,  4.        ,
       58.        , 20.        , 39.        , 55.        , 31.        ,
       34.        , 15.        , 28.        ,  8.        , 19.        ,
       40.        , 66.        , 42.        , 21.        , 18.        ,
        3.        ,  7.        , 49.        , 29.        , 65.        ,
       28.5       ,  5.        , 11.        , 45.        , 17.        ,
       32.        , 16.        , 25.        ,  0.83      , 30.        ,
       33.        , 23.        , 24.        , 46.        , 59.        ,
       71.        , 37.        , 47.        , 14.5       , 70.5       ,
       32.5       , 12.        ,  9.        , 36.5       , 51.        ,
       55.5       , 40.5       , 44.        ,  1.        , 61.        ,
       56.        , 50.        , 36.        , 45.5       , 20.5       ,
       62.        , 41.        , 52.        , 63.        , 23.5 

In [10]:
X = np.array(df[['Pclass', 'Sex', 'Age']])

In [11]:
Y = np.array(df['Survived'])

In [12]:
w, b = gradient_descent(X, Y)

Iteration: 0, Cost: 0.6931471805599454
Iteration: 50, Cost: 0.7197046363769347
Iteration: 100, Cost: 0.7123891153214511
Iteration: 150, Cost: 0.7060111910817716
Iteration: 200, Cost: 0.7001875325527279
Iteration: 250, Cost: 0.6946920133820781
Iteration: 300, Cost: 0.6894010046591178
Iteration: 350, Cost: 0.6842502603280775
Iteration: 400, Cost: 0.679207834083961
Iteration: 450, Cost: 0.6742584412222787
Iteration: 500, Cost: 0.6693948797793711
Iteration: 550, Cost: 0.6646135351668581
Iteration: 600, Cost: 0.6599121494262399
Iteration: 650, Cost: 0.6552887990574339
Iteration: 700, Cost: 0.6507414898000847
Iteration: 750, Cost: 0.6462680476315923
Iteration: 800, Cost: 0.6418661383798187
Iteration: 850, Cost: 0.6375333324498795
Iteration: 900, Cost: 0.633267175921015
Iteration: 950, Cost: 0.6290652521817294
Iteration: 1000, Cost: 0.6249252293890687
Iteration: 1050, Cost: 0.620844893993409
Iteration: 1100, Cost: 0.6168221724408116
Iteration: 1150, Cost: 0.6128551435848483
Iteration: 1200, C

Iteration: 9750, Cost: 0.47173901897186316
Iteration: 9800, Cost: 0.47160920869734047
Iteration: 9850, Cost: 0.4714803975590844
Iteration: 9900, Cost: 0.4713525763780977
Iteration: 9950, Cost: 0.4712257360923573


In [13]:
y_pred = sigmoid(np.dot(X, w) + b)

In [14]:
output = []
m = X.shape[0]
for i in range(m):
    value = sigmoid(np.dot(X[i], w) + b)
    if value >= 0.65:
        output.append(1)
    else:
        output.append(0)

In [15]:
output = np.array(output)

In [16]:
output.shape

(891,)

In [17]:
count_wrong = 0
for i in range(m):
    if output[i] != Y[i]:
        count_wrong +=1

In [18]:
count_wrong

190

In [19]:
((m - count_wrong) / m) * 100

78.67564534231201

In [20]:
test = pd.read_csv('test.csv')

In [21]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [22]:
test['Sex'].replace({
    'male': 1,
    'female' : 0
}, inplace=True)

In [23]:
test['Age'].replace({
    np.nan : np.sum(test['Age']) / test.shape[0]
}, inplace=True)

In [24]:
test['Age'].unique()

array([34.5       , 47.        , 62.        , 27.        , 22.        ,
       14.        , 30.        , 26.        , 18.        , 21.        ,
       24.04425837, 46.        , 23.        , 63.        , 24.        ,
       35.        , 45.        , 55.        ,  9.        , 48.        ,
       50.        , 22.5       , 41.        , 33.        , 18.5       ,
       25.        , 39.        , 60.        , 36.        , 20.        ,
       28.        , 10.        , 17.        , 32.        , 13.        ,
       31.        , 29.        , 28.5       , 32.5       ,  6.        ,
       67.        , 49.        ,  2.        , 76.        , 43.        ,
       16.        ,  1.        , 12.        , 42.        , 53.        ,
       26.5       , 40.        , 61.        , 60.5       ,  7.        ,
       15.        , 54.        , 64.        , 37.        , 34.        ,
       11.5       ,  8.        ,  0.33      , 38.        , 57.        ,
       40.5       ,  0.92      , 19.        , 36.5       ,  0.75

In [25]:
X_test = np.array(test[['Pclass', 'Sex', 'Age']])

In [26]:
output_test = []
for i in range(X_test.shape[0]):
    value = sigmoid(np.dot(X_test[i], w) + b)
    if value >= 0.65:
        output_test.append(1)
    else:
        output_test.append(0)
output_test = np.array(output_test)

In [27]:
passengerId = np.array(test['PassengerId'])

In [28]:
final_df = pd.DataFrame([passengerId, output_test])

In [29]:
final_df = final_df.T 

In [30]:
final_df

Unnamed: 0,0,1
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [31]:
final_df.rename(columns = {0:'PassengerId', 1:'Survived'}, inplace = True)

In [32]:
final_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [33]:
final_df.to_csv('gender_submission.csv', index=False)

In [39]:
output_test = list(output_test)

In [40]:
output_test.count(1)

80

In [41]:
output_test.count(0)

338