In [1]:
import numpy as np
import pandas as pd

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from scipy import stats

In [2]:
data = np.loadtxt("Titanic_train.csv", delimiter = ",", dtype = str, skiprows = 1, usecols = [1, 2, 5, 6, 10])
data

array([['0', '3', 'male', '22', '7.25'],
       ['1', '1', 'female', '38', '71.2833'],
       ['1', '3', 'female', '26', '7.925'],
       ...,
       ['0', '3', 'female', '', '23.45'],
       ['1', '1', 'male', '26', '30'],
       ['0', '3', 'male', '32', '7.75']], dtype='<U8')

In [3]:
malemask = data[:, 2] == "male"
femalemask = data[:, 2] == "female"

data[(malemask), 2] = "0"
data[(femalemask), 2] = "1"

emptymask = data[:, 3] == ''
data = data[~(emptymask)].astype(float)
for col in range(len(data[0])):
    data[:, col] = (data[:, col] - np.min(data[:, col]))/(np.max(data[:, col]) - np.min(data[:, col]))
data

array([[0.        , 1.        , 0.        , 0.27117366, 0.01415106],
       [1.        , 0.        , 1.        , 0.4722292 , 0.13913574],
       [1.        , 1.        , 1.        , 0.32143755, 0.01546857],
       ...,
       [1.        , 0.        , 1.        , 0.23347575, 0.0585561 ],
       [1.        , 0.        , 0.        , 0.32143755, 0.0585561 ],
       [0.        , 1.        , 0.        , 0.39683338, 0.01512699]])

In [16]:
"""
emptymask = data[:,3] == ""
data[(emptymask), 3] = np.mean(pd.to_numeric(data[:, 3], errors='coerce')[~np.isnan(pd.to_numeric(data[:, 3], errors='coerce'))])
data
"""

array([['0', '3', '0', '22', '7.25'],
       ['1', '1', '1', '38', '71.2833'],
       ['1', '3', '1', '26', '7.925'],
       ...,
       ['1', '1', '1', '19', '30'],
       ['1', '1', '0', '26', '30'],
       ['0', '3', '0', '32', '7.75']], dtype='<U8')

In [4]:
x = np.c_[np.ones(len(data)), data[:, 1:]]
y = data[:, 0]
x

array([[1.        , 1.        , 0.        , 0.27117366, 0.01415106],
       [1.        , 0.        , 1.        , 0.4722292 , 0.13913574],
       [1.        , 1.        , 1.        , 0.32143755, 0.01546857],
       ...,
       [1.        , 0.        , 1.        , 0.23347575, 0.0585561 ],
       [1.        , 0.        , 0.        , 0.32143755, 0.0585561 ],
       [1.        , 1.        , 0.        , 0.39683338, 0.01512699]])

In [12]:
# Starting Weights
w = [0, 0, 0, 0, 0]

lr = 0.01

# Cost Function Graphing
costx = []
costy = []

# Flag for loop
flag = True

# Iteration countter
c = 0

while flag:
    c += 1
    
    # hw(X)
    pred = np.divide(1, 1+np.exp(-1 * np.dot(x, w)))
    diff = pred - y
    grad = np.dot(diff, x)/len(x)
    
    # cost function
    cost = np.sum(np.dot(y, np.log(pred)) + np.dot(1-y, np.log(1-pred)))
    
    # updating
    w = w - lr * grad
    
    costx.append(c)
    costy.append(cost)
    
    # break condition
    if c > 1000000 or np.linalg.norm(grad) < 0.000001:
        break

In [13]:
w

array([ 1.18449526, -2.5393318 ,  2.51818272, -2.92104009,  0.27620601])

In [14]:
costy

[-494.907086919801,
 -494.6454982893871,
 -494.3849803214351,
 -494.12552533007795,
 -493.867125690629,
 -493.6097738391281,
 -493.3534622718935,
 -493.0981835450729,
 -492.84393027419935,
 -492.5906951337485,
 -492.3384708566981,
 -492.087250234091,
 -491.83702611459927,
 -491.58779140409194,
 -491.3395390652045,
 -491.09226211691146,
 -490.8459536341004,
 -490.60060674714987,
 -490.3562146415081,
 -490.112770557276,
 -489.87026778879135,
 -489.6286996842156,
 -489.38805964512363,
 -489.1483411260958,
 -488.9095376343122,
 -488.6716427291499,
 -488.43465002178186,
 -488.19855317477936,
 -487.96334590171557,
 -487.7290219667734,
 -487.4955751843538,
 -487.26299941868785,
 -487.03128858345076,
 -486.8004366413789,
 -486.5704376038883,
 -486.3412855306964,
 -486.1129745294458,
 -485.885498755331,
 -485.6588524107267,
 -485.43302974481935,
 -485.20802505324025,
 -484.9838326777023,
 -484.7604470056377,
 -484.5378624698393,
 -484.3160735481034,
 -484.0950747628756,
 -483.8748606808984,
 -4

In [25]:
fig = make_subplots(rows = 1, cols = 1)
costxnew = costx[::1000]
costynew = costy[::1000]
fig.add_trace(go.Scatter(x = costxnew, y = costynew, mode = "markers+lines", name = "Real"), row = 1, col = 1)
fig.show()

In [19]:
pred = np.divide(1, 1+np.exp(-1 * np.dot(x, w)))
final_pred = np.around(pred)
final_pred

array([0., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 1., 0., 1.,
       0., 0., 1., 1., 1., 0., 1., 0., 0., 1., 0., 0., 1., 1., 0., 1., 1.,
       1., 1., 0., 0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 0., 0., 1., 0.,
       1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 1., 0.,
       0., 1., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       1., 0., 1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1.,
       0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0.,
       0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0.,
       1., 0., 0., 0., 0.

In [20]:
ymask = y == final_pred
sum(ymask)/len(y) * 100

79.27170868347339

Test Data

In [21]:
data = np.loadtxt("Titanic_test.csv", delimiter = ",", dtype = str, skiprows = 1, usecols = [1, 4, 5, 9])
data

array([['3', 'male', '34.5', '7.8292'],
       ['3', 'female', '47', '7'],
       ['2', 'male', '62', '9.6875'],
       ...,
       ['3', 'male', '38.5', '7.25'],
       ['3', 'male', '', '8.05'],
       ['3', 'male', '', '22.3583']], dtype='<U8')

In [22]:
malemask = data[:, 1] == "male"
femalemask = data[:, 1] == "female"

data[(malemask), 1] = "0"
data[(femalemask), 1] = "1"

emptymask = data[:,2] == ""
data[(emptymask), 2] = np.mean(pd.to_numeric(data[:, 2], errors='coerce')[~np.isnan(pd.to_numeric(data[:, 2], errors='coerce'))])

emptymask = data[:,3] == ""
data[(emptymask), 3] = np.mean(pd.to_numeric(data[:, 3], errors='coerce')[~np.isnan(pd.to_numeric(data[:, 3], errors='coerce'))])

data = data.astype(float)
for col in range(len(data[0])):
    data[:, col] = (data[:, col] - np.min(data[:, col]))/(np.max(data[:, col]) - np.min(data[:, col]))
data

array([[1.        , 0.        , 0.4527232 , 0.01528158],
       [1.        , 1.        , 0.61756561, 0.01366309],
       [0.5       , 0.        , 0.8153765 , 0.01890874],
       ...,
       [1.        , 0.        , 0.50547277, 0.01415106],
       [1.        , 0.        , 0.39697468, 0.01571255],
       [1.        , 0.        , 0.39697468, 0.0436405 ]])

In [24]:
x = np.c_[np.ones(len(data)), data]
pred = np.divide(1, 1+np.exp(-1 * np.dot(x, w)))
final_pred = np.around(pred)
print(final_pred)

[0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 1.
 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0.
 1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0.
 1. 1. 1. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 1. 0. 1. 0.
 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1.
 1. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0.
 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 0. 0. 1. 0. 0.
 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1.
 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0.
 1. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1.
 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1.
 0. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 1.
 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0.
 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0.

In [None]:
[0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 1.
 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0.
 1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0.
 1. 1. 1. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 1. 0. 1. 0.
 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1.
 1. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0.
 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 0. 0. 1. 0. 0.
 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1.
 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0.
 1. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1.
 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1.
 0. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 1.
 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0.
 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1.
 0. 1. 1. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1.
 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0.
 1. 1. 1. 1. 1. 0. 1. 0. 0. 0.]