# NN from scratch with Titanic data set

In [1]:
import torch, numpy as np, pandas as pd
df = pd.read_csv("train.csv")
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [2]:
modes = df.mode().iloc[0]
modes

PassengerId                      1
Survived                       0.0
Pclass                         3.0
Name           Abbing, Mr. Anthony
Sex                           male
Age                           24.0
SibSp                          0.0
Parch                          0.0
Ticket                        1601
Fare                          8.05
Cabin                      B96 B98
Embarked                         S
Name: 0, dtype: object

In [3]:
df.fillna(modes, inplace=True)
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [4]:
# log of fare 
df['LogFare'] = np.log(df['Fare']+1)

In [5]:
df = pd.get_dummies(df, columns=["Sex","Pclass","Embarked"])
df.columns

Index(['PassengerId', 'Survived', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'LogFare', 'Sex_female', 'Sex_male', 'Pclass_1',
       'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [6]:
from torch import tensor

# dependent variable Y is survived 
t_dep = tensor(df["Survived"])

# independent variably X are the rest (exclude str vars: name, ticket, cabin)
cols = ['Age', 'SibSp', 'Parch', 'LogFare', 'Sex_female', 'Sex_male', 'Pclass_1', 'Pclass_2',
       'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S']

t_indep = tensor(df[cols].astype(float).values, dtype=torch.float)
t_indep

tensor([[22.,  1.,  0.,  ...,  0.,  0.,  1.],
        [38.,  1.,  0.,  ...,  1.,  0.,  0.],
        [26.,  0.,  0.,  ...,  0.,  0.,  1.],
        ...,
        [24.,  1.,  2.,  ...,  0.,  0.,  1.],
        [26.,  0.,  0.,  ...,  1.,  0.,  0.],
        [32.,  0.,  0.,  ...,  0.,  1.,  0.]])

In [7]:
df[["Embarked_C","Embarked_Q"]].astype(int)

df[cols].head()

Unnamed: 0,Age,SibSp,Parch,LogFare,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,22.0,1,0,2.110213,False,True,False,False,True,False,False,True
1,38.0,1,0,4.280593,True,False,True,False,False,True,False,False
2,26.0,0,0,2.188856,True,False,False,False,True,False,False,True
3,35.0,1,0,3.990834,True,False,True,False,False,False,False,True
4,35.0,0,0,2.202765,False,True,False,False,True,False,False,True


## Linear Model with OLS

Remember: 
$
(X'X)^{-1} X'y
$

In [8]:
ones = np.ones( (len(t_indep),1) )
X = np.array(t_indep)/np.array(t_indep).max(axis=0) 
#X = np.vstack( (ones,X) ) #[np.ones(len(t_indep))
X_ = np.append( ones ,X ,axis=1)
y = np.array(t_dep)

ols_coeffs = np.linalg.inv(X_.T.dot(X_)).dot(X_.T.dot(y))
ols_coeffs = dict(zip( ["const"]+cols , ols_coeffs))
ols_coeffs

{'const': -356472154361620.25,
 'Age': -0.3916277742165166,
 'SibSp': -1.7176744764188352,
 'Parch': -0.652258897770639,
 'LogFare': 5.432247480076669,
 'Sex_female': 1590430026047686.0,
 'Sex_male': 1590430026047685.5,
 'Pclass_1': -1233957871686064.0,
 'Pclass_2': -1233957871686067.0,
 'Pclass_3': -1233957871686066.0,
 'Embarked_C': 2.5,
 'Embarked_Q': 2.5,
 'Embarked_S': 2.75}

In [34]:
from sklearn import linear_model
import statsmodels.api as sm

reg = linear_model.LinearRegression()
reg.fit(X,y)
#LinearRegression()
print(len(reg.coef_))
sk_coeffs = dict(zip( cols , reg.coef_))
sk_coeffs

X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())

12
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.400
Model:                            OLS   Adj. R-squared:                  0.394
Method:                 Least Squares   F-statistic:                     65.35
Date:                Fri, 01 Mar 2024   Prob (F-statistic):           9.56e-92
Time:                        17:47:05   Log-Likelihood:                -394.14
No. Observations:                 891   AIC:                             808.3
Df Residuals:                     881   BIC:                             856.2
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2524      0.036      6.982      

In [None]:
#import seaborn as sbs
#sbs.pairplot(df[cols])

In [None]:
# normalizing the data to 0-1
t_indep = t_indep/t_indep.max(dim=0).values
t_indep

## Setting up the NN 

In [13]:
torch.manual_seed(442)

n_coeff = t_indep.shape[1]
coeffs = torch.rand(n_coeff)-0.5
coeffs

tensor([-0.4629,  0.1386,  0.2409, -0.2262, -0.2632, -0.3147,  0.4876,  0.3136,
         0.2799, -0.4392,  0.2103,  0.3625])

In [14]:
preds = (t_indep*coeffs).sum(axis=1)
loss = torch.abs(preds-t_dep).mean()
loss

tensor(13.8746)

In [15]:
def calc_preds(coeffs, indeps): return (indeps*coeffs).sum(axis=1)
def calc_loss(coeffs, indeps, deps): return torch.abs(calc_preds(coeffs, indeps)-deps).mean()

In [16]:
coeffs.requires_grad_()

tensor([-0.4629,  0.1386,  0.2409, -0.2262, -0.2632, -0.3147,  0.4876,  0.3136,
         0.2799, -0.4392,  0.2103,  0.3625], requires_grad=True)

In [17]:
loss = calc_loss(coeffs, t_indep, t_dep)
loss

tensor(13.8746, grad_fn=<MeanBackward0>)

In [18]:
loss = calc_loss(coeffs, t_indep, t_dep)
loss.backward()
with torch.no_grad():
    coeffs.sub_(coeffs.grad * 0.1)
    coeffs.grad.zero_()
    print(calc_loss(coeffs, t_indep, t_dep))

tensor(68.7940)


## training the model

In [19]:
from fastai.data.transforms import RandomSplitter
trn_split,val_split=RandomSplitter(seed=42)(df)

trn_indep,val_indep = t_indep[trn_split],t_indep[val_split]
trn_dep,val_dep = t_dep[trn_split],t_dep[val_split]
len(trn_indep),len(val_indep)

(713, 178)

In [20]:
def update_coeffs(coeffs, lr):
    coeffs.sub_(coeffs.grad * lr)
    coeffs.grad.zero_()

In [21]:
def one_epoch(coeffs, lr):
    loss = calc_loss(coeffs, trn_indep, trn_dep)
    loss.backward()
    with torch.no_grad(): update_coeffs(coeffs, lr)
    print(f"{loss:.3f}", end="; ")

In [22]:
def init_coeffs(): return (torch.rand(n_coeff)-0.5).requires_grad_()

In [23]:
def train_model(epochs=30, lr=0.01):
    torch.manual_seed(442)
    coeffs = init_coeffs()
    for i in range(epochs): one_epoch(coeffs, lr=lr)
    return coeffs

In [24]:
coeffs = train_model(18, lr=0.2)

13.853; 150.990; 13.853; 150.990; 13.853; 150.990; 13.853; 150.990; 13.853; 150.990; 13.853; 150.990; 13.853; 150.990; 13.853; 150.990; 13.853; 150.990; 

In [25]:
def show_coeffs(): return dict(zip(cols, coeffs.requires_grad_(False)))
show_coeffs()

{'Age': tensor(-0.4629),
 'SibSp': tensor(0.1386),
 'Parch': tensor(0.2409),
 'LogFare': tensor(-0.2262),
 'Sex_female': tensor(-0.2632),
 'Sex_male': tensor(-0.3147),
 'Pclass_1': tensor(0.4876),
 'Pclass_2': tensor(0.3136),
 'Pclass_3': tensor(0.2799),
 'Embarked_C': tensor(-0.4392),
 'Embarked_Q': tensor(0.2103),
 'Embarked_S': tensor(0.3625)}

In [33]:
sk_coeffs

{'Age': -0.43796274,
 'SibSp': -0.42575407,
 'Parch': -0.15299387,
 'LogFare': 0.36158663,
 'Sex_female': 0.24832955,
 'Sex_male': -0.24832933,
 'Pclass_1': 0.12090873,
 'Pclass_2': 0.02524759,
 'Pclass_3': -0.1461561,
 'Embarked_C': 0.020932771,
 'Embarked_Q': 0.015777865,
 'Embarked_S': -0.036710616}