In [1]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow import keras

%matplotlib inline

In [2]:
# Load the insurance purchase records and preview the first few rows.
insurance_csv_path = "./Data Files/insurance_data2.csv"
df = pd.read_csv(insurance_csv_path)
df.head()

Unnamed: 0,age,affordibility,bought_insurance
0,22,1,0
1,25,0,0
2,47,1,1
3,52,0,0
4,46,1,1


In [3]:
# Features: age plus the affordibility flag; target: whether insurance was bought.
X = df[['age', 'affordibility']]
y = df['bought_insurance']

# Keep 80% of the rows for training; the fixed random_state makes the split
# identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=25
)

In [4]:
# Divisor applied to the age column; with the ages shown in this dataset it
# brings age to roughly the same 0-1 range as the affordibility flag.
AGE_SCALE = 100


def scale_features(features):
    """Return a copy of ``features`` with the ``age`` column divided by ``AGE_SCALE``.

    The input frame is left untouched, so re-running the cell is idempotent.
    """
    return features.assign(age=features['age'] / AGE_SCALE)


# Apply the exact same scaling to both splits.
X_train_scaled = scale_features(X_train)
X_test_scaled = scale_features(X_test)

X_test_scaled

Unnamed: 0,age,affordibility
2,0.47,1
10,0.18,1
21,0.26,0
11,0.28,1
14,0.49,1
9,0.61,1


In [5]:
# Seed TensorFlow's global random generator so that the shuffling fit() performs
# by default -- and therefore the learned weights compared against the
# from-scratch model below -- can be reproduced on a fresh run.
tf.random.set_seed(25)

# A single sigmoid unit over the input features, i.e. logistic regression.
# Weights start at 1 and the bias at 0 so the starting point is fixed rather
# than randomly initialised.
model = keras.Sequential([
    keras.layers.Dense(
        1,
        input_shape=(X_train_scaled.shape[1],),  # one input per feature column
        activation='sigmoid',
        kernel_initializer='ones',
        bias_initializer='zeros'
    )
])

model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=['accuracy'])

model.fit(X_train_scaled, y_train, epochs=5000, verbose=0)

<keras.callbacks.History at 0x1bf12f15220>

In [6]:
# Output of the sigmoid unit for each held-out row: the model's estimated
# probability that the person bought insurance.
model.predict(X_test_scaled)



array([[0.7054848 ],
       [0.3556957 ],
       [0.16827832],
       [0.47801185],
       [0.7260696 ],
       [0.8294983 ]], dtype=float32)

In [7]:
# For this one-layer model get_weights() yields two arrays: the Dense kernel
# (one row per input feature) followed by the bias.
coef, intercept = model.get_weights()
coef, intercept

(array([[5.060863 ],
        [1.4086521]], dtype=float32),
 array([-2.913703], dtype=float32))

In [8]:
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + e**-x)`` of a scalar ``x``.

    The computation is split by sign so that ``math.exp`` is only ever called
    with a non-positive argument; the naive formula raises ``OverflowError``
    for very negative inputs (around ``x < -709``).
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # Algebraically identical form for negative x: e**x / (1 + e**x).
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

In [9]:
def log_loss(y_true, y_predicted):
    """Return the mean binary cross-entropy between labels and predictions.

    Args:
        y_true: Array-like of 0/1 labels (list, NumPy array or pandas Series).
        y_predicted: Array-like of predicted probabilities, paired with
            ``y_true`` by position.

    Returns:
        The mean of ``-(y*log(p) + (1-y)*log(1-p))`` over all samples.
    """
    epsilon = 1e-15
    # Work on float64 arrays: plain lists then support the arithmetic below,
    # and the upper clip bound 1 - epsilon stays distinguishable from 1.0
    # (in float32 it rounds to exactly 1.0, which would let log(0) through).
    y_true = np.asarray(y_true, dtype=float)
    y_predicted_new = np.clip(
        np.asarray(y_predicted, dtype=float), epsilon, 1 - epsilon
    )
    return -np.mean(
        y_true * np.log(y_predicted_new)
        + (1 - y_true) * np.log(1 - y_predicted_new)
    )

In [10]:
def sigmoid_numpy(X):
    """Apply the logistic sigmoid element-wise to ``X``.

    ``X`` may be a scalar, a NumPy array or a pandas Series; the result has
    the same shape as whatever ``np.exp(-X)`` produces.
    """
    denominator = 1 + np.exp(-X)
    return 1 / denominator

In [11]:
class NeuralNetwork:
    """Single sigmoid neuron (logistic regression) trained by batch gradient descent.

    Every weight starts at 1 and the bias at 0. The loss is computed with the
    notebook's ``log_loss`` and the activation with ``sigmoid_numpy``.

    Attributes:
        weights: One learned weight per feature column, in column order.
            Empty until ``fit`` has been called.
        bias: Learned intercept; 0 until ``fit`` has been called.
    """

    def __init__(self):
        self.weights = []
        self.bias = 0

    def fit(self, X, y, epochs, loss_threshold, learning_rate=0.5):
        """Train on ``X``/``y`` and store the resulting weights and bias.

        Args:
            X: DataFrame with one column per feature.
            y: Target values, aligned with the rows of ``X``.
            epochs: Maximum number of gradient-descent iterations.
            loss_threshold: Training stops early once the loss is at or
                below this value.
            learning_rate: Step size of each update (default 0.5).
        """
        self.weights, self.bias = self.gradient_descent(
            X, y, epochs, loss_threshold, learning_rate
        )

    def predict(self, X):
        """Return the sigmoid activation for every row of the DataFrame ``X``.

        Raises:
            ValueError: If the number of stored weights does not match the
                number of columns in ``X`` (for example when called before
                ``fit``).
        """
        columns = self._feature_columns(X)
        if len(self.weights) != len(columns):
            raise ValueError(
                f'expected {len(self.weights)} feature column(s), got {len(columns)}; '
                'call fit() first and pass the same columns to predict()'
            )
        return sigmoid_numpy(self._weighted_sum(columns, self.weights, self.bias))

    @staticmethod
    def _feature_columns(X):
        """Split a DataFrame into a list of its columns, in column order."""
        return [X[name] for name in X.columns]

    @staticmethod
    def _weighted_sum(columns, weights, bias):
        """Return ``sum(weights[j] * columns[j]) + bias``, accumulated column by column."""
        total = 0
        for weight, column in zip(weights, columns):
            total += weight * column
        total += bias
        return total

    def gradient_descent(self, X, y, epochs, loss_threshold, learning_rate=0.5):
        """Run batch gradient descent and return ``(weights, bias)``.

        The loss is printed every 50 epochs and once more on the epoch where
        it first reaches ``loss_threshold``. The loss reported for an epoch is
        measured before that epoch's update is applied.

        Args:
            X: DataFrame with one column per feature.
            y: Target values, aligned with the rows of ``X``.
            epochs: Maximum number of iterations.
            loss_threshold: Stop as soon as the loss is at or below this value.
            learning_rate: Step size of each update (default 0.5).

        Returns:
            A tuple of the weight list (one entry per column) and the bias.
        """
        columns = self._feature_columns(X)
        w = [1 for _ in columns]
        b = 0

        for epoch in range(epochs):
            # forward pass and loss
            y_hat = sigmoid_numpy(self._weighted_sum(columns, w, b))
            loss = log_loss(y, y_hat)

            # derivatives of the loss with respect to each weight and the bias
            error = y_hat - y
            wd = [(1 / len(column)) * np.dot(column, error) for column in columns]
            bd = np.mean(error)

            # update step
            w = [w_j - learning_rate * wd_j for w_j, wd_j in zip(w, wd)]
            b = b - learning_rate * bd

            # Report progress exactly once per reporting epoch, even when the
            # stopping epoch is also a multiple of 50.
            converged = loss <= loss_threshold
            if converged or epoch % 50 == 0:
                print(f'Epoch:{epoch}, loss:{loss}')
            if converged:
                break

        return w, b
            

In [12]:
# Train the from-scratch neuron on the same scaled training data, for at most
# 500 epochs or until the loss reaches the threshold.
scratch_model = NeuralNetwork()
scratch_model.fit(
    X_train_scaled,
    y_train,
    epochs=500,
    loss_threshold=0.4631,
)

Epoch:0, loss:0.7113403233723417
Epoch:50, loss:0.5675865113475955
Epoch:100, loss:0.5390680417774752
Epoch:150, loss:0.5176462164249293
Epoch:200, loss:0.5005011269691375
Epoch:250, loss:0.48654089537617085
Epoch:300, loss:0.4750814640632793
Epoch:350, loss:0.46561475306999006
Epoch:366, loss:0.46293944095888917


In [13]:
# Side-by-side view of the weights learned by the two implementations.
# coef holds one row per feature with a single column, so take column 0.
solved_weights = pd.DataFrame({
    'scratch weights': scratch_model.weights,
    'tensorflow weights': list(coef[:, 0]),
})
solved_weights

Unnamed: 0,scratch weights,tensorflow weights
0,5.051048,5.060863
1,1.456979,1.408652


In [14]:
# Side-by-side view of the bias learned by the two implementations.
solved_biases = pd.DataFrame({
    'scratch bias': scratch_model.bias,
    'tensorflow bias': intercept,
})
solved_biases

Unnamed: 0,scratch bias,tensorflow bias
0,-2.959653,-2.913703


In [15]:
# Run the held-out rows through the from-scratch model.
scratch_predicted = scratch_model.predict(X_test_scaled)

In [16]:
# Same Keras prediction call as the one displayed earlier, kept in a variable
# here so it can be placed next to the from-scratch predictions.
tf_predicted = model.predict(X_test_scaled)



In [17]:
# Comparison table on the unscaled test rows: true label next to each model's
# predicted probability. assign() returns a new frame, leaving X_test untouched.
predictions = X_test.assign(**{
    'actual': y_test,
    'scratch prediction': scratch_predicted,
    # Keras returns one column per output unit (shape (n, 1)); flatten it so a
    # plain 1-D column is stored rather than relying on implicit 2-D coercion.
    'tensorflow prediction': tf_predicted.ravel(),
})
predictions

Unnamed: 0,age,affordibility,actual,scratch prediction,tensorflow prediction
2,47,1,1,0.70502,0.705485
10,18,1,0,0.355836,0.355696
21,26,0,0,0.161599,0.168278
11,28,1,0,0.477919,0.478012
14,49,1,1,0.725586,0.72607
9,61,1,1,0.828987,0.829498


### Gradient Descent Algorithm

$$
\textbf{Initial values: } \\
r = 0.5, w_1 = 1, w_2 = 1, b = 0 \qquad \textit{learning rate, weight 1, weight 2, bias}
\\~\\
\textbf{Given: } \\
x_1 = \begin{bmatrix} ... \end{bmatrix}, x_2 = \begin{bmatrix} ... \end{bmatrix} \\ 
y = \begin{bmatrix} ... \end{bmatrix} \\
n \qquad \textit{Number of samples}
\\~\\
\textbf{An epoch: } \\
w\Sigma = w_1 \cdot x_1 + w_2 \cdot x_2 + b \\
\hat{y} = \sigma(w\Sigma) \qquad \textit{Activation function} \\
loss = logLoss(y, \hat{y}) \qquad \textit{Loss function} \\
\\~\\
\textbf{Derivatives:} \\
\frac{\partial\, loss}{\partial w_1} = \frac{x_1^T \cdot (\hat{y} - y)}{n} \\
\frac{\partial\, loss}{\partial w_2} = \frac{x_2^T \cdot (\hat{y} - y)}{n} \\
\frac{\partial\, loss}{\partial b} = \mu(\hat{y} - y) \qquad \textit{Mean function} \\~\\
\textbf{Reassign values: } \\
w_1 = w_1 - r \cdot \frac{\partial\, loss}{\partial w_1} \\
w_2 = w_2 - r \cdot \frac{\partial\, loss}{\partial w_2} \\
b = b - r \cdot \frac{\partial\, loss}{\partial b}
$$