# 0. Import Libraries

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Register pandas' formatters and converters with matplotlib (done once, right after the imports).
pd.plotting.register_matplotlib_converters()

In [2]:
# Load the Iris dataset from a CSV file located one level above the working directory.
iris = pd.read_csv("../iris.csv")

### Data Basic Info

In [3]:
# Preview the first rows to check the column names and value formats.
iris.head()

Unnamed: 0,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# List the column labels: four measurement columns followed by the 'class' label.
iris.columns

Index(['sepal length in cm', 'sepal width in cm', 'petal length in cm',
       'petal width in cm', 'class'],
      dtype='object')

# 1. Data Preparation

### Encoding Target Column Values

In [5]:
# The discussion at
# https://datascience.stackexchange.com/questions/39317/difference-between-ordinalencoder-and-labelencoder
# explains why LabelEncoder is used here for the target column.

# Learn the class names, then store their integer codes in a new 'code' column.
LE = LabelEncoder().fit(iris['class'])
iris['code'] = LE.transform(iris['class'])

# Prediction target: the integer class codes as a NumPy array.
y = iris['code'].to_numpy()
print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [6]:
# Show the frame again to confirm the new 'code' column sits next to 'class'.
display(iris)

Unnamed: 0,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class,code
0,5.1,3.5,1.4,0.2,Iris-setosa,0
1,4.9,3.0,1.4,0.2,Iris-setosa,0
2,4.7,3.2,1.3,0.2,Iris-setosa,0
3,4.6,3.1,1.5,0.2,Iris-setosa,0
4,5.0,3.6,1.4,0.2,Iris-setosa,0
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica,2
146,6.3,2.5,5.0,1.9,Iris-virginica,2
147,6.5,3.0,5.2,2.0,Iris-virginica,2
148,6.2,3.4,5.4,2.3,Iris-virginica,2


### Convert to Numpy Array

In [7]:
# Network dimensions used by the parameter initialisation and the data slicing.
FEATURE_COUNT = 4  # input features: sepal length/width and petal length/width
HIDDEN_LAYER_NODE = 4  # number of nodes in the hidden layer
OUTPUT_LAYER_NODE = 3  # one output node per iris class

In [8]:
# Convert the frame to a NumPy array. Because the 'class' column holds strings,
# the resulting array has dtype=object.
data = np.array(iris)
m, n = data.shape  # m rows (samples), n columns (4 features + 'class' + 'code')
print(f'm: {m}, n: {n}')

# Shuffle the rows in place with a fixed seed so that Restart & Run All always
# produces the same row order (an unseeded shuffle made every run different).
SHUFFLE_SEED = 0
np.random.default_rng(SHUFFLE_SEED).shuffle(data)

m: 150, n: 6


### Train and Validation data split

In [9]:
# `data` has dtype=object (it still carries the string 'class' column), so the
# slices must be cast explicitly: NumPy ufuncs such as np.exp, used later by
# softmax, raise TypeError on object arrays.
X = data[:, 0:FEATURE_COUNT].astype(np.float64)

# The last column is the integer 'code' column added by the label-encoding step.
y = data[:, n - 1].astype(np.int64)

# A fixed random_state keeps the train/validation split reproducible.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Neural Network

### Neural Network Setup

![Iris Neural Network](./img/iris_neural_network_overview.png)

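We will be using a 2-layer neural network for this problem: one hidden layer and one output layer. By convention the input layer is not counted, which is why three layers are listed below.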


**First Layer**  : Input Layer  ('sepal length in cm', 'sepal width in cm', 'petal length in cm','petal width in cm')  
**Second Layer** : Hidden Layer  
**Third Layer**  : Output Layer  ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')

### Representing Our Data
For most ML projects, we represent the input as a 2D matrix of size $m \times n$, where $m$ is the number of samples (rows) and $n$ is the number of features (columns).

Transposing the input matrix to $n \times m$ (one column per sample) lets us compute the layer output for all samples with a single matrix product.

The dot product (Z) operation that we are going to use is:
$$
Z^{[1]} = W^{[1]}X + b^{[1]}
$$

```
W: Weight  
b: Bias
```

Multiplying W and X will yield a 
```
W (4, 4) . X (4 x m) = (4 x m) matrix
```

The bias term $b^{[1]}$ has dimension `4 x 1` (one value per hidden node). It is broadcast across all `m` columns of the product, so the same bias is added to every sample.

### Non-Linear Activation Function
Next, we apply a non-linear activation function. Without it, stacking layers would collapse into a single linear model, no matter how many layers we add.

In this case, we will be using ReLU (Rectified Linear Unit).

<img src="./img/iris_relu.png" alt="Rectified Linear Unit - ReLu" width="500px" align="left"/>

In [10]:
# Put one sample per column: (samples, features) -> (features, samples),
# the layout that W.dot(X) in forward_prop operates on.
train_X_transpose = np.transpose(train_X)
# train_y is 1-D, so transposing leaves it unchanged; kept for naming symmetry.
train_y_transpose = np.transpose(train_y)

In [21]:
def init_params():
    """Create randomly initialised weights and biases for both layers.

    Every entry is drawn with ``np.random.rand``, i.e. uniformly from [0, 1).

    Returns:
        tuple: ``(W1, b1, W2, b2)`` with shapes
            W1: (HIDDEN_LAYER_NODE, FEATURE_COUNT)
            b1: (HIDDEN_LAYER_NODE, 1)
            W2: (OUTPUT_LAYER_NODE, HIDDEN_LAYER_NODE)
            b2: (OUTPUT_LAYER_NODE, 1)
    """
    # The draw order (W1, b1, W2, b2) is kept so that results under a fixed
    # global seed do not change.
    layer_shapes = (
        (HIDDEN_LAYER_NODE, FEATURE_COUNT),
        (HIDDEN_LAYER_NODE, 1),
        (OUTPUT_LAYER_NODE, HIDDEN_LAYER_NODE),
        (OUTPUT_LAYER_NODE, 1),
    )
    W1, b1, W2, b2 = (np.random.rand(*shape) for shape in layer_shapes)
    return W1, b1, W2, b2

def forward_prop(W1, b1, W2, b2, X):
    """Run one forward pass through the two-layer network.

    Args:
        W1: Hidden-layer weight matrix.
        b1: Hidden-layer bias column, broadcast over the columns of ``W1.dot(X)``.
        W2: Output-layer weight matrix.
        b2: Output-layer bias column, broadcast over the columns of ``W2.dot(A1)``.
        X: Input matrix; assumed to hold one sample per column
            (features x samples) -- confirm against the caller.

    Returns:
        tuple: ``(Z1, A1, Z2, A2)`` -- the pre-activation and activation of the
        hidden layer followed by the pre-activation and softmax output of the
        output layer.
    """
    # Hidden layer: affine transform followed by the rectifier.
    Z1 = W1.dot(X) + b1
    # The activation helper in this notebook is spelled `ReLu`; calling `ReLU`
    # raised NameError.
    A1 = ReLu(Z1)

    # Output layer: affine transform followed by softmax.
    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)

    return Z1, A1, Z2, A2

def backward_prop():
    """Placeholder for the backward pass, which has not been written yet.

    A ``def`` with an empty body is a SyntaxError and prevents every other
    definition in the same cell from being created, so the stub fails loudly
    at call time instead.

    Raises:
        NotImplementedError: Always, until the gradient computation is added.
    """
    # TODO: compute the gradients of the loss with respect to W1, b1, W2 and b2.
    raise NotImplementedError("backward_prop is not implemented yet")

def ReLu(Z):
    """Rectified Linear Unit: element-wise maximum of ``Z`` and zero.

    Args:
        Z: Scalar or array of pre-activation values.

    Returns:
        ``Z`` with every negative entry replaced by 0.
    """
    rectified = np.maximum(Z, 0)
    return rectified

def softmax(Z):
    """Column-wise softmax.

    Each column of ``Z`` is treated as the scores of one sample and is turned
    into a probability distribution, so every column of the result sums to 1.
    A 1-D input is treated as a single sample. Dividing by the sum over the
    whole matrix, as before, made all entries of all samples share one total
    of 1.

    Args:
        Z: Array-like of scores; for 2-D input, axis 0 indexes the classes.

    Returns:
        numpy.ndarray: Array with the same shape as ``Z`` holding the
        per-column probabilities.
    """
    Z = np.asarray(Z, dtype=float)
    # Subtracting the column maximum leaves the result unchanged mathematically
    # but keeps np.exp from overflowing on large scores.
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=0, keepdims=True)

# Scratch check of the exponent-and-normalise formula on a tiny 2x2 example.
# The sum here runs over the whole matrix, so the four entries together add up to 1.
demo_logits = [[1, 2], [3, -1]]
s = np.exp(demo_logits) / np.exp(demo_logits).sum()
display(s)

array([[0.08894682, 0.24178252],
       [0.65723302, 0.01203764]])

In [12]:
# Row count unpacked from data.shape earlier in the notebook.
m

150

In [13]:
# Column count unpacked from data.shape earlier (4 features + 'class' + 'code').
n

6

# TO BE CONTINUED