# DOMAIN: Electronics and Telecommunication
## • CONTEXT: A communications equipment manufacturing company has a product which is responsible for emitting informative signals.
## The company wants to build a machine learning model that can help predict the equipment’s signal quality using various parameters.

### • DATA DESCRIPTION: The data set contains information on various signal tests performed:
1. Parameters: Various measurable signal parameters.
2. Signal_Quality: Final signal strength or quality

## • PROJECT OBJECTIVE: To build a classifier which can use the given parameters to determine the signal strength or quality.

In [None]:
#Part A - Q1 A - Read the ‘Signals.csv’ as a DataFrame and import the required libraries.
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as stats 
import tensorflow
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, auc
import matplotlib.pyplot as plt
from tensorflow.keras import optimizers
from keras.layers import Dense, Activation, LeakyReLU,ReLU
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

# Load the signal test records from the CSV file into a pandas DataFrame
nn_signal = pd.read_csv(filepath_or_buffer='NN Project Data - Signal.csv')

In [None]:
# Preview the first 20 rows of the dataset
nn_signal.head(n=20)

In [None]:
#Part A - Q1 B - Check for missing values and print percentage for each attribute.
# Share of null entries per column, scaled from a fraction to a percentage
nn_signal.isna().mean().mul(100)

In [None]:
# Note: No missing values in data

In [None]:
# Summary statistics, transposed so that each attribute is shown as one row
nn_signal.describe().transpose()

In [None]:
# The count for every parameter is 1599 records, which confirms that there are no missing values

In [None]:
#Part A - Q1 C - Check for presence of duplicate records in the dataset and impute with appropriate method.

# Boolean mask marking the rows that repeat an earlier row
duplicate_mask = nn_signal.duplicated()
nn_signal_duplicate = nn_signal.loc[duplicate_mask]

print("List of Duplicate Rows:")
# Show the duplicate rows found in the dataframe
nn_signal_duplicate

In [None]:
# 240 rows are duplicates

In [None]:
# Remove repeated rows, retaining the first occurrence of each
nn_signal = nn_signal.drop_duplicates(keep='first')

In [None]:
# Display the dataframe to inspect it after the duplicate rows were dropped
nn_signal

In [None]:
#Part A - Q1 D - Visualise distribution of the target variable.
# Bar chart of the number of records in each Signal_Strength class.
# The column is passed explicitly as x=: recent seaborn releases treat the first
# positional argument as `data`, so a bare Series is no longer read as the x variable.
sns.countplot(x=nn_signal['Signal_Strength'])

In [None]:
#Part A - Q1 E - Share insights from the initial data analysis (at least 2).
# There are 6 categories of signal to be predicted from label 3 to 8 
# Class 5 and Class 6 have the highest counts
# 240 duplicate rows were present in data that were removed.

In [None]:
#Part A - Q2 A - Split the data into X & Y.
# Features: every column except the target
X = nn_signal.drop(columns='Signal_Strength')
# Target: select the column instead of popping it, so nn_signal is left intact
# and this cell can be re-run without a KeyError on the drop above
y = nn_signal['Signal_Strength']
X.head()

In [None]:
# Preview the first five target values
y.head(n=5)

In [None]:
#Part A - Q2 B - Split the data into train & test with 70:30 proportion
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=11,
)

In [None]:
# Display the training targets produced by the split
y_train

In [None]:
# A - Q2 C - Print shape of all the 4 variables and verify if train and test data is in sync.
# Print the shape of each split, in the order X_train, X_test, y_train, y_test
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

In [None]:
# Number of training records per signal-strength class
y_train.value_counts()

In [None]:
# Number of test records per signal-strength class
y_test.value_counts()

In [None]:
# All classes are present in both the test and train datasets

In [None]:
#Part A - Q2 D - Normalise the train and test data with appropriate method.
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same
# transformation to both the training and the test features
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
#Part A - Q2 E - Transform Labels into format acceptable by Neural Network
# Show the first training label before encoding.
# y_train is a shuffled pandas Series that keeps the original row labels, so [0]
# would be a label lookup that can raise KeyError; .iloc[0] selects by position.
y_train.iloc[0]

In [None]:
# One-hot encode the integer labels into vectors of length 9 for the softmax output
to_one_hot = tensorflow.keras.utils.to_categorical
y_train = to_one_hot(y_train, num_classes=9)
y_test = to_one_hot(y_test, num_classes=9)

In [None]:
# Show the first training label again, now as a one-hot encoded vector
y_train[0]

In [None]:
#Part A - Q3 A - Design a Neural Network to train a classifier.

# Carve a 20% validation set out of the scaled, one-hot encoded training data
Xc_train, Xc_val, yc_train, yc_val = train_test_split(X_train, y_train, test_size=.20, random_state=11)

#Initialize Sequential model
model = tensorflow.keras.models.Sequential()

#HL 1
model.add(tensorflow.keras.layers.Dense(128,kernel_initializer='normal', activation='sigmoid'))
#HL 2
model.add(tensorflow.keras.layers.Dense(64,kernel_initializer='normal', activation='sigmoid'))
#HL 3
model.add(tensorflow.keras.layers.Dense(32,kernel_initializer='normal', activation='sigmoid'))
# No LeakyReLU is stacked on this layer: sigmoid outputs are never negative, so a
# leaky ReLU would pass them through unchanged. Leaving it out also avoids mixing
# a standalone `keras` layer into a `tensorflow.keras` model.
#Output: one softmax probability per one-hot column
model.add(tensorflow.keras.layers.Dense(9, kernel_initializer='normal',activation='softmax'))

#Compile the model
# Categorical cross-entropy is the loss that matches a softmax output trained on
# one-hot labels; mean absolute error is a regression loss and is a poor fit for
# multi-class classification.
model.compile(optimizer='sgd',loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
#Part A - Q3 B - Train the classifier using previously designed Architecture
# Train for a fixed number of epochs, tracking loss/accuracy on the validation split
EPOCH = 300
model_cal = model.fit(
    Xc_train,
    yc_train,
    validation_data=(Xc_val, yc_val),
    epochs=EPOCH,
    batch_size=30,
)

In [None]:
#Part A - Q3 C - Plot 2 separate visuals. 
#i. Training Loss and Validation Loss 
# Per-epoch training and validation loss recorded by fit()
loss_train = model_cal.history['loss']
loss_val = model_cal.history['val_loss']
# Build the x-axis from the number of recorded epochs instead of the global EPOCH,
# so the plot still works if EPOCH has been changed since this model was trained
epochs = range(1, len(loss_train) + 1)
plt.plot(epochs, loss_train, 'g', label='Training loss')
plt.plot(epochs, loss_val, 'b', label='validation loss')
plt.title('Training and Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
#ii. Training Accuracy and Validation Accuracy
# Per-epoch training and validation accuracy recorded by fit()
Acc_train = model_cal.history['accuracy']
Acc_val = model_cal.history['val_accuracy']
# Build the x-axis from the number of recorded epochs instead of the global EPOCH,
# so the plot still works if EPOCH has been changed since this model was trained
epochs = range(1, len(Acc_train) + 1)
plt.plot(epochs, Acc_train, 'g', label='Training accuracy')
plt.plot(epochs, Acc_val, 'b', label='validation accuracy')
plt.title('Training and Validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('accuracy')
plt.legend()
plt.show()

In [None]:
#Part A - Q3 D - Design new architecture/update existing architecture 
#in attempt to improve the performance of the model.

#Initialize Sequential model
model = tensorflow.keras.models.Sequential()
#HL1
model.add(tensorflow.keras.layers.Dense(128,kernel_initializer='normal', activation='relu'))
#HL2
model.add(tensorflow.keras.layers.Dense(64,kernel_initializer='normal', activation='relu'))
#HL3
model.add(tensorflow.keras.layers.Dense(32,kernel_initializer='normal', activation='relu'))
#HL4
model.add(tensorflow.keras.layers.Dense(16,kernel_initializer='normal', activation='relu'))
# No LeakyReLU layers are stacked on HL3/HL4: ReLU outputs are never negative, so a
# leaky ReLU would pass them through unchanged. Leaving them out also avoids mixing
# standalone `keras` layers into a `tensorflow.keras` model.
#OUTPUT layer: one softmax probability per one-hot column
model.add(tensorflow.keras.layers.Dense(9, kernel_initializer='normal',activation='softmax'))

In [None]:
#Compile the model
# Categorical cross-entropy is the loss that matches a softmax output trained on
# one-hot labels; mean absolute error is a regression loss and is a poor fit for
# multi-class classification.
model.compile(optimizer='adam',loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train the updated architecture, tracking loss/accuracy on the validation split
EPOCH = 300
model_cal = model.fit(
    Xc_train,
    yc_train,
    validation_data=(Xc_val, yc_val),
    epochs=EPOCH,
    batch_size=32,
)

In [None]:
#Part A - Q3 E - Plot visuals as in Q3.C and share insights about difference observed in both the models.
# Per-epoch training and validation loss recorded by fit()
loss_train = model_cal.history['loss']
loss_val = model_cal.history['val_loss']
# Build the x-axis from the number of recorded epochs instead of the global EPOCH,
# so the plot still works if EPOCH has been changed since this model was trained
epochs = range(1, len(loss_train) + 1)
plt.plot(epochs, loss_train, 'g', label='Training loss')
plt.plot(epochs, loss_val, 'b', label='validation loss')
plt.title('Training & Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Per-epoch training and validation accuracy recorded by fit()
Acc_train = model_cal.history['accuracy']
Acc_val = model_cal.history['val_accuracy']
# Build the x-axis from the number of recorded epochs instead of the global EPOCH,
# so the plot still works if EPOCH has been changed since this model was trained
epochs = range(1, len(Acc_train) + 1)
plt.plot(epochs, Acc_train, 'g', label='Training accuracy')
plt.plot(epochs, Acc_val, 'b', label='Validation accuracy')
plt.title('Training & Validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('accuracy')
plt.legend()
plt.show()

In [None]:
# Observations:
# The 2nd model performs better: it uses Adam as the optimiser instead of SGD (stochastic gradient descent) and ReLU as the activation function
# Accuracy of the model increased from 42% to 62% when ReLU was used as the activation function instead of sigmoid

# DOMAIN: Autonomous Vehicles
## • CONTEXT: Recognising multi-digit numbers in photographs captured at street level is an important component of modern-day map making. A classic example of a corpus of such street-level photographs is Google’s Street View imagery composed of hundreds of millions of geo-located 360-degree panoramic images. The ability to automatically transcribe an address number from a geo-located patch of pixels and associate the transcribed number with a known street address helps pinpoint, with a high degree of accuracy, the location of the building it represents. More broadly, recognising numbers in photographs is a problem of interest to the optical character recognition community. While OCR on constrained domains like document processing is well studied, arbitrary multi-character text recognition in photographs is still highly challenging. This difficulty arises due to the wide variability in the visual appearance of text in the wild on account of a large range of fonts, colours, styles, orientations, and character arrangements. The recognition problem is further complicated by environmental factors such as lighting, shadows, specularity, and occlusions as well as by image acquisition factors such as resolution, motion, and focus blurs. In this project, we will use the dataset with images centred around a single digit (many of the images do contain some distractors at the sides). Although we are taking a sample of the data which is simpler, it is more complex than MNIST because of the distractors.

## • DATA DESCRIPTION: The SVHN is a real-world image dataset for developing machine learning and object recognition algorithms with the minimal requirement on data formatting but comes from a significantly harder, unsolved, real-world problem (recognising digits and numbers in natural scene images). SVHN is obtained from house numbers in Google Street View images.

## The label for each image is the prominent number in that image, i.e. 2, 6, 7 and 4 respectively. The dataset has been provided in the form of h5py files. You can read about this file format here: https://docs.h5py.org/en/stable/
## Acknowledgement: Yuval Netzer, Tao Wang, Adam Coates, Alessandro Bissacco, Bo Wu, Andrew Y. Ng Reading Digits in Natural Images with Unsupervised Feature Learning NIPS Workshop on Deep Learning and Unsupervised Feature Learning 2011. PDF
## Dataset URL: http://ufldl.stanford.edu/housenumbers

In [None]:
import numpy as np 
import pandas as pd 
import json
import tensorflow as tf
import keras as kr
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.utils.np_utils import to_categorical
from keras.layers import Activation, Dense
from keras.layers import BatchNormalization, Dropout
from keras import optimizers
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten
import h5py

In [None]:
#Part B - Q1 A - Read the .h5 file and assign to a variable.
# Open the SVHN .h5 file in read-only mode
datah5py = h5py.File('Autonomous_Vehicles_SVHN_single_grey1.h5', mode='r')

In [None]:
#Part B - Q1 B - Print all the keys from the .h5 file.
# Display the top-level keys stored in the opened .h5 file
datah5py.keys()

In [None]:
#Part B - Q1 C - Split the data into X_train, X_test, Y_train, Y_test
# Look up the image datasets and the label datasets by key in the opened .h5 file
X_train, X_test, X_val = (datah5py[key] for key in ('X_train', 'X_test', 'X_val'))
y_train, y_test, y_val = (datah5py[key] for key in ('y_train', 'y_test', 'y_val'))

In [None]:
#Part B - Q2 A - Print shape of all the 4 data split into x, y, train, test to verify if x & y is in sync.

# Print the shape of every split next to its caption
for caption, dataset in (
    ("Training data   X_train :", X_train),
    ("Testing data    X_test  :", X_test),
    ("Validation data X_val   :", X_val),
    ("Training data   y_train :", y_train),
    ("Testing data    y_test  :", y_test),
    ("Validation data y_val :", y_val),
):
    print(caption, dataset.shape)

In [None]:
#Classes are balanced in each dataset

In [None]:
#Part B - Q2 B - Visualise first 10 images in train data and print its corresponding labels.
%matplotlib inline
import matplotlib.pyplot as plt
# Draw the first 10 training images side by side in a single row, without axes
fig, axes = plt.subplots(1, 10, figsize=(10, 1))
for position, axis in enumerate(axes):
    axis.imshow(X_train[position].reshape(32, 32), cmap='gray')
    axis.axis('off')
plt.show()
# Labels of the same 10 images
first_labels = y_train[0:10]
print('Image Labes: %s' % (first_labels,))

In [None]:
#Part B - Q2 C -Reshape all the images with appropriate shape update the data in same variable.
# Load each split into memory and flatten every image into one row of pixels.
# The row count is taken from the data itself and the pixel count is inferred (-1),
# so the cell does not depend on hard-coded dataset sizes.
X_val = np.asarray(X_val)
X_val = X_val.reshape(X_val.shape[0], -1)
X_train = np.asarray(X_train)
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = np.asarray(X_test)
X_test = X_test.reshape(X_test.shape[0], -1)

In [None]:
#Part B - Q2 D - Normalise the images i.e. Normalise the pixel values.
# Divide every pixel value by 255.0 in each of the three splits
PIXEL_SCALE = 255.0
X_train, X_test, X_val = (
    images / PIXEL_SCALE for images in (X_train, X_test, X_val)
)

In [None]:
#Part B - Q2 E - Transform Labels into format acceptable by Neural Network
# One-hot encode the digit labels into vectors of length 10.
# Uses the `tf` alias imported for this part of the notebook, so the cell does not
# rely on the bare `tensorflow` name that is only imported in Part A.
y_train = tf.keras.utils.to_categorical(y_train, num_classes=10)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=10)
y_val = tf.keras.utils.to_categorical(y_val, num_classes=10)

In [None]:
#Part B - Q2 F - Print total Number of classes in the Dataset.
print("Dataset shape:")
print(*(array.shape for array in (X_train, X_test, y_train, y_test)))

# The width of the one-hot encoded labels equals the number of classes
num_classes = y_test.shape[-1]
print("Classes in dataset:", num_classes)

In [None]:
#Part B - Q3 A - Design a Neural Network to train a classifier.
# Fully connected classifier: 1024 inputs -> 512 -> 128 -> 64 -> 32 -> 10 (softmax).
# All hidden layers use ReLU with he_uniform initialisation.
hidden_widths = (128, 64, 32)
model = Sequential(
    # Input layer and activation functions ReLU
    [Dense(512, activation="relu", kernel_initializer='he_uniform', input_shape=(1024,))]
    # HL 1 - HL 3
    + [Dense(width, activation="relu", kernel_initializer='he_uniform') for width in hidden_widths]
    # Output Layer
    + [Dense(10, kernel_initializer='he_normal', activation='softmax')]
)

#Compile the model
model.compile(
    optimizer="adam",
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
#Part B - Q3 B - Train the classifier using previously designed Architecture (Use best suitable parameters).
# Train for 100 epochs in batches of 300, evaluating on the validation split each epoch
result = model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=300,
    validation_data=(X_val, y_val),
    verbose=1,
)

In [None]:
#Part B - Q3 C - Evaluate performance of the model with appropriate metrics.
# evaluate() returns the loss followed by the compiled metric (accuracy)
scores = model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = scores
print("Loss:", test_loss)
print("Accuracy:", test_accuracy)

In [None]:
#Part B - Q3 D - Plot the training loss, validation loss vs number of epochs and training accuracy, 
#validation accuracy vs number of epochs plot and write your observations on the same.
# Per-epoch metrics recorded by fit()
history = result.history
accuracy, val_accuracy = history['accuracy'], history['val_accuracy']
loss, val_loss = history['loss'], history['val_loss']

# One x value per recorded epoch, starting at 0
epochs = range(len(accuracy))

# First figure: accuracy curves
plt.plot(epochs, accuracy, label='Training accuracy')
plt.plot(epochs, val_accuracy, label='Validation accuracy')
plt.legend(loc='lower right')
plt.title('Training & validation Accuracy')

# Second figure: loss curves
plt.figure()
plt.plot(epochs, loss, label='Training loss')
plt.plot(epochs, val_loss, label='Validation loss')
plt.title('Training & validation loss')
plt.legend(loc='upper right')

Observation: The model has

Training accuracy: 89%
Validation accuracy: 87%
Testing accuracy: 82%

This means that there is some overfitting, but not much.
The Adam optimiser produced good results. The ReLU activation also helped to achieve high accuracy.