## Dauren Tursynbek 
## Machine Learning Learning

# Practical Task on Ensemble Learning

In [6]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import pandas as pd
from tensorflow.keras import  Model
from tensorflow.keras.layers import Conv2DTranspose,Conv2D, concatenate, Input,BatchNormalization

##### **Malware detection refers to the process of detecting the presence of malware on a host system or of distin-guishing whether a specific program is 'malicious' or 'benign'. In this task, you will use some network layer features such as Duration, Number of packets, etc. to build a machine learning classification model that will detect Android malware applications, using app features.**
**Read data in Python. Split your data into train and test sets (80% and 20% respectively).**

In [7]:
# importing dataset files
data = pd.read_csv('android_traffic.csv')
# defining X with values of first columns
X = data.iloc[:,:10]
# defining Y with values of last column
y = data[['type']].values
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

**Create the following three models:**
* RandomForestClassifier(max depth=15)
* BaggingClassifier(base estimator=DecisionTreeClassifier(max depth=15))
* AdaBoostClassifier(base estimator=DecisionTreeClassifier(max depth=15))

In [8]:
# creation of RandomForestClassifier
random_forest_classifier = RandomForestClassifier(max_depth = 15)
decision_tree_classifier = DecisionTreeClassifier(max_depth = 15)
# creation of BaggingClassifier
bagging_classifier = BaggingClassifier(base_estimator=decision_tree_classifier)
#creation of AdaBoostClassifier
ada_boost_classifier = AdaBoostClassifier(base_estimator=decision_tree_classifier)

**Tune the following hyper-parameters of the estimators in all ensemble models using grid search:**
* n estimators
* max features -> for the base estimators
* min impurity decrease -> for the base estimators

In [9]:
# tuning the parameters for RandomForestClassifier
param_grid = {'n_estimators': np.linspace(10, 200,num=15,dtype=int),
              'max_features': list(range(1, 5)),
              'min_impurity_decrease': [0.0001,0.00025,0.0005,0.001,0.0025,0.005,0.01]}
# creation of grid search and fit it
grid_search_clf = GridSearchCV(estimator = random_forest_classifier
                               , param_grid=param_grid, cv = 5, n_jobs = -1).fit(X_train,y_train)
# saving best parameters
random_forest_classifier_best = grid_search_clf.best_params_
print(random_forest_classifier_best)

  self.best_estimator_.fit(X, y, **fit_params)


{'max_features': 4, 'min_impurity_decrease': 0.0001, 'n_estimators': 159}


In [None]:
# tuning the parameters for BaggingClassifier
param_grid = {'n_estimators': np.linspace(10, 200,num=15,dtype=int),
              'base_estimator__max_features': list(range(1, 5)),
              'base_estimator__min_impurity_decrease': [0.0001,0.00025,0.0005,0.001,0.0025,0.005,0.01]}
# creation of grid search and fit it
grid_search_clf = GridSearchCV(estimator = bagging_classifier
                               , param_grid=param_grid, cv = 5, n_jobs = -1).fit(X_train,y_train)
# saving best parameters
bagging_classifier_best = grid_search_clf.best_params_
print(bagging_classifier_best)

In [None]:
# tuning the parameters for AdaBoostClassifier
param_grid = {'n_estimators': np.linspace(10, 200,num=15,dtype=int),
              'base_estimator__max_features': list(range(1, 5)),
              'base_estimator__min_impurity_decrease': [0.0001,0.00025,0.0005,0.001,0.0025,0.005,0.01]}
# creation of grid search and fit it
grid_search_clf = GridSearchCV(estimator = ada_boost_classifier
                               , param_grid=param_grid, cv = 5, n_jobs = -1).fit(X_train,y_train)
# saving best parameters
ada_boost_classifier_best = grid_search_clf.best_params_
print(ada_boost_classifier_best)

**Create the final models using the best values of the hyper-parameters and evaluate your models on the test set. Which model performed the best on the test set? Why do you think that is the case?**

In [None]:
#Random Forest model with best hyper-parameters
n_estimators_best = random_forest_classifier_best['n_estimators']
max_features_best = random_forest_classifier_best['max_features']
min_impurity_best = random_forest_classifier_best['min_impurity_decrease']
random_forest_best = RandomForestClassifier(n_estimators=n_estimators_best, max_depth=15
                                       ,max_features=max_features_best,min_impurity_decrease=min_impurity_best).fit(X_train,y_train)
y_pred = random_forest_best.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
#Bagging Classifier model with best hyper-parameters
n_estimators_best1 = bagging_classifier_best['n_estimators']
max_features_best1 = bagging_classifier_best['base_estimator__max_features']
min_impurity_best1 = bagging_classifier_best['base_estimator__min_impurity_decrease']
bagging_tree = DecisionTreeClassifier(max_depth=15, max_features=max_features_best1,min_impurity_decrease=min_impurity_best1)
bagging_best = BaggingClassifier(base_estimator=bagging_tree, n_estimators=n_estimators_best1).fit(X_train,y_train)
y_pred_bag = bagging_best.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_bag)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
#AdaBoost Classifier model with best hyper-parameters
n_estimators_best2 = ada_boost_classifier_best['n_estimators']
max_features_best2 = ada_boost_classifier_best['base_estimator__max_features']
min_impurity_best2 = ada_boost_classifier_best['base_estimator__min_impurity_decrease']
ada_tree = DecisionTreeClassifier(max_depth=15, max_features=max_features_best2,min_impurity_decrease=min_impurity_best2)
ada_boost_best = AdaBoostClassifier(base_estimator=ada_tree, n_estimators=n_estimators_best2).fit(X_train,y_train)
y_pred_ada = ada_boost_best.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_ada)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

According to the result of models we see that best preformance is made by AdaBoostClassifier.
It might be caused because of the most suitable depth for Ada Boost, comparing to other classifiers.
It is probable, that on other depths AdaBoost will perform worse. Also, I want to mention that
difference of accuracy between classifiers isn't so big, due to good hyper-parameters.

**Answer this question before doing the next part. If you fine-tuned the hyper-parameter max depth as
well, which of the three ensemble models would you expect to have deeper (larger max depth value)
base learners and which would have shallower base learners? Why do you think that would be the
case?**

Answer:
In my opinion, according to the results of previous task AdaBoost is going to have a shallower max_depth,
because it performed worse in comparison with other models on max_depth=15. On the other hand, probably
RandomForest and Bagging Classifiers are going to be more successeful on deeper max_depth values. The reason
for that is AdaBoost becoming more overfitted on higher max_depthes, as in previous tasks.

**Initialize the models with the best parameters you got from the third step. Fine tune max depth from
5 to 25. Draw 3 plots on the same graph. Put the max depth parameter on the horizontal axis and
the cross validation accuracy of your ensemble models on the vertical axis. Do the results agree with
your answer in the previous part?**

In [None]:
max_depth_range = list(range(5, 26))
cross_val_Random_forest = []
cross_val_Bagging = []
cross_val_AdaBoost = []
for i in max_depth_range:
    random_forest_classifier_new = RandomForestClassifier(n_estimators=n_estimators_best, max_depth=i
                                       ,max_features=max_features_best,min_impurity_decrease=min_impurity_best)
    random_graph = np.mean(cross_val_score(random_forest_classifier_new, X_train, y_train,
                             cv=5, scoring='accuracy'))
    cross_val_Random_forest.append(random_graph)
    bagging_tree_new = DecisionTreeClassifier(max_depth=i, max_features=max_features_best1,min_impurity_decrease=min_impurity_best1)
    bagging_classifier_new = BaggingClassifier(base_estimator=bagging_tree_new, n_estimators=n_estimators_best1)
    bagging_graph = np.mean(cross_val_score(bagging_classifier_new, X_train, y_train,
                             cv=5, scoring='accuracy'))
    cross_val_Bagging.append(bagging_graph)
    ada_tree_new = DecisionTreeClassifier(max_depth=i, max_features=max_features_best2,min_impurity_decrease=min_impurity_best2)
    ada_boost_classifier_new = AdaBoostClassifier(base_estimator=ada_tree_new, n_estimators=n_estimators_best2)
    ada_graph = np.mean(cross_val_score(ada_boost_classifier_new, X_train, y_train,
                             cv=5, scoring='accuracy'))
    cross_val_AdaBoost.append(ada_graph)

plt.xlabel('max_depth')
plt.ylabel('Cross-validation accuracy')
plt.plot(max_depth_range,cross_val_Random_forest,label='RandomForestClassifier')
plt.plot(max_depth_range,cross_val_Bagging,label='BaggingClassifier')
plt.plot(max_depth_range,cross_val_AdaBoost,label='AdaBoostClassifier')
plt.legend()
plt.show()

**According to results, RandomForest and Bagging Classifiers really neeeded deeper max_depth, however adaboost is working well on small max_depthes, and starts overfitting after**

# Practical Task on CNN

**Preprocess and visualize the dataset:**
##### Download dataset. Description of folders and naming are inside dataset folder in README.txt
##### Read all images and convert them to gray with (cv2.ctvColor())

In [None]:
import cv2

src = 'images/'
images = []
for x in range(1,104):
    for y in range(1,13):
        if(x<10 and y<10):
            gray_photo = cv2.imread(src+'00'+str(x)+'_0'+str(y)+'.png')
            images.append(cv2.cvtColor(gray_photo,cv2.COLOR_BGR2GRAY))
        elif(x<10 and y>=10):
            gray_photo = cv2.imread(src+'00'+str(x)+'_'+str(y)+'.png')
            images.append(cv2.cvtColor(gray_photo,cv2.COLOR_BGR2GRAY))
        elif(x>=10 and x<100 and y<10):
            gray_photo = cv2.imread(src+'0'+str(x)+'_0'+str(y)+'.png')
            images.append(cv2.cvtColor(gray_photo,cv2.COLOR_BGR2GRAY))
        elif(x>=10 and x<100 and y>=10):
            gray_photo = cv2.imread(src+'0'+str(x)+'_'+str(y)+'.png')
            images.append(cv2.cvtColor(gray_photo,cv2.COLOR_BGR2GRAY))
        elif(x>=100 and y<10):
            gray_photo = cv2.imread(src+str(x)+'_0'+str(y)+'.png')
            images.append(cv2.cvtColor(gray_photo,cv2.COLOR_BGR2GRAY))
        elif(x>=100 and y>=10):
            gray_photo = cv2.imread(src+str(x)+'_'+str(y)+'.png')
            images.append(cv2.cvtColor(gray_photo,cv2.COLOR_BGR2GRAY))

**Read annotation for images. It contains eye corners and eye centers of 2 eyes for each image.**

In [None]:
src = 'labels/image_labels.txt'
with open(src) as file_in:
    lines = []
    for line in file_in:
        lines.append(line.split())
print(lines[0])

**Visualize one image, draw eye corners and iris centers on it**

In [None]:
image = images[0].copy()
#direct converting from str to int is not possible because there are irrational numbers
x_left = (int(float(lines[0][1])),int(float(lines[0][2])))
y_left = (int(float(lines[0][5])),int(float(lines[0][6])))
x_right = (int(float(lines[0][7])),int(float(lines[0][8])))
y_right = (int(float(lines[0][11])),int(float(lines[0][12])))
x_irleft = int(float(lines[0][3]))
x_irright = int(float(lines[0][9]))
y_irleft = int(float(lines[0][4]))
y_irright = int(float(lines[0][10]))
image = cv2.rectangle(image, x_left,y_left, color = (250, 255, 219), thickness = 1)
image =cv2.rectangle(image, x_right,y_right, color = (250, 255, 219), thickness = 1)
image = cv2.circle(image, (x_irleft,y_irleft), radius=0, color=(250, 255, 219), thickness=-1)
image = cv2.circle(image, (x_irright,y_irright), radius=0, color=(250, 255, 219), thickness=-1)
plt.imshow(image, cmap = plt.cm.gray)
plt.show()

**Normalize images (divide by 255)**

In [None]:
for i in range(len(images)):
    images[i] = images[i]/255

In [None]:
#Check if normalization went well for image
plt.imshow(images[0], cmap = plt.cm.gray)
plt.show()

**Crop eye regions (and resize if needed) to be (48x48) image with the help of eye corners. Do that
for all images. It should look like Figure3(a)**

In [None]:
small_images = []
image_num = 0
i = 0
while i<len(lines):
    x1 = int(float(lines[i][1]))
    y1 = int(float(lines[i][2]))
    x3 = int(float(lines[i][5]))
    y3 = int(float(lines[i][6]))
    x4 = int(float(lines[i][7]))
    y4 = int(float(lines[i][8]))
    x6 = int(float(lines[i][11]))
    y6 = int(float(lines[i][12]))
    point1 = int((x1+x3)/2-24)
    point1_1 = int((x1+x3)/2+24)
    point2 = int((y1+y3)/2-24)
    point2_1 = int((y1+y3)/2+24)
    point3 = int((x4+x6)/2-24)
    point3_1 = int((x4+x6)/2+24)
    point4 = int((y4+y6)/2-24)
    point4_1 = int((y4+y6)/2+24)
    small_image_left = images[image_num][point2:point2_1,point1:point1_1]
    small_image_right = images[image_num][point4:point4_1,point3:point3_1]
    small_images.append(small_image_left.reshape(48,48,1))
    small_images.append(small_image_right.reshape(48,48,1))
    image_num+=1
    i+=1

In [None]:
#Check if cropping went well for image
plt.imshow(small_images[5],cmap = plt.cm.gray)
plt.show()

**Now data is ready to create a final dataset, which you will use for CNN training.Your labels (Y) are coordinates of eye center for each image in X (don't forget to convert iris
center from whole image coordinate system to coordinate system of eye region). You should make
one more step to cook Y set. For each eye center in Y you should create a (48x48) image (with
zero values) and assign value=1 to pixel which coordinate is an iris center. Do it for all images.
It should look like Figure3(b)**

In [None]:
small_images_of_iris = []
def det_iris(a,b,c,d,e):
    f=[]
    x_ir = abs(c-a)
    y_ir = (48-(abs(d-b)+abs(d-e)))/2
    f.append(x_ir)
    f.append(y_ir)
    return f
image_num_iris = 0
i = 0
while i<len(lines):
    x1 = int(float(lines[i][1]))
    y1 = int(float(lines[i][2]))
    x2 = int(float(lines[i][3]))
    y2 = int(float(lines[i][4]))
    x3 = int(float(lines[i][5]))
    y3 = int(float(lines[i][6]))
    x4 = int(float(lines[i][7]))
    y4 = int(float(lines[i][8]))
    x5 = int(float(lines[i][9]))
    y5 = int(float(lines[i][10]))
    x6 = int(float(lines[i][11]))
    y6 = int(float(lines[i][12]))
    point1 = int(abs(x2-x1))
    point1_1 = int((48-(abs(y2-y1)+abs(y3-y1)))/2)
    point2 = int(abs(x5-x4))
    point2_1 = int((48-(abs(y5-y4)+abs(y6-y4)))/2)
    small_image_iris_left = np.zeros((48,48),dtype=np.uint8)
    small_image_iris_right = np.zeros((48,48),dtype=np.uint8)
    small_image_iris_left[point1_1,point1] = 1
    small_image_iris_left[point1_1-1,point1] = 1
    small_image_iris_left[point1_1+1,point1] = 1
    small_image_iris_left[point1_1,point1-1] = 1
    small_image_iris_left[point1_1,point1+1] = 1
    small_image_iris_right[point2_1,point2] = 1
    small_image_iris_right[point2_1,point2-1] = 1
    small_image_iris_right[point2_1,point2+1] = 1
    small_image_iris_right[point2_1-1,point2] = 1
    small_image_iris_right[point2_1+1,point2] = 1
    small_images_of_iris.append(small_image_iris_left.reshape(48,48,1))
    small_images_of_iris.append(small_image_iris_right.reshape(48,48,1))
    i+=1

In [None]:
#Check if iris is ok
plt.imshow(small_images_of_iris[5],cmap = plt.cm.gray)
plt.show()

**Split dataset**

In [None]:
X2 = np.asarray(small_images,np.ndarray).reshape(len(small_images),48,48,1)
y2 = np.asarray(small_images_of_iris,np.ndarray).reshape(len(small_images_of_iris),48,48,1)
X2 = X2.astype(np.float32)
y2 = y2.astype(np.float32)
print(y2.shape)
print(X2.shape)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X2,y2,test_size=0.2, random_state=0)

**Build a CNN model using Keras. Layers that might be helpful: Conv2, Conv2DTranspose, concatenate.**

In [None]:
layer1 = Input((48,48,1))
layer2 = Conv2D(64, (3,3), strides=(1,1), padding='same', activation= 'relu')(layer1)
layer2 = BatchNormalization()(layer2)
layer3 = Conv2D(128, (3,3), strides=(2,2), padding='same', activation= 'relu')(layer2)
layer3 = BatchNormalization()(layer3)
layer4 = Conv2D(256, (3,3), strides=(2,2), padding='same', activation= 'relu')(layer3)
layer4 = BatchNormalization()(layer4)
layer4_1 = Conv2D(256, (3,3), strides=(1,1), padding='same', activation= 'relu')(layer4)
layer4_1 = Conv2DTranspose(128, (3,3), strides=(2,2), padding='same', activation= 'relu')(layer4_1)
layer4_1 = Conv2DTranspose(64, (3,3), strides=(2,2),padding='same', activation= 'relu')(layer4_1)
layer4_1 = Conv2DTranspose(1, (3,3), strides=(1,1), padding='same', activation= 'relu')(layer4_1)
layer5 = Conv2DTranspose(128, (3,3), strides=(4,4),padding='same', activation= 'relu')(layer4)
layer6 = concatenate(inputs=[layer2,layer5],axis=3)
layer6 = BatchNormalization()(layer6)
layer7 = Conv2DTranspose(64, (3,3),padding='same', activation= 'relu')(layer6)
layer7 = BatchNormalization()(layer7)
layer8 = Conv2DTranspose(1, (3,3),padding='same', activation= 'relu')(layer7)

model = Model(inputs = layer1, outputs = [layer8,layer4_1])
model.summary()
model.compile(optimizer="adam", loss = 'mse')


**Compile and train CNN with different optimizers [sgd, adam, adamax, rmsprop], loss functions [mse,
mae] and activations [tanh, relu, sigmoid]. Report best combination.**

In [None]:
model.fit(X_train1, [y_train1,X_train1], epochs =  15 ,validation_split=0.125)

**Make a prediction for 10 test images. Draw predicted centers on them and visualize it. (You can draw
iris center with cv2.circle())**

In [None]:
y_pred1 = model.predict(X_test1)
for i in range(10):
    dot = 0
    coor1 = 0
    coor2 = 0
    for j in range(len(y_pred1[0][i])):
        for f in range(0,48):
            if(y_pred1[0][i][j][f]>dot):
                dot = y_pred1[0][i][j][f]
                coor1 = j
                coor2 = f
    im_to_show = y_pred1[1][i].copy()
    im_to_show = cv2.circle(im_to_show,(coor2,coor1),1,(0,0,0))
    plt.imshow(im_to_show,cmap = plt.cm.gray)
    plt.show()

**Above you see that there are 15 epoches running in fit, but I made more epoches before, and retrained model by 15
every run, to know when model starts working well. So, don't think that there was only 15 epoches. Accuracy of irises
are not ideal, and reason for this can be - small pixel(3х3 cross) for iris in initial data**