In [None]:
import numpy as np
import random as rd
import time
import matplotlib.pyplot as plt

In [None]:
'''
Function to read the file and return the features and labels as numpy arrays
 Arguments:
  filename – name of the botnet dataset file
    12 columns: 11 features/dimensions (X) + 1 column with labels (Y)
       Y -- Train labels (0 if normal traffic, 1 if botnet) 
    m rows: number of examples (m)

 Returns:
  A numpy array containing the data of filename. 
  Each element of the array is a tuple (X,y).
  “X” is an array containing the 11 features (float number) of an example
  “y” is the 12th column of an example (integer 0/1)
'''
def readFile(filename):
    """Load a comma-separated dataset and pair each feature row with its label.

    Arguments:
      filename -- anything ``np.loadtxt`` accepts (path, file object, ...).
        Every row holds the feature columns followed by one label column.

    Returns:
      A numpy object array of shape (m, 2), one row per example:
      column 0 is the example's feature vector (a float array) and
      column 1 is its label converted to a Python int.
    """
    # ndmin=2 keeps a single-row file two-dimensional; without it loadtxt
    # returns a 1-D array and the column slicing below raises IndexError.
    data = np.loadtxt(filename, delimiter=',', ndmin=2)

    features = data[:, :-1]  # every column except the last one
    labels = data[:, -1]     # last column only

    # Fill a pre-shaped object array instead of letting np.array() guess the
    # nesting depth, so the (m, 2) layout never depends on the row contents.
    npArray_Xy = np.empty((len(labels), 2), dtype=object)
    for i, (row, label) in enumerate(zip(features, labels)):
        npArray_Xy[i, 0] = row
        npArray_Xy[i, 1] = int(label)

    return npArray_Xy

In [None]:
'''
Function to normalize the features in X
 Arguments:
  npArray_Xy is a numpy array containing data examples.
  Each element of the array is a tuple (X,y).
  “X” is an array containing the 11 features (float number) of an example
  “y” is the label of the example (integer 0/1)
  
 Returns:
  An array rescaled to N(0,1) in each column (mean=0, standard deviation=1)
'''
def normalize(npArray_Xy):
    """Standardize every feature column to mean 0 and standard deviation 1.

    Arguments:
      npArray_Xy -- numpy array of examples; column 0 holds each example's
        feature vector and column 1 its label.

    Returns:
      The same npArray_Xy object. The feature vectors are rescaled in place
      using the population standard deviation (divide by m, not m - 1).
      A column whose values are all identical has no spread to rescale, so
      it is set to 0 instead of being divided by zero (which would give NaN).
      An empty input is returned unchanged.
    """
    if len(npArray_Xy) == 0:
        return npArray_Xy

    feature_rows = npArray_Xy[:, 0]

    # Stack the per-example vectors into one (m, n) matrix so the column
    # statistics are computed by numpy rather than nested Python loops.
    matrix = np.array(list(feature_rows), dtype=float)
    means = matrix.mean(axis=0)
    stds = matrix.std(axis=0)

    # Detect constant columns from max == min: this is exact, whereas the
    # computed standard deviation may be a tiny non-zero rounding residue.
    constant = matrix.max(axis=0) == matrix.min(axis=0)
    stds[constant] = 1.0

    scaled = (matrix - means) / stds
    scaled[:, constant] = 0.0

    # Write back into the existing row objects so callers holding references
    # to them (or to npArray_Xy) observe the normalized values.
    for row, scaled_row in zip(feature_rows, scaled):
        row[:] = scaled_row

    return npArray_Xy

In [None]:
'''
Function to perform gradient descent and train the logistic regression model
 Arguments:
  npArray_Xy --- Numpy array containing data examples. Each element of the array is a tuple (X,y).
  “X” is an array containing the 11 features (float number) of an example
  “y” is the label of the example (integer 0/1)
  iterations -- number of iterations of the optimization loop
  learning_rate -- learning rate of the gradient descent
  lambda_reg – regularization rate
  
 Returns:
  A list or array containing the weights “w” and bias “b” at the end of the
  training process
'''
def train(npArray_Xy, iterations, learning_rate, lambda_reg):
    """Fit an L2-regularized logistic regression by batch gradient descent.

    Arguments:
      npArray_Xy -- numpy array of examples; column 0 holds each example's
        feature vector and column 1 its 0/1 label.
      iterations -- number of gradient-descent steps.
      learning_rate -- step size applied to every gradient.
      lambda_reg -- regularization rate (applied to the weights, not the bias).

    Returns:
      A list of n + 1 floats: the n feature weights followed by the bias.
      With iterations == 0 these are the random initial values.

    Raises:
      ValueError -- if npArray_Xy contains no examples.

    Side effects:
      Draws the cost-per-iteration curve on the current matplotlib axes
      (skipped when no iteration was run, since there is nothing to plot).
    """
    if len(npArray_Xy) == 0:
        raise ValueError("train() needs at least one example")

    # Dense (m, n) feature matrix and (m,) label vector for vectorized math.
    X = np.array(list(npArray_Xy[:, 0]), dtype=float)
    y = np.array(list(npArray_Xy[:, 1]), dtype=float)
    n_examples, n_features = X.shape

    # Weights first, bias last; one rd.uniform call per parameter, in order,
    # so a seeded run starts from the same point as before.
    params = np.array([rd.uniform(-1, 1) for _ in range(n_features + 1)])

    # Probabilities are clipped only for the cost, so log(0) cannot turn the
    # reported cost into inf/NaN; the gradient uses the raw predictions.
    tiny = 1e-15

    J_values = []
    for _ in range(iterations):
        weights, bias = params[:-1], params[-1]

        # Forward pass: sigmoid(x . w + b) for every example
        Y_pred = 1 / (1 + np.exp(-(X @ weights + bias)))

        # Cross-entropy cost plus the regularization term, kept for the plot.
        # The regularization is scaled by the number of features, matching
        # the gradient below.
        clipped = np.clip(Y_pred, tiny, 1 - tiny)
        J = -np.mean(y * np.log(clipped) + (1 - y) * np.log(1 - clipped))
        J += (lambda_reg / (2 * n_features)) * np.sum(weights ** 2)
        J_values.append(J)

        # Gradients of the cost with respect to the weights and the bias
        error = Y_pred - y
        dW = X.T @ error / n_examples + (lambda_reg / n_features) * weights
        db = error.mean()

        # Gradient-descent update
        params[:-1] -= learning_rate * dW
        params[-1] -= learning_rate * db

    if J_values:
        axis = plt.gca()
        plt.plot(range(iterations), J_values, color="darkcyan")
        plt.title("Evolution of cost function per iteration")
        plt.xlabel("iterations")
        plt.ylabel("cost")
        axis.xaxis.set_ticks(range(iterations))
        axis.yaxis.set_ticks(np.arange(0, J_values[0], 0.1))

    return params.tolist()

In [None]:
'''
Function to compute the accuracy of the logistic regression model
 Arguments:
  npArray_Xy -- Numpy array containing examples to be predicted
  ws -- weights & bias

 Returns:
  accuracy -- the number of predictions that are correct divided by the total
  number of examples in npArray_Xy.
  Predict function can be used for predicting a single example
'''
def accuracy(npArray_Xy, ws):
    """Return the fraction of examples whose predicted label is correct.

    Arguments:
      npArray_Xy -- numpy array of examples; column 0 holds each example's
        feature vector and column 1 its true label.
      ws -- weights followed by the bias, as returned by train().

    Returns:
      Number of correct predictions divided by the number of examples.
      An empty dataset yields 0.0 rather than raising (chosen convention).
    """
    total = len(npArray_Xy)
    if total == 0:
        return 0.0

    # Each example is classified independently with predict().
    correct = sum(
        1
        for features, label in zip(npArray_Xy[:, 0], npArray_Xy[:, 1])
        if predict(features, ws) == label
    )

    return correct / total

In [None]:
'''
Function to compute the Y_pred, the prediction of X
 Arguments:
  X – Example to be predicted
  ws -- weights & bias

 Returns:
  Y_pred – a value (0/1) corresponding to the prediction of X
'''
def predict(X, ws):
    """Classify a single example with the logistic regression model.

    Arguments:
      X -- feature vector of the example (sequence of numbers).
      ws -- the feature weights followed by the bias as the last element.

    Returns:
      1 if the model's probability for the example is at least 0.5,
      otherwise 0 (a Python int).
    """
    features = np.asarray(X, dtype=float)
    params = np.asarray(ws, dtype=float)

    # Linear score: x . w + b (the bias is the last entry of ws)
    logit = features @ params[:len(features)] + params[-1]

    # sigmoid(z) >= 0.5 exactly when z >= 0, so the threshold is applied to
    # the score itself; this avoids np.exp overflowing on very negative z.
    return int(logit >= 0)

In [None]:
# MAIN
# End-to-end run: load the dataset, standardize it, train the model and
# report the accuracy on the training data together with the elapsed time.

# Dataset to process ("botnet_reduced_l.csv" is the alternative input file)
path = "botnet_tot_syn_l.csv"

start_time = time.time()

# Read the examples, then rescale their feature columns
data = normalize(readFile(path))

# Gradient descent: 10 iterations, learning rate 1.5, regularization 0.1
ws = train(data, iterations=10, learning_rate=1.5, lambda_reg=0.1)

# Accuracy is measured on the same examples used for training
acc = accuracy(data, ws)
print("acc:", acc)

end_time = time.time()
print("Execution time: ", end_time - start_time)
