In [1]:
import numpy as np

In [2]:
def readFile(filename):
    """Load a headerless, comma-separated numeric file into a single 2-D array.

    Each non-blank line is split on commas. Every field except the last is
    parsed with ``float`` (the features) and the last field is parsed with
    ``int`` (the label). Blank lines are skipped.

    Args:
        filename: Path of the file to read.

    Returns:
        numpy.ndarray: One row per record, features first and the label in the
        last column. Stacking features and labels into one array stores the
        labels as floats.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a feature field is not a valid float or the last field
            is not a valid int literal.
    """
    features = []
    labels = []

    with open(filename, 'r') as csv_file:
        # Iterate the handle directly so the raw text is never held in memory
        # alongside the parsed values.
        for line in csv_file:
            line = line.strip()
            if not line:
                # A blank line (typically a trailing newline at end of file)
                # holds no record; parsing it would fail on int('').
                continue
            row = line.split(",")
            features.append([float(value) for value in row[:-1]])
            labels.append(int(row[-1]))

    return np.column_stack((np.array(features), np.array(labels)))


In [3]:
# Smoke test for readFile: load data/botnet_tot_syn_l.csv, then show the shape
# of the resulting array and its first five rows.
d = readFile('data/botnet_tot_syn_l.csv')
print(d.shape)
print("First 5 rows of data:\n", d[:5])

(1000000, 12)
First 5 rows of data:
 [[ 9.01278427e+00  1.67299998e+03  2.19999885e+01  9.99999745e-01
   6.19998877e+01  6.99998079e+01  1.30000002e+01  2.99999998e+00
   1.99000002e+02  2.46836957e+09  2.46837255e+09  1.00000000e+00]
 [ 3.59999909e+03  4.82065758e+04  1.33629998e+04  1.00000192e+00
   2.62999242e+02  8.29998876e+01  1.39999452e+01  5.00000003e+00
   2.16999996e+02  1.53904420e+09  2.46836839e+09  0.00000000e+00]
 [ 6.99938676e-04  9.47284469e-04  5.30002602e+01  2.00000042e+00
   6.46242903e+08  8.29998876e+01  1.30000000e+01  2.99999999e+00
   2.16999996e+02  2.46836954e+09  2.50325008e+09  0.00000000e+00]
 [ 3.59999909e+03  1.71877436e+03  6.33776887e+04  1.00000192e+00
   9.26682197e+05  5.43556891e+02  1.30000000e+01  5.00000003e+00
   1.86999994e+02  4.05749205e+08  1.12542449e+09  0.00000000e+00]
 [ 3.47137922e+03  4.80241859e+04  3.50889376e-04  5.07008976e+05
   2.62999242e+02  1.00012368e+03  1.30000000e+01 -7.71042474e-09
   7.99999924e+00  4.61400457e+08  

In [4]:
# Sanity check: total of the second column (index 1) of the loaded data.
# The result is deliberately not stored in a variable named `sum`: that would
# shadow the built-in for every later cell of the notebook.
column = d[:, 1]
column_total = np.sum(column)
print(column_total)

21282767194.873116


In [5]:
def normalize(data):
    """Standardize every feature column to zero mean and unit variance.

    All columns except the last one are z-scored with their own mean and
    population standard deviation (divisor ``n``). The last column is left
    untouched.

    A column whose standard deviation is zero (a constant feature) is only
    centered, so it becomes all zeros instead of NaN/inf from a division by
    zero.

    Args:
        data: 2-D ``numpy.ndarray`` with features in all columns but the last.

    Returns:
        The same array object that was passed in; the feature columns are
        overwritten in place.
    """
    n_features = data.shape[1] - 1
    if data.shape[0] == 0 or n_features <= 0:
        # No rows or no feature columns: there is nothing to standardize.
        return data

    features = data[:, :n_features]
    means = features.mean(axis=0)
    stds = features.std(axis=0)  # ddof=0, i.e. the population formula

    # Dividing by 1 for constant columns leaves them at (value - mean) == 0.
    safe_stds = np.where(stds == 0, 1.0, stds)

    # Slice assignment writes into the caller's array, keeping the in-place
    # contract of this function.
    data[:, :n_features] = (features - means) / safe_stds

    return data

In [6]:
# Standardize the feature columns. normalize() writes into the array it is
# given and returns it, so `d` holds the standardized features from here on.
d = normalize(d)
print("First 5 rows of data:\n", d[:5])

First 5 rows of data:
 [[-0.79240977 -0.81309371 -0.42245076 -0.46646975 -0.52239296 -0.35631957
   0.7370103   0.52834963  0.82717799  0.47316616  0.15895172  1.        ]
 [ 1.4437204   1.116361    0.39127564 -0.46646975 -0.5223921  -0.35631927
   0.92633194  1.48012075  1.02641833 -0.82869545  0.15894853  0.        ]
 [-0.79802166 -0.88246246 -0.42055992 -0.4664623   2.24026466 -0.35631927
   0.73701025  0.52834964  1.02641833  0.47316612  0.18574901  0.        ]
 [ 1.4437204  -0.81119573  3.44189202 -0.46646975 -0.5184317  -0.35630884
   0.73701025  1.48012075  0.69435097 -2.41629145 -0.87286932  0.        ]
 [ 1.36362799  1.10879844 -0.42379262  3.30972605 -0.5223921  -0.35629849
   0.73701025 -0.89930699 -1.28698408 -2.33833143 -1.73756142  0.        ]]


In [7]:
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow.

    The direct formula computes ``exp(-z)``, which overflows (with a
    RuntimeWarning) for large negative ``z``. This version only ever
    exponentiates a non-positive number.

    Args:
        z: A scalar or array-like of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an array shaped like ``z``,
        with every value in ``[0, 1]``.
    """
    z = np.asarray(z)
    # exp(-|z|) always lies in (0, 1], so it cannot overflow.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z).  z < 0: e^z / (1 + e^z), the same function
    # rewritten so that the exponent stays non-positive.
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a NumPy scalar and leaves
    # real arrays unchanged.
    return result[()]

def train(data, iterations, learning_rate, lambda_reg):
    """Fit L2-regularized logistic regression with batch gradient descent.

    Each iteration computes the cost at the current parameters, prints it, and
    then takes one gradient step over the whole data set. The L2 penalty is
    applied to the weights only, not to the bias.

    Args:
        data: 2-D ``numpy.ndarray``; all columns but the last are features and
            the last column is the label.
        iterations: Number of gradient-descent steps to run.
        learning_rate: Step size applied to the averaged gradient.
        lambda_reg: L2 regularization strength; ``0`` disables the penalty.

    Returns:
        numpy.ndarray: 1-D array holding the learned weights followed by the
        bias as its last element.

    Side effects:
        Seeds NumPy's global random generator with 0 and prints one cost line
        per iteration.
    """
    # NOTE: this reseeds the *global* NumPy RNG; it is kept so that the
    # initial weights (and therefore the results) stay reproducible.
    np.random.seed(0)

    X, y = data[:, :-1], data[:, -1]
    w = np.random.rand(X.shape[1])
    b = np.random.rand()
    m = len(y)

    for n in range(iterations):
        # One forward pass serves both the cost and the gradient.
        z = X @ w + b
        y_hat = sigmoid(z)

        # y*log(s(z)) + (1-y)*log(1-s(z)) equals y*z - log(1 + e^z). Computing
        # it from the logits keeps the cost finite even when y_hat rounds to
        # exactly 0 or 1, where log(y_hat) or log(1 - y_hat) would be -inf.
        log_likelihood = np.sum(y * z - np.logaddexp(0, z))
        cost = (-1 / m) * log_likelihood + (lambda_reg / (2 * m)) * np.sum(w ** 2)
        print(f"Iteration {n+1}/{iterations} - Cost: {cost}")

        # Gradient of the unregularized cost, summed over all rows.
        error = y_hat - y
        dw = X.T @ error
        db = np.sum(error)

        # Update weights and bias (the bias is not regularized).
        w -= learning_rate * (dw / m + (lambda_reg / m) * w)
        b -= learning_rate * db / m

    return np.append(w, b)


In [8]:
# Fit the model on `d`: 10 iterations, learning rate 1.5, and lambda_reg 0.
# Every regularization term in train() is multiplied by lambda_reg, so with 0
# the penalty vanishes.
ws = train(d, 10, 1.5, 0)

w = ws[:-1]  # All elements except the last one are the weights
b = ws[-1]   # The last element is the bias


Iteration 1/10 - Cost: 1.4998030671357732
Iteration 2/10 - Cost: 0.7452491273037543
Iteration 3/10 - Cost: 0.4463838653241005
Iteration 4/10 - Cost: 0.33598210709513854
Iteration 5/10 - Cost: 0.2858186861975213
Iteration 6/10 - Cost: 0.25817417377106544
Iteration 7/10 - Cost: 0.24076133573753333
Iteration 8/10 - Cost: 0.22876056610474765
Iteration 9/10 - Cost: 0.21995387544015582
Iteration 10/10 - Cost: 0.21318893422095375


In [9]:
def predict(w, b, x):
    """Classify one sample with the logistic model.

    Args:
        w: Sequence of feature weights.
        b: Bias term.
        x: Feature values of the sample, indexed like ``w``.

    Returns:
        int: ``1`` when ``sigmoid(b + sum(w[j] * x[j]))`` is strictly greater
        than 0.5, otherwise ``0``.
    """
    # Start from the bias and add one weighted feature at a time.
    activation = b
    for idx, weight in enumerate(w):
        activation += weight * x[idx]

    if sigmoid(activation) > 0.5:
        return 1
    return 0


In [10]:
def accuracy(w, b, data):
    """Fraction of rows in ``data`` that the model classifies correctly.

    Args:
        w: Feature weights passed through to ``predict``.
        b: Bias passed through to ``predict``.
        data: 2-D array; all columns but the last are features and the last
            column is the true label.

    Returns:
        float: Number of rows whose prediction equals the label, divided by
        the number of rows.
    """
    features = data[:, :-1]
    labels = data[:, -1]

    # Count one hit per row whose predicted class matches its label.
    hits = sum(
        1
        for row, label in zip(features, labels)
        if predict(w, b, row) == label
    )

    return hits / len(labels)

In [11]:
# Score the model on `d`, the same array it was trained on, so this is the
# training accuracy rather than an estimate on held-out data.
acc = accuracy(w, b, d)
print(f"Accuracy: {acc * 100:.2f}%")

Accuracy: 93.02%
