<a href="https://colab.research.google.com/github/crux007/crux007/blob/main/Bot_detection_using_ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
import re
import datetime as datetime
import ipaddress
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import silhouette_score, homogeneity_score, completeness_score, adjusted_rand_score

In [3]:
df = pd.read_csv("/content/drive/MyDrive/deforb.csv")

In [4]:
df.head()

Unnamed: 0,timestamp,c-ip,time-to-first-byte,sc-status,sc-bytes,cs-method,cs-protocol,cs-host,cs-uri-stem,cs-bytes-request,time-taken,cs-protocol-version,x-edge-response-result-type,ssl2protocol,ssl0cipher,x-edge-result-type,c-port,x-edge-detailed-result-page
0,1680273117,52.167.144.170,0.097,200,4940,0,1,0,/job/809046540-clinical-nurse-consultant-digit...,281,0.097,0,0,0,2,0,21251,0
1,1680273115,66.249.66.3,0.038,308,421,0,1,1,/weblog/,250,0.038,1,0,0,2,0,37691,0
2,1680273117,185.138.241.168,0.71,200,5109,0,1,1,/job/812455770-ft-patient-scheduling-represent...,492,0.71,1,0,0,2,0,56324,0
3,1680273118,52.167.144.170,0.082,200,5465,0,1,0,/job/810286523-registered-nurses-rual-and-remo...,243,0.082,0,0,0,2,0,21250,0
4,1680273119,50.62.147.101,0.003,200,405,1,1,1,/,146,0.003,0,1,0,2,1,63740,1


In [5]:
df['timestamp'] = pd.to_numeric(df['timestamp'], errors='coerce')
df.dropna(subset=['timestamp'], inplace=True)
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
df['time_of_day'] = df['timestamp'].dt.time
df['day_of_week'] = df['timestamp'].dt.day_name()
df = df.drop(['timestamp', 'cs-uri-stem'], axis=1)

In [6]:
def ip_to_int(c_ip):
    return int(ipaddress.IPv4Address(c_ip))

df['c-ip-int'] = df['c-ip'].apply(ip_to_int)
unique_ips = df['c-ip'].unique()

In [7]:
df['day_of_week'] = df['day_of_week'].replace({
    'Mon': 1,
    'Tue': 2,
    'Wed': 3,
    'Thu': 4,
    'Fri': 5,
    'Sat': 6,
    'Sun': 7
})

In [8]:
df['time_of_day'] = pd.to_datetime(df['time_of_day'], format='%H:%M:%S')
df['hour_int'] = df['time_of_day'].dt.hour
df['minute_int'] = df['time_of_day'].dt.minute
df['second_int'] = df['time_of_day'].dt.second

In [9]:
df.head()

Unnamed: 0,c-ip,time-to-first-byte,sc-status,sc-bytes,cs-method,cs-protocol,cs-host,cs-bytes-request,time-taken,cs-protocol-version,...,ssl0cipher,x-edge-result-type,c-port,x-edge-detailed-result-page,time_of_day,day_of_week,c-ip-int,hour_int,minute_int,second_int
0,52.167.144.170,0.097,200,4940,0,1,0,281,0.097,0,...,2,0,21251,0,1900-01-01 14:31:57,Friday,883396778,14,31,57
1,66.249.66.3,0.038,308,421,0,1,1,250,0.038,1,...,2,0,37691,0,1900-01-01 14:31:55,Friday,1123631619,14,31,55
2,185.138.241.168,0.71,200,5109,0,1,1,492,0.71,1,...,2,0,56324,0,1900-01-01 14:31:57,Friday,3112890792,14,31,57
3,52.167.144.170,0.082,200,5465,0,1,0,243,0.082,0,...,2,0,21250,0,1900-01-01 14:31:58,Friday,883396778,14,31,58
4,50.62.147.101,0.003,200,405,1,1,1,146,0.003,0,...,2,1,63740,1,1900-01-01 14:31:59,Friday,842961765,14,31,59


In [10]:
features = ['c-ip-int', 'time-to-first-byte', 'sc-status', 'sc-bytes', 'cs-method', 'cs-protocol',
            'cs-bytes-request', 'time-taken', 'cs-protocol-version', 'x-edge-response-result-type',
            'hour_int', 'minute_int', 'second_int', 'ssl0cipher', 'c-port', 'cs-host']
data = df[features]

In [12]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

In [13]:
n_clusters = 2  # Number of clusters
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(scaled_data)



In [14]:
# Local Outlier Factor (LOF)
lof = LocalOutlierFactor(n_neighbors=20, contamination='auto')
outlier_scores = lof.fit_predict(scaled_data)

In [15]:
# Identify bot users
bot_users = data[outlier_scores == -1]
non_bot_users = data[outlier_scores == 1]

In [16]:
# Print the results
print("Number of bot users:", bot_users.shape[0])
print("Number of non-bot users:", non_bot_users.shape[0])
print("Percentage of bot users:", bot_users.shape[0] / data.shape[0])

Number of bot users: 78
Number of non-bot users: 993
Percentage of bot users: 0.07282913165266107


In [17]:
df['label'] = 0  # Initialize label column with 0 for non-bot users
df.loc[outlier_scores == -1, 'label'] = 1  # Set label as 1 for bot users

In [18]:
kmeans_silhouette = silhouette_score(scaled_data, kmeans.labels_)
kmeans_homogeneity = homogeneity_score(df['label'], kmeans.labels_)
kmeans_completeness = completeness_score(df['label'], kmeans.labels_)
kmeans_ari = adjusted_rand_score(df['label'], kmeans.labels_)

In [19]:
print("K-means Clustering Metrics:")
print("Silhouette Score:", kmeans_silhouette)
print("Homogeneity Score:", kmeans_homogeneity)
print("Completeness Score:", kmeans_completeness)
print("Adjusted Rand Index (ARI):", kmeans_ari)

K-means Clustering Metrics:
Silhouette Score: 0.28357100236633803
Homogeneity Score: 0.012369269942627551
Completeness Score: 0.004770430933270134
Adjusted Rand Index (ARI): 0.014733256101959994


In [20]:
# Compute metrics for LOF
lof_silhouette = silhouette_score(scaled_data, -outlier_scores)
lof_homogeneity = homogeneity_score(df['label'], -outlier_scores)
lof_completeness = completeness_score(df['label'], -outlier_scores)
lof_ari = adjusted_rand_score(df['label'], -outlier_scores)

In [21]:
print("\nLOF Metrics:")
print("Silhouette Score:", lof_silhouette)
print("Homogeneity Score:", lof_homogeneity)
print("Completeness Score:", lof_completeness)
print("Adjusted Rand Index (ARI):", lof_ari)


LOF Metrics:
Silhouette Score: 0.2851248178051259
Homogeneity Score: 1.0
Completeness Score: 1.0
Adjusted Rand Index (ARI): 1.0


In [22]:
df['label_bot'] = 0  # Initialize 'label_bot' column with 0 for non-bot users
df.loc[outlier_scores == -1, 'label_bot'] = 1  # Set 'label_bot' as 1 for bot users

In [23]:
df.head()

Unnamed: 0,c-ip,time-to-first-byte,sc-status,sc-bytes,cs-method,cs-protocol,cs-host,cs-bytes-request,time-taken,cs-protocol-version,...,c-port,x-edge-detailed-result-page,time_of_day,day_of_week,c-ip-int,hour_int,minute_int,second_int,label,label_bot
0,52.167.144.170,0.097,200,4940,0,1,0,281,0.097,0,...,21251,0,1900-01-01 14:31:57,Friday,883396778,14,31,57,0,0
1,66.249.66.3,0.038,308,421,0,1,1,250,0.038,1,...,37691,0,1900-01-01 14:31:55,Friday,1123631619,14,31,55,0,0
2,185.138.241.168,0.71,200,5109,0,1,1,492,0.71,1,...,56324,0,1900-01-01 14:31:57,Friday,3112890792,14,31,57,1,1
3,52.167.144.170,0.082,200,5465,0,1,0,243,0.082,0,...,21250,0,1900-01-01 14:31:58,Friday,883396778,14,31,58,0,0
4,50.62.147.101,0.003,200,405,1,1,1,146,0.003,0,...,63740,1,1900-01-01 14:31:59,Friday,842961765,14,31,59,1,1


In [24]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

In [25]:
n_clusters = 2  # Number of clusters
n_models = 5  # Number of K-means models in the ensemble
kmeans_ensemble = []

In [26]:
for _ in range(n_models):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(scaled_data)
    kmeans_ensemble.append(kmeans)



In [27]:
n_neighbors = 20  # Number of neighbors
n_models = 5  # Number of LOF models in the ensemble
lof_ensemble = []

In [28]:
for _ in range(n_models):
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination='auto')
    outlier_scores = lof.fit_predict(scaled_data)
    lof_ensemble.append(lof)

In [42]:
kmeans_predictions = [model.predict(scaled_data) for model in kmeans_ensemble]
lof_predictions = [model.fit_predict(scaled_data) for model in lof_ensemble]

In [41]:
ensemble_predictions = []
for i in range(len(scaled_data)):
    kmeans_votes = sum([prediction[i] for prediction in kmeans_predictions])
    lof_votes = sum([prediction[i] for prediction in lof_predictions])
    if kmeans_votes >= len(kmeans_predictions) / 2 and lof_votes <= len(lof_predictions) / 2:
        ensemble_predictions.append(1)  # Bot user
    else:
        ensemble_predictions.append(0)  # Non-bot user

In [40]:
true_labels = df['label_bot'] 
accuracy = accuracy_score(true_labels, ensemble_predictions)
precision = precision_score(true_labels, ensemble_predictions)
recall = recall_score(true_labels, ensemble_predictions)
f1 = f1_score(true_labels, ensemble_predictions)

In [32]:
print("Ensemble Performance:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Ensemble Performance:
Accuracy: 0.9598506069094305
Precision: 1.0
Recall: 0.44871794871794873
F1-score: 0.6194690265486726


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [33]:
features = ['c-ip-int', 'time-to-first-byte', 'sc-status', 'sc-bytes', 'cs-method', 'cs-protocol',
            'cs-bytes-request', 'time-taken', 'cs-protocol-version', 'x-edge-response-result-type',
            'hour_int', 'minute_int', 'second_int', 'ssl0cipher', 'c-port', 'cs-host']
data = df[features]
target = df['label_bot']

In [34]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

In [35]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [36]:
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=X_train_scaled.shape[1]))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

@sam we can still play around these activation functions. tanh is another activation function that coulld be useful

In [37]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [38]:
model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_data=(X_test_scaled, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fb7a02e3190>

In [39]:
loss, accuracy = model.evaluate(X_test_scaled, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.934883713722229
