In [3]:
import pandas as pd
import re
from sklearn.preprocessing import StandardScaler
from keras.models import Model
from keras.layers import Input, Dense
import numpy as np

# Reading log data from file
with open('/content/output.log', 'r') as file:
    log_data = file.read()

# Parsing log entries
pattern = re.compile(r'(\d+\.\d+\.\d+\.\d+) \[([^\]]+)\] \[([^\]]+)\] (\d+) (\d+) \[\[([^\]]+)\]\] (\d+)')
matches = pattern.findall(log_data.strip())

# Creating a DataFrame
columns = ['IP', 'Timestamp', 'Request', 'Status', 'Bytes', 'UserAgent', 'Duration']
log_df = pd.DataFrame(matches, columns=columns)

# Convert numerical columns to appropriate data types
log_df['Status'] = log_df['Status'].astype(int)
log_df['Bytes'] = log_df['Bytes'].astype(int)
log_df['Duration'] = log_df['Duration'].astype(int)

# Parsing the timestamp
log_df['Timestamp'] = pd.to_datetime(log_df['Timestamp'], format='%Y-%m-%dT%H:%M:%S.%f%z')

# Extracting HTTP method and URL from the Request field
log_df[['HTTPMethod', 'URL']] = log_df['Request'].str.extract(r'(\w+) (.+)')

# Drop the original Request column
log_df.drop(columns=['Request'], inplace=True)

# Extracting user agent categories (e.g., browser, bot)
log_df['UserAgentCategory'] = log_df['UserAgent'].apply(lambda x: 'Bot' if 'bot' in x.lower() else 'Browser')

# Selecting features for the autoencoder
features = ['Status', 'Bytes', 'Duration']
X = log_df[features].values

# Standardizing the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Autoencoder model
input_dim = X_scaled.shape[1]
encoding_dim = 2  # Number of features in the compressed representation

input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu')(input_layer)
decoded = Dense(input_dim, activation='sigmoid')(encoded)

autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# Training the autoencoder
autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=16, shuffle=True, validation_split=0.2)

# Get reconstruction error
reconstructions = autoencoder.predict(X_scaled)
mse = np.mean(np.power(X_scaled - reconstructions, 2), axis=1)

# Set a threshold for anomaly detection
threshold = np.percentile(mse, 95)

# Flag anomalies
log_df['Anomaly'] = mse > threshold

# Show anomalies
anomalies = log_df[log_df['Anomaly']]
print(anomalies)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
                   IP                 Timestamp  Status   Bytes  \
13     35.110.222.153 2021-05-12 05:06:00+04:30     200  332023   
78       4.138.31.112 2021-05-12 05:06:03+04:30     200  332023   
102    35.117.114.180 2021-05-12 05:06:03+04:30     200  768822   
104    226.152.63.215 2021-05-12 05:06:03+04:30     200  332023   
130    35.103.240.193 2021-05-12 05:06:04+04:30     200  332023   
...    