In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, classification_report
import os

In [5]:
human = os.listdir("./gokul")
bot = os.listdir("./other")
complete = pd.DataFrame(columns=['timestamp', 'x_position', 'y_position', 'button', 'click', 'key',
           'press', 'speed', 'moved'])

for cls, string in ((human, "./gokul/"), (bot, "./other/")):
    for i in cls:
        data = pd.read_csv(string + i)
        try:
            data = data.drop(["dx", "dy"], axis=1)
        except: pass
        data = data.fillna(-1)
        data = data.replace(True, 1)
        data = data.replace(False, 0)
        data["timestamp"] = pd.to_datetime(data["timestamp"])
        data["timestamp"] = data["timestamp"].diff().dt.total_seconds()
        data["speed"] = np.sqrt(data["x_position"].diff() ** 2 + data["y_position"].diff() ** 2) / data["timestamp"]
        data["moved"] = data["x_position"].diff() + data["y_position"].diff()
        data["label"] = (1 if string == "./other/" else 0)
        data = data.fillna(method="ffill").iloc[1:, :]
        data = data.fillna(0)
        data.reset_index(drop=True, inplace=True)
        complete = pd.concat([complete, data], axis=0, ignore_index=True)

  data = data.replace(True, 1)
  data = data.replace(False, 0)
  data = data.fillna(method="ffill").iloc[1:, :]
  complete = pd.concat([complete, data], axis=0, ignore_index=True)
  data = data.replace(True, 1)
  data = data.replace(False, 0)
  data = data.fillna(method="ffill").iloc[1:, :]
  data = data.replace(True, 1)
  data = data.replace(False, 0)
  data = data.fillna(method="ffill").iloc[1:, :]
  data = data.replace(True, 1)
  data = data.replace(False, 0)
  data = data.fillna(method="ffill").iloc[1:, :]
  data = data.replace(True, 1)
  data = data.replace(False, 0)
  data = data.fillna(method="ffill").iloc[1:, :]
  data = data.replace(True, 1)
  data = data.replace(False, 0)
  data = data.fillna(method="ffill").iloc[1:, :]
  data = data.replace(True, 1)
  data = data.replace(False, 0)
  data = data.fillna(method="ffill").iloc[1:, :]
  data = data.replace(True, 1)
  data = data.replace(False, 0)
  data = data.fillna(method="ffill").iloc[1:, :]
  data = data.replace(True, 1)
  dat

In [6]:
categories = [['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 
                  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 
                  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 
                  '.', ',', '?', '!', ':', ';', '"', "'", '-', '(', ')', '[', ']', '{', '}', 
                  '+', '*', '/', '=', '<', '>', "ae", "#", "_", "|"
                  ]]

complete["key"] = complete["key"].replace(-1, "ae")
encoder = OneHotEncoder(categories=categories, sparse_output=False)
encoded = encoder.fit_transform(complete[["key"]])
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(['key']))
df_encoded = pd.concat([complete.drop('key', axis=1), encoded_df], axis=1)
df_encoded = df_encoded.drop("key_ae", axis=1)
df_encoded["button"] = df_encoded["button"].astype(float)
df_encoded["click"] = df_encoded["click"].astype(float)
df_encoded["press"] = df_encoded["press"].astype(float)

In [7]:
df_encoded["button"].unique

<bound method Series.unique of 0       -1.0
1       -1.0
2       -1.0
3       -1.0
4       -1.0
        ... 
30295   -1.0
30296   -1.0
30297   -1.0
30298   -1.0
30299   -1.0
Name: button, Length: 30300, dtype: float64>

In [8]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_size=5, hidden_layer_size=50, output_size=2):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_layer_size, batch_first=True)
        self.fc = nn.Linear(hidden_layer_size, output_size)

    def forward(self, x):
        out, (h_n, c_n) = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out

In [9]:
def create_sequences(data, seq_length, overlap):
    sequences = []
    result = []
    count = 0
    
    for i in range(0, len(data) - seq_length + 1, seq_length - overlap):
        count += 1
        sequence = df_encoded.iloc[i:i + seq_length]
        label = sequence["label"].max()
        sequence = sequence.drop("label", axis=1).values
        sequences.append(sequence)
        result.append(label)

    return (np.array(sequences), np.array(result))


sequence_length = 50
sequences, labels = create_sequences(df_encoded, sequence_length, 30)

In [10]:
x_train, x_test, y_train, y_test = train_test_split(sequences, labels, test_size=0.2, random_state=94)

In [11]:
x_train[np.isnan(x_train)] = 0
x_train[np.isinf(x_train)] = 0

In [12]:
model = LSTMClassifier(input_size=94, hidden_layer_size=188, output_size=2)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 20
for epoch in range(epochs):
    model.train()
    
    optimizer.zero_grad()
    y_pred = model(torch.FloatTensor(x_train))
    loss = loss_function(y_pred, torch.LongTensor(y_train))
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    
    print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}')

Epoch 1, Loss: 0.6520
Epoch 2, Loss: 0.5730
Epoch 3, Loss: 0.5111
Epoch 4, Loss: 0.4655
Epoch 5, Loss: 0.4314
Epoch 6, Loss: 0.4079
Epoch 7, Loss: 0.3913
Epoch 8, Loss: 0.3796
Epoch 9, Loss: 0.3715
Epoch 10, Loss: 0.3659
Epoch 11, Loss: 0.3626
Epoch 12, Loss: 0.3606
Epoch 13, Loss: 0.3602
Epoch 14, Loss: 0.3602
Epoch 15, Loss: 0.3587
Epoch 16, Loss: 0.3564
Epoch 17, Loss: 0.3538
Epoch 18, Loss: 0.3513
Epoch 19, Loss: 0.3488
Epoch 20, Loss: 0.3469


In [13]:
model.eval()
with torch.no_grad():
    y_test_pred = model(torch.FloatTensor(x_test))
    y_test_pred_class = torch.argmax(y_test_pred, dim=1)

accuracy = accuracy_score(y_test, y_test_pred_class.numpy())
print(f'Test Accuracy: {accuracy * 100:.2f}%')
print(classification_report(y_test, y_test_pred_class.numpy()))

Test Accuracy: 88.45%
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        35
         1.0       0.88      1.00      0.94       268

    accuracy                           0.88       303
   macro avg       0.44      0.50      0.47       303
weighted avg       0.78      0.88      0.83       303



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
torch.save(model.state_dict(), 'model_test.pth')