In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install xlstm

Collecting xlstm
  Downloading xlstm-1.0.5-py3-none-any.whl.metadata (46 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/46.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.4/46.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xlstm-1.0.5-py3-none-any.whl (95 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/95.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.3/95.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlstm
Successfully installed xlstm-1.0.5


In [8]:
!apt-get install ninja-build

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  ninja-build
0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.
Need to get 111 kB of archives.
After this operation, 358 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 ninja-build amd64 1.10.1-1 [111 kB]
Fetched 111 kB in 0s (315 kB/s)
Selecting previously unselected package ninja-build.
(Reading database ... 123597 files and directories currently installed.)
Preparing to unpack .../ninja-build_1.10.1-1_amd64.deb ...
Unpacking ninja-build (1.10.1-1) ...
Setting up ninja-build (1.10.1-1) ...
Processing triggers for man-db (2.10.2-1) ...


## Imports

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from xlstm import (
    xLSTMBlockStack,
    xLSTMBlockStackConfig,
    mLSTMBlockConfig,
    mLSTMLayerConfig,
    sLSTMBlockConfig,
    sLSTMLayerConfig,
    FeedForwardConfig,
)

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Case 1

In [4]:
# Завантаження та попередня обробка даних
path_to_dataset = "/content/drive/MyDrive/diploma/datasets/HateSpeechDatasetBalanced.csv"
df = pd.read_csv(path_to_dataset)
df = df.sample(frac=0.05, random_state=42)  # Використання частини даних для швидкості
df = df[['Content', 'Label']]

def preprocess(text):
    return word_tokenize(text.lower())

df['Content'] = df['Content'].apply(preprocess)

In [5]:
# Розділення на тренувальну та тестову вибірки
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Побудова словника
all_words = [word for tweet in train_df['Content'] for word in tweet]
vocab = sorted(set(all_words))
word_to_idx = {word: idx+1 for idx, word in enumerate(vocab)}
vocab_size = len(vocab) + 1  # +1 для padding

# Кодування твітів
def encode_tweet(tweet):
    return [word_to_idx[word] for word in tweet if word in word_to_idx]

train_df['Content'] = train_df['Content'].apply(encode_tweet)
test_df['Content'] = test_df['Content'].apply(encode_tweet)

# Додавання padding
def pad_sequences(sequences, max_len):
    return np.array([seq + [0]*(max_len-len(seq)) if len(seq) < max_len else seq[:max_len] for seq in sequences])

MAX_LEN = 100
X_train = pad_sequences(train_df['Content'], MAX_LEN)
X_test = pad_sequences(test_df['Content'], MAX_LEN)

# Кодування міток
le = LabelEncoder()
y_train = le.fit_transform(train_df['Label'])
y_test = le.transform(test_df['Label'])

# Створення датасетів та DataLoader
class HateSpeechDataset(Dataset):
    def __init__(self, tweets, labels):
        self.tweets = tweets
        self.labels = labels

    def __len__(self):
        return len(self.tweets)

    def __getitem__(self, idx):
        return torch.tensor(self.tweets[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)

train_dataset = HateSpeechDataset(X_train, y_train)
test_dataset = HateSpeechDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

In [20]:
# Конфігурація xLSTM
cfg = xLSTMBlockStackConfig(
    mlstm_block=mLSTMBlockConfig(
        mlstm=mLSTMLayerConfig(
            conv1d_kernel_size=4, qkv_proj_blocksize=4, num_heads=4
        )
    ),
    slstm_block=sLSTMBlockConfig(
        slstm=sLSTMLayerConfig(
            backend="vanilla",
            num_heads=4,
            conv1d_kernel_size=4,
            bias_init="powerlaw_blockdependent",
        ),
        feedforward=FeedForwardConfig(proj_factor=1.3, act_fn="gelu"),
    ),
    context_length=MAX_LEN,
    num_blocks=7,
    embedding_dim=128,
    slstm_at=[1],
)

In [21]:
# Ініціалізація xLSTM стеку
xlstm_stack = xLSTMBlockStack(cfg).to("cuda")

In [22]:
# Модель для виявлення мови ворожнечі
class HateSpeechModel(nn.Module):
    def __init__(self, xlstm_stack, vocab_size, embedding_dim):
        super(HateSpeechModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.xlstm = xlstm_stack
        self.fc = nn.Linear(embedding_dim, 2)

    def forward(self, x):
        x = self.embedding(x)
        x = self.xlstm(x)
        x = self.fc(x.mean(dim=1))
        return x

In [23]:
# Ініціалізація моделі
model = HateSpeechModel(xlstm_stack, vocab_size, 128).to("cuda")
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [26]:
from tqdm import tqdm

# Навчання моделі з прогрес-баром
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        # Додаємо прогрес-бар для епохи
        for tweets, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
            tweets, labels = tweets.to("cuda"), labels.to("cuda")
            optimizer.zero_grad()
            outputs = model(tweets)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss:.4f}')

# Тестування моделі з прогрес-баром
def test_model(model, test_loader):
    model.eval()
    all_preds = []
    with torch.no_grad():
        # Додаємо прогрес-бар для тестування
        for tweets, labels in tqdm(test_loader, desc='Testing'):
            tweets, labels = tweets.to("cuda"), labels.to("cuda")
            outputs = model(tweets)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
    return all_preds

In [27]:
# Запуск навчання
train_model(model, train_loader, criterion, optimizer)

Epoch 1/10: 100%|██████████| 454/454 [01:59<00:00,  3.80it/s]


Epoch [1/10], Loss: 211.7664


Epoch 2/10: 100%|██████████| 454/454 [01:58<00:00,  3.84it/s]


Epoch [2/10], Loss: 147.6509


Epoch 3/10: 100%|██████████| 454/454 [01:57<00:00,  3.85it/s]


Epoch [3/10], Loss: 91.1941


Epoch 4/10: 100%|██████████| 454/454 [01:56<00:00,  3.90it/s]


Epoch [4/10], Loss: 55.3721


Epoch 5/10: 100%|██████████| 454/454 [01:57<00:00,  3.87it/s]


Epoch [5/10], Loss: 34.2970


Epoch 6/10: 100%|██████████| 454/454 [01:56<00:00,  3.90it/s]


Epoch [6/10], Loss: 26.9716


Epoch 7/10: 100%|██████████| 454/454 [01:56<00:00,  3.90it/s]


Epoch [7/10], Loss: 21.2874


Epoch 8/10: 100%|██████████| 454/454 [01:55<00:00,  3.94it/s]


Epoch [8/10], Loss: 14.4445


Epoch 9/10: 100%|██████████| 454/454 [01:52<00:00,  4.02it/s]


Epoch [9/10], Loss: 14.9162


Epoch 10/10: 100%|██████████| 454/454 [01:53<00:00,  3.99it/s]

Epoch [10/10], Loss: 16.0476





In [28]:
preds = test_model(model, test_loader)
accuracy = accuracy_score(y_test, preds)
print(f'Accuracy: {accuracy:.4f}')

Testing: 100%|██████████| 114/114 [00:08<00:00, 12.81it/s]

Accuracy: 0.7850





In [29]:
model_save_path = '/content/drive/MyDrive/diploma/hate_speech_model.pth'
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

Model saved to /content/drive/MyDrive/diploma/hate_speech_model.pth


In [None]:
# Імпорт моделі
# model = HateSpeechModel(xlstm_stack, vocab_size, 128).to("cuda")
# model.load_state_dict(torch.load(model_save_path))
# model.eval()
# print("Model loaded and ready to use")

In [11]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [12]:
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121
!pip install ninja

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu121
Collecting ninja
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl.metadata (5.3 kB)
Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ninja
Successfully installed ninja-1.11.1.1


In [13]:
import shutil
shutil.rmtree('/root/.cache/torch_extensions/py310_cu121')

In [16]:
!apt-get install --reinstall ninja-build

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
0 upgraded, 0 newly installed, 1 reinstalled, 0 to remove and 49 not upgraded.
Need to get 111 kB of archives.
After this operation, 0 B of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 ninja-build amd64 1.10.1-1 [111 kB]
Fetched 111 kB in 0s (224 kB/s)
(Reading database ... 123609 files and directories currently installed.)
Preparing to unpack .../ninja-build_1.10.1-1_amd64.deb ...
Unpacking ninja-build (1.10.1-1) over (1.10.1-1) ...
Setting up ninja-build (1.10.1-1) ...
Processing triggers for man-db (2.10.2-1) ...


In [17]:
!chmod -R 777 /root/.cache/torch_extensions/

## Case 2