<a href="https://www.kaggle.com/code/constantinedivis/text-classification-with-embeddingbag-88-accur?scriptVersionId=142888161" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Data loading

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the competition tables from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/russian-social-media-text-classification/train.csv')
test = pd.read_csv('/kaggle/input/russian-social-media-text-classification/test.csv')
sample_submission = pd.read_csv('/kaggle/input/russian-social-media-text-classification/sample_submission.csv')
# Distinct values of the 'category' column of the training table.
CLASSES = list(train['category'].unique())
# Bare expression: displayed as the notebook cell output.
CLASSES

In [None]:
# Display the first rows of the training table (notebook cell output).
train.head()

## Drop duplicates

In [None]:
# Row count before de-duplication.
len(train)

In [None]:
# Number of distinct texts; compare with the row count above to spot duplicates.
len(train.text.unique())

In [None]:
# Remove rows whose 'text' repeats an earlier row; `train` is modified in place.
train.drop_duplicates(subset=['text'], inplace=True)

## Data splitting

In [None]:
# Shuffle once with a fixed seed, then cut the rows into 85% train,
# 10% validation and 5% hold-out test.
# Positional slicing with .iloc is used instead of np.split: np.split on a
# DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
shuffled = train.sample(frac=1, random_state=42)
train_end = int(.85 * len(train))
val_end = int(.95 * len(train))

df_train = shuffled.iloc[:train_end]
df_val = shuffled.iloc[train_end:val_end]
df_test = shuffled.iloc[val_end:]

# {row index: {column name: value}} mappings for each split.
dic_train = df_train.to_dict('index')
dic_val = df_val.to_dict('index')
dic_test = df_test.to_dict('index')


# Data preprocessing

In [None]:
import nltk
from nltk.tokenize import RegexpTokenizer

nltk.download('stopwords')
from nltk.corpus import stopwords

import string
import re

!pip install pymorphy2
import pymorphy2

In [None]:
# Tokenizer that extracts runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')
# Morphological analyser used to obtain lemmas.
morph = pymorphy2.MorphAnalyzer()

# NLTK Russian stop words, with 'это' added and 'я' taken out so that it is
# kept as a regular token.
stop = set(stopwords.words('russian'))
stop.add('это')
stop.remove('я')

exclude = set(string.punctuation)

# Tokens to discard. Kept as a set because it is only used for membership
# tests, once per token of every document; a list would be scanned linearly.
tok_for_del = stop | exclude

## Lemmatisation

In [None]:
def str_lemm(str):

    """
    Lemmatise one text.

    The text is lower-cased and tokenised with the module-level ``tokenizer``;
    tokens found in ``tok_for_del`` are dropped; leading and trailing digits
    are stripped from the remaining tokens; tokens shorter than two characters
    are discarded; each surviving token is replaced by the normal form of its
    first ``morph.parse`` result.

    Args:
        str: text to lemmatise. The name shadows the builtin; it is kept so
            that existing keyword callers keep working.

    Returns:
        list of lemmas, in token order.
    """

    tokens = tokenizer.tokenize(str.lower())

    kept = [tok for tok in tokens if tok not in tok_for_del]

    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    stripped = [re.sub(r'^\d*\n*|\n*\d*$', '', tok) for tok in kept]

    # The length filter also removes tokens emptied by the digit stripping,
    # so no separate empty-string check is needed.
    stripped = [tok for tok in stripped if len(tok) > 1]

    return [morph.parse(tok)[0].normal_form for tok in stripped]

In [None]:
def sample_lemm(data):
    """
    Lemmatise every record of a split.

    ``data`` maps a key to a record with 'category' and 'text' entries.
    The result maps the same key to ``(category, lemmas)``, where ``lemmas``
    is what ``str_lemm`` returns for the record's text.
    """
    return {
        key: (record['category'], str_lemm(record['text']))
        for key, record in data.items()
    }

In [None]:
# Lemmatise each split: {row index: (category, list of lemmas)}.
dic_train_texts_lemm = sample_lemm(dic_train)
dic_val_texts_lemm = sample_lemm(dic_val)
dic_test_texts_lemm = sample_lemm(dic_test)

In [None]:
def dic2lst(dct):
    """
    Split a mapping of pairs into two parallel lists.

    Every value of ``dct`` is indexed as ``value[0]`` (category) and
    ``value[1]`` (text). Returns ``(texts, categories)``, both in the
    iteration order of ``dct``.
    """
    pairs = [dct[key] for key in dct]
    texts = [pair[1] for pair in pairs]
    categories = [pair[0] for pair in pairs]
    return texts, categories

In [None]:
# For each split: list of lemma lists and the parallel list of category names.
train_texts_text, train_texts_cat = dic2lst(dic_train_texts_lemm)
val_texts_text, val_texts_cat = dic2lst(dic_val_texts_lemm)
test_texts_text, test_texts_cat = dic2lst(dic_test_texts_lemm)

# Classification with EmbeddingBag - PyTorch

In [None]:
import torch

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

from sklearn import preprocessing

from torch import nn

from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

!pip install torcheval
from torcheval.metrics.functional import multiclass_f1_score

import time

# Data preparation

In [None]:
# Re-join every lemma list into one space-separated string per document.
train_texts = list(map(' '.join, train_texts_text))
test_texts = list(map(' '.join, test_texts_text))
val_texts = list(map(' '.join, val_texts_text))

In [None]:
# label encoding
# The encoder is fitted on the training categories only.
le = preprocessing.LabelEncoder()
le.fit(train_texts_cat)
# Bare expression: shows the fitted classes as the cell output.
le.classes_

In [None]:
# Convert category names of every split to integer ids with the fitted encoder.
# NOTE(review): a category that occurs only in val/test would presumably make
# transform() fail, since the encoder never saw it — confirm.
train_lbl = le.transform(train_texts_cat)
test_lbl = le.transform(test_texts_cat)
val_lbl = le.transform(val_texts_cat)

In [None]:
def torch_data_prepar(X, y):

    """
    Pair labels with samples.

    Builds the complete list of ``(y[i], X[i])`` tuples for every index of
    ``X`` and returns an iterator over that list (label first, sample second).
    """
    pairs = [(y[pos], X[pos]) for pos in range(len(X))]
    return iter(pairs)

In [None]:
# (label, text) pairs of the training split.
train_iter = torch_data_prepar(X=train_texts, y=train_lbl)

# The vocabulary is built from the training texts only.
vocab = build_vocab_from_iterator(text.split() for text in train_texts)

words = set(vocab.get_itos())

# Drop every validation/test token that is not part of the vocabulary.
test_texts_ed = [[tok for tok in text.split() if tok in words] for text in test_texts]

val_texts_ed = [[tok for tok in text.split() if tok in words] for text in val_texts]

val_iter = torch_data_prepar(X=[" ".join(toks) for toks in val_texts_ed], y=val_lbl)
test_iter = torch_data_prepar(X=[" ".join(toks) for toks in test_texts_ed], y=test_lbl)


def text_pipeline(x):
    """Split ``x`` on whitespace and look the tokens up in ``vocab``."""
    return vocab(x.split())


def label_pipeline(x):
    """Convert a label to a plain ``int``."""
    return int(x)

In [None]:
# Number of entries in the vocabulary (cell output).
len(vocab.get_itos())

In [None]:
# Sanity check of the text pipeline on two sample words.
# NOTE(review): no default index is set on `vocab`, so a word missing from
# the training texts presumably raises here — confirm.
text_pipeline("хоккей просто")

# Model

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def collate_batch(batch):
    """
    Collate ``(label, text)`` pairs into the tensors the model consumes.

    Returns ``(labels, tokens, offsets)``, all moved to ``device``:
    ``labels`` holds one int64 label per sample, ``tokens`` is the
    concatenation of every sample's token ids, and ``offsets`` holds the
    start position of each sample inside ``tokens``.
    """
    labels = []
    chunks = []
    lengths = [0]
    for raw_label, raw_text in batch:
        labels.append(label_pipeline(raw_label))
        ids = torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        chunks.append(ids)
        lengths.append(ids.size(0))

    label_tensor = torch.tensor(labels, dtype=torch.int64)
    # Leading 0 plus all lengths except the last; the running sum gives the
    # start index of every sample in the flattened token tensor.
    offset_tensor = torch.tensor(lengths[:-1]).cumsum(dim=0)
    flat_tokens = torch.cat(chunks)
    return label_tensor.to(device), flat_tokens.to(device), offset_tensor.to(device)

In [None]:
class TextClassificationModel(nn.Module):
    """
    Bag-of-embeddings text classifier: an ``nn.EmbeddingBag`` followed by a
    single linear layer that produces one logit per class.

    Args:
        vocab_size: number of rows of the embedding table.
        embed_dim: embedding width, i.e. the input size of the linear layer.
        num_class: number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        # The only linear layer. An earlier embed_dim -> embed_dim Linear used
        # to be assigned to this same attribute first and was overwritten
        # before ever being used; that dead layer has been removed.
        self.fc = nn.Linear(embed_dim, num_class)
        # NOTE(review): forward() does not apply this layer. It is kept so the
        # attribute stays available; wiring it in would change training.
        self.dropout = nn.Dropout(0.5)
        self.init_weights()

    def init_weights(self):
        """Uniform(-0.5, 0.5) for embedding and linear weights, zero bias."""
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        """
        Args:
            text: token ids of all samples of the batch, concatenated.
            offsets: start index of each sample inside ``text``.

        Returns:
            The linear layer's output for the pooled embedding of each sample.
        """
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [None]:
def train(dataloader):
    """
    Run one training pass over ``dataloader``.

    Relies on the module-level ``model``, ``optimizer`` and ``criterion``,
    and on the module-level ``epoch`` for the progress message. Every
    ``log_interval`` batches the accuracy accumulated since the previous
    report is printed and the counters are reset.

    Args:
        dataloader: iterable yielding ``(label, text, offsets)`` batches.

    Returns:
        None.
    """
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        # Clip the gradient norm to 0.1 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0

In [None]:
def evaluate(dataloader):
    """
    Compute the accuracy of the module-level ``model`` over ``dataloader``.

    The model is switched to eval mode and gradients are disabled. Only the
    predictions are needed, so no loss is computed.

    Args:
        dataloader: iterable yielding ``(label, text, offsets)`` batches.

    Returns:
        Fraction of samples whose arg-max prediction equals the label.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no samples.
    """
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for label, text, offsets in dataloader:
            predicted_label = model(text, offsets)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [None]:
# Training hyper-parameters.
EPOCHS = 6
LR = 0.001
BATCH_SIZE = 32
emsize = 64

# Model dimensions: distinct training labels and vocabulary size.
num_class = len(set(train_lbl))
vocab_size = len(vocab)
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
# Each scheduler.step() call multiplies the learning rate by 0.1.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1.0, gamma=0.1)
# Validation accuracy tracked across epochs; None until the first epoch ends.
total_accu = None

In [None]:
# Convert the (label, text) iterators into map-style datasets.
train_dataset, test_dataset, val_dataset = (
    to_map_style_dataset(pairs) for pairs in (train_iter, test_iter, val_iter)
)

# All three loaders share the same batching, shuffling and collation options.
_loader_opts = dict(batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

train_dataloader = DataLoader(train_dataset, **_loader_opts)
valid_dataloader = DataLoader(val_dataset, **_loader_opts)
test_dataloader = DataLoader(test_dataset, **_loader_opts)

In [None]:
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)

    # Remember the validation accuracy unless it fell below the stored one;
    # in that case step the scheduler instead.
    if total_accu is None or not (total_accu > accu_val):
        total_accu = accu_val
    else:
        scheduler.step()

    separator = '-' * 59
    print(separator)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))

    print(separator)

In [None]:
# Accuracy on the hold-out split (df_test, carved out of train.csv above).
print('Checking the results of test dataset.')
accu_test = evaluate(test_dataloader)
# evaluate() returns a fraction, printed here with two decimals.
print('test accuracy {:5.2f}'.format(accu_test))

## $F1$

In [None]:
# Separate pass over the hold-out split to collect every prediction.
# Sample order does not affect the score, so shuffling is off, and regular
# batches replace the former one-sample-per-batch loader.
test_dataloader2 = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)

model.eval()

predicted_label_all = []
true_lbls = []

with torch.no_grad():

    for label, text, offsets in test_dataloader2:
        # Keep all labels of the batch; taking label[0] was only correct
        # while every batch held exactly one sample.
        true_lbls.append(label)
        predicted_label_all.append(model(text, offsets))

# One predicted class id / one true label per sample.
y_pred = torch.cat(predicted_label_all).argmax(1)
y_true = torch.cat(true_lbls)

# The class count comes from the training labels instead of a hard-coded 13.
# NOTE(review): the averaging mode is left at the library default, which is
# presumably 'micro' (equal to accuracy for single-label data) — confirm, and
# pass average='macro' explicitly if a per-class average is wanted.
f1_score = multiclass_f1_score(y_pred,
                               y_true,
                               num_classes=num_class)

print(f'F1 = {f1_score:.3f}')