In [1]:
import pandas as pd
from pymystem3 import Mystem
import re
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import imblearn as ib
import nltk
from nltk.corpus import stopwords
import torch
import transformers
from torch import nn
from torch.optim import Adam
import numpy as np
from tqdm.notebook import tqdm

In [2]:
RANDOM_STATE=42

In [3]:
df_valid = pd.read_csv('../data/valid.csv')
df_train = pd.read_csv('../data/train.csv')

In [4]:
df_train.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,target,text_ru,speech_fn
0,706,706,Watching Xela firefighters struggle to save bu...,1,"Наблюдая за тем, как пожарные Xela борются за ...",706.mp3
1,707,707,Gates not body bagging nobody???????? niggas i...,0,Гейтс никого не запихивает в мешки с трупами??...,707.mp3
2,272,272,Firefigthers Evacuate from Northampton Townshi...,1,Пожарные эвакуируются из-за пожара в доме в Но...,272.mp3
3,479,479,FAAN orders evacuation of abandoned aircraft a...,1,FAA отдает приказ об эвакуации брошенных самол...,479.mp3
4,781,781,If you're reading this go accidentally fall of...,0,"Если ты читаешь это, то можешь случайно упасть...",781.mp3


In [5]:
def preprocess(df: pd.DataFrame, col: str = 'text_ru') -> pd.DataFrame:
    df = df[[col, 'target']]
    df.columns = ['text', 'target']
    df.text = df.text.str.replace('\n', ' ')
    return df


df_train = preprocess(df_train)
df_valid = preprocess(df_valid)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.text = df.text.str.replace('\n', ' ')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.text = df.text.str.replace('\n', ' ')


In [6]:
m = Mystem()

re_arr = [
    re.compile(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'),
    re.compile(r'<.*?>'),
    re.compile(r'[%s]' % re.escape(string.punctuation)),
    re.compile(r'\s+'),
    re.compile(r'\[[0-9]*\]'),
    re.compile(r'[^\w\s]'),
    re.compile(r'\d'),
]

def simplify(s: str) -> str:
    x = s.lower().replace('\xa0', ' ').strip()
    for r in re_arr:
        x = r.sub(' ', x)
    return x

def lemmatize(text: str) -> str:
    lemmas = [x.strip() for x in m.lemmatize(text) if x.strip() != '']
    return ' '.join(lemmas)

def simplify_and_lemmatize(text: str) -> str:
    return lemmatize(simplify(text))

In [7]:
df_train['lemm'] = df_train.text.apply(simplify_and_lemmatize)
df_valid['lemm'] = df_valid.text.apply(simplify_and_lemmatize)

In [8]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english')).union(stopwords.words('russian'))
stop_words = list(stop_words)

[nltk_data] Downloading package stopwords to /Users/dima/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Count vectorizer

In [9]:
class CountModel():

    def fit(self, X, y):
        self.vect = CountVectorizer(stop_words=stop_words) 
        self.model = LogisticRegression(random_state=RANDOM_STATE, class_weight='balanced')
        self.pipe = ib.pipeline.Pipeline([
            ('vect', self.vect),       
            ('model', self.model)
        ])
        
        self.pipe.fit(X, y)
        
    def get_model(self):
        return self.vect, self.model
    
    def predict(self, X):
        return self.pipe.predict(X)

In [10]:
model = CountModel()
model.fit(df_train.lemm, df_train.target)

In [11]:
y_predicted = model.predict(df_valid.lemm)
accuracy_score(df_valid.target, y_predicted)

0.77

# TF-IDF Vectorizer

In [12]:
class TfidfModel():

    def fit(self, X, y):
        self.vect = TfidfVectorizer(stop_words=stop_words)
        self.model = LogisticRegression(random_state=RANDOM_STATE, class_weight='balanced')
        self.pipe = ib.pipeline.Pipeline([
            ('vect', self.vect),
            ('model', self.model)
        ])
        
        self.pipe.fit(X, y)
        
    def get_model(self):
        return self.vect, self.model
    
    def predict(self, X):
        return self.pipe.predict(X)

In [13]:
model = CountModel()
model.fit(df_train.lemm, df_train.target)

In [14]:
y_predicted = model.predict(df_valid.lemm)
accuracy_score(df_valid.target, y_predicted)

0.77

# Fine tuned BERT

In [15]:
bert_name = 'bert-base-multilingual-cased'

In [16]:
tokenizer = transformers.BertTokenizer.from_pretrained(bert_name)
bert = transformers.BertModel.from_pretrained(bert_name)

In [17]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self, X, y):

        self.labels = y
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in X]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [18]:
class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = bert
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, len(np.unique(df_train.target)))
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [19]:
#!g1.1
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")
if torch.backends.mps.is_available():
    device = torch.device("mps")
device

device(type='mps')

In [20]:
#!g1.1
def train(model, X_train, y_train, X_valid, y_valid, learning_rate, epochs):

    #X_train = X_train.head(100)
    #y_train = y_train[:100]
    #X_valid = X_valid.head(100)
    #y_valid = y_valid[:100]

    train, val = Dataset(X_train, y_train), Dataset(X_valid, y_valid)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    model = model.to(device)
    criterion = criterion.to(device)

    for epoch_num in range(epochs):
        
            model.train()

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():
                model.eval()

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(y_train): .3f} | Train Accuracy: {total_acc_train / len(y_train): .3f} | Val Loss: {total_loss_val / len(y_valid): .3f} | Val Accuracy: {total_acc_val / len(y_valid): .3f}')
                  

In [21]:
#!g1.1
def evaluate(model, X_test, y_test):

    #X_test = X_test.head(1000)
    #y_test = y_test[:1000]

    test = Dataset(X_test, y_test)

    test_dataloader = torch.utils.data.DataLoader(test, shuffle=False)

    model = model.to(device)
    model.eval()
    
    result = []

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in tqdm(test_dataloader):

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)

              result.append(output.argmax().cpu().detach().numpy().item())
    
    return result

In [22]:
#!g1.1
EPOCHS = 10
model = BertClassifier()
LR = 1e-6
              
train(model, df_train.text, df_train.target, df_valid.text, df_valid.target, LR, EPOCHS)

  0%|          | 0/300 [00:00<?, ?it/s]

Epochs: 1 | Train Loss:  0.344 | Train Accuracy:  0.527 | Val Loss:  0.335 | Val Accuracy:  0.565


  0%|          | 0/300 [00:00<?, ?it/s]

Epochs: 2 | Train Loss:  0.328 | Train Accuracy:  0.610 | Val Loss:  0.319 | Val Accuracy:  0.645


  0%|          | 0/300 [00:00<?, ?it/s]

Epochs: 3 | Train Loss:  0.306 | Train Accuracy:  0.693 | Val Loss:  0.291 | Val Accuracy:  0.695


  0%|          | 0/300 [00:00<?, ?it/s]

Epochs: 4 | Train Loss:  0.282 | Train Accuracy:  0.735 | Val Loss:  0.264 | Val Accuracy:  0.750


  0%|          | 0/300 [00:00<?, ?it/s]

Epochs: 5 | Train Loss:  0.245 | Train Accuracy:  0.802 | Val Loss:  0.249 | Val Accuracy:  0.775


  0%|          | 0/300 [00:00<?, ?it/s]

Epochs: 6 | Train Loss:  0.216 | Train Accuracy:  0.820 | Val Loss:  0.237 | Val Accuracy:  0.805


  0%|          | 0/300 [00:00<?, ?it/s]

Epochs: 7 | Train Loss:  0.193 | Train Accuracy:  0.860 | Val Loss:  0.222 | Val Accuracy:  0.805


  0%|          | 0/300 [00:00<?, ?it/s]

Epochs: 8 | Train Loss:  0.170 | Train Accuracy:  0.870 | Val Loss:  0.217 | Val Accuracy:  0.830


  0%|          | 0/300 [00:00<?, ?it/s]

Epochs: 9 | Train Loss:  0.147 | Train Accuracy:  0.902 | Val Loss:  0.223 | Val Accuracy:  0.825


  0%|          | 0/300 [00:00<?, ?it/s]

Epochs: 10 | Train Loss:  0.123 | Train Accuracy:  0.920 | Val Loss:  0.224 | Val Accuracy:  0.815


# Выводы
В отличии от реальных данных, в пробной тренировочной выборке дисбаланс классов не выражен. Для простоты мы использовали метрику Accuracy, хотя по дизайну должна быть Recall@Precision=99,5%

- 0.77 - CountVectorizer, TfIdfVectorizer
- **0.83** - Fine tuned BERT. Можно получить значительно лучше точность, но для большой модели нужно больше данных

Вцелом значимо увеличить качество можно за счет хорошо подготовленных данных для обучения, используемых в реальной задаче.

Для чистоты эксперимента, следовало бы проверить связку STT+NLP. Наше решение относится к проверке концепции, с чем успешно справилась