In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import transformers
from transformers import AutoModel, BertTokenizerFast

In [2]:
import random
# Seed every random number generator the notebook has imported (PyTorch, NumPy and
# Python's `random`), not just `random`, so stochastic steps repeat across runs.
SEED = 2020
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [3]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Define Model

In [7]:
# Load the pretrained encoder from the
# 'DeepPavlov/rubert-base-cased-conversational' checkpoint.
bert = AutoModel.from_pretrained(
    'DeepPavlov/rubert-base-cased-conversational'
)

In [5]:
# Tokenizer loaded from the same checkpoint name as the encoder, so the
# vocabulary matches the model's embeddings.
tokenizer = BertTokenizerFast.from_pretrained(
    'DeepPavlov/rubert-base-cased-conversational'
)

In [8]:
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler

In [9]:
# Turn off gradient tracking for every encoder parameter, so the pretrained
# BERT weights stay fixed.
for param in bert.parameters(): param.requires_grad = False

In [10]:
class BERT_Arch(nn.Module):
    """Two-class classifier head on top of a BERT encoder.

    The second element of the encoder output is passed through
    Linear -> ReLU -> Dropout(0.2) -> Linear(512, 2) -> LogSoftmax, so the
    module returns log-probabilities of shape (batch, 2).

    Args:
        bert: encoder module. ``bert(sent_id, attention_mask=mask)`` must return
            something indexable by integer whose element 1 is the per-sequence
            representation (the pooled output for Hugging Face BERT models).
    """

    def __init__(self, bert):
        super().__init__()

        self.bert = bert

        # Input width of the head: taken from the encoder's config when it has
        # one, otherwise the 768 the original code hard-coded.
        hidden_size = getattr(getattr(bert, 'config', None), 'hidden_size', 768)

        # dropout layer
        self.dropout = nn.Dropout(0.2)

        # relu activation
        self.relu = nn.ReLU()

        # dense layer 1
        self.fc1 = nn.Linear(hidden_size, 512)

        # dense layer 2 (output, one unit per class)
        self.fc2 = nn.Linear(512, 2)

        # log-softmax over the class dimension (the attribute name is kept for
        # compatibility with existing references)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return class log-probabilities for a batch.

        Args:
            sent_id: token-id tensor passed to the encoder as its first argument.
            mask: attention mask passed to the encoder as ``attention_mask``.

        Returns:
            Tensor of shape (batch, 2) holding log-probabilities.
        """
        encoder_out = self.bert(sent_id, attention_mask=mask)

        # Index instead of tuple-unpacking: unpacking a dict-style model output
        # iterates over its *keys* and would bind a string here. Integer
        # indexing works for both plain tuples and such output objects.
        cls_hs = encoder_out[1]

        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)

        # output layer
        x = self.fc2(x)

        # log-softmax over classes
        return self.softmax(x)

In [11]:
# Wrap the pretrained encoder in the classifier head and move the whole
# module onto `device` in one step.
model = BERT_Arch(bert).to(device)

### Preprocess test data

In [12]:
import re
import nltk
from nltk.corpus import stopwords

In [13]:
# Russian stop-word list from the NLTK corpus; normalize_str drops these tokens.
stop_words = stopwords.words('russian')

# NLTK word/punctuation tokenizer instance used by normalize_str.
wpt = nltk.WordPunctTokenizer()

In [14]:
# Phone number written in digits: a leading 8 or 7 followed by one of three
# digit-group layouts (separators between groups are optional).
phone_regexp = (
    r'([87]('
    r'\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}'
    r'|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}'
    r'|\d{3}[-\.\s]??\d{4}'
    r'))'
)

# Optional single character out of "8", "+", "7", then exactly ten digits.
# NOTE(review): not referenced anywhere else in the visible notebook.
phone_regexp_simple = r'[8\+7]?\d{10,10}'

# Phone prefixes spelled out in words (including a common misspelling).
phone_regexp_text = '|'.join((
    'восемь девятьсот',
    'семь девятьсот',
    'восемь девятсот',
    'семь девятсот',
    'плюс семь',
))

# Mentions of social networks / messengers; several alternatives end with a
# space on purpose so they only match as a separate short word.
social_regexp = '|'.join((
    'вк ',
    r'vkcomid\d{1,8}',
    'вконтакте',
    'tg ',
    'telegram',
    'телеграм',
    'в телегу',
    'телега',
    'тг ',
    'discord',
    'дискорд',
    'vkcom',
    'okru',
    'whatsapp',
    'вотсап',
    'ватсап',
    'вайбер',
    'viber',
    'whats app',
))

In [15]:
def normalize_str(doc):
    """Clean one advert text and append contact-detection marker tokens.

    Steps: replace punctuation and other special characters with spaces,
    lower-case, detect phone numbers and social-network mentions, strip
    digits and Latin characters, drop single-character words and Russian
    stop words.

    Args:
        doc: raw text (str).

    Returns:
        The filtered tokens joined by single spaces, followed by
        ``' ' + phone_trigger + ' ' + social_trigger`` where each trigger is
        either its marker ('PHONE_TRIGGER' / 'SOCIAL_TRIGGER') or ''.

    Notes:
        The previous version passed ``re.I|re.A`` (and ``re.T|re.A``) as the
        fourth positional argument of ``re.sub``. That argument is ``count``,
        so the flags were never applied and every substitution silently
        stopped after 258 (or 257) matches; ``re.T`` is also missing on
        Python 3.13+. The calls below keep the matching semantics that were
        actually in effect (no flags) and only remove the accidental cap.
        NOTE(review): if the saved weights were trained on text produced by
        the capped version, re-check the metrics after this change.
    """
    # Punctuation becomes a space so neighbouring words stay separated.
    doc = re.sub(r'[:!.,\-]', ' ', doc)
    # Map both cases of "ё" to "е" *before* the next step: "Ё" lies outside
    # the А-я range and would otherwise be deleted from the word.
    doc = re.sub(r'[ёЁ]', 'е', doc)
    doc = re.sub(r'[^a-zA-ZА-я\s\d]', ' ', doc)
    doc = doc.lower()
    doc = doc.strip()

    # Phone detection: drop all letters and whitespace so digit groups that
    # were separated by spaces or words become one contiguous string.
    text_numbers = re.sub(r'[a-zA-ZА-я\s]', '', doc)
    contains_phone_number = re.search(phone_regexp, text_numbers)
    contains_phone_as_text = re.search(phone_regexp_text, doc)
    phone_trigger = 'PHONE_TRIGGER' if contains_phone_number or contains_phone_as_text else ''

    # Social-network / messenger mentions.
    contains_social_links = re.search(social_regexp, doc)
    social_link_trigger = 'SOCIAL_TRIGGER' if contains_social_links else ''

    # Keep only letters and whitespace.
    doc = re.sub(r'[^А-яa-zA-Z\s]', '', doc)
    # NOTE(review): because matches of 1-3 characters repeat back to back,
    # this removes *every* Latin run, not just short ones; the pattern is kept
    # unchanged so the model input stays the same.
    doc = re.sub(r'[a-zA-z\d]{1,3}', '', doc)
    # Drop single-character words surrounded by spaces.
    doc = re.sub(r' \w ', ' ', doc)

    # tokenize document
    tokens = wpt.tokenize(doc)

    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # re-create document from filtered tokens and append the trigger markers
    doc = ' '.join(filtered_tokens) + ' ' + phone_trigger + ' ' + social_link_trigger
    return doc

In [21]:
# prepare test data
test_df = pd.read_csv('val.csv')

# Join title and description into one text field. Missing values are replaced
# with '' first: NaN + str stays NaN, and normalize_str (a regex pipeline)
# raises on a non-string value.
test_df['text_data'] = test_df['title'].fillna('') + ' ' + test_df['description'].fillna('')
test_df = test_df.drop(labels=['title', 'description'], axis=1)

test_df['text_data'] = test_df['text_data'].apply(normalize_str)

In [22]:
# tokenize and encode sequences in the test set
# Every sequence is truncated / padded to exactly 100 tokens.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`
# and produces the same fixed-length output.
tokens_test = tokenizer.batch_encode_plus(test_df['text_data'].tolist(),
                                         max_length = 100,
                                         padding='max_length',
                                         truncation=True)

In [23]:
# Turn the tokenizer output and the labels into tensors for the DataLoader.
test_seq, test_mask = (
    torch.tensor(tokens_test[field]) for field in ('input_ids', 'attention_mask')
)
test_y = torch.tensor(test_df['is_bad'].tolist())

In [24]:
# Number of examples per batch; passed to the test DataLoader built below.
batch_size = 32

In [25]:
# Bundle token ids, attention masks and labels so they are indexed together.
test_data = TensorDataset(test_seq, test_mask, test_y)

# Sequential sampling keeps batches in dataset order, so the concatenated
# predictions line up row by row with test_df.
test_sampler = SequentialSampler(test_data)

# Batched iterator over the evaluation set.
test_dataloader = DataLoader(test_data, batch_size=batch_size, sampler=test_sampler)

In [18]:
# load weights of best model
path = 'saved_weights_v1.2.pt'
# map_location remaps the stored tensors onto `device`; without it a checkpoint
# saved on a GPU cannot be loaded on a machine that has no CUDA.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

In [26]:
def predict(dataloader=None, net=None):
    """Run a model over a dataloader and return its stacked outputs.

    Args:
        dataloader: iterable of batches whose first two items are the token-id
            tensor and the attention-mask tensor (further items, e.g. labels,
            are ignored). Defaults to the notebook-level ``test_dataloader``.
        net: module called as ``net(sent_id, mask)``. Defaults to the
            notebook-level ``model``.

    Returns:
        NumPy array with the per-batch outputs concatenated along axis 0, in
        dataloader order.

    Raises:
        ValueError: from ``np.concatenate`` when the dataloader yields no batches.
    """
    if dataloader is None:
        dataloader = test_dataloader
    if net is None:
        net = model

    # Switch to evaluation mode so dropout is disabled during inference.
    net.eval()

    batch_outputs = []

    # No gradients are needed for inference; one context covers the whole loop.
    with torch.no_grad():
        for batch in dataloader:
            # Only ids and mask are moved to the device; labels are not used here.
            sent_id, mask = (t.to(device) for t in batch[:2])
            batch_outputs.append(net(sent_id, mask).cpu().numpy())

    return np.concatenate(batch_outputs, axis=0)

In [27]:
# Model outputs for the test set, as returned by predict().
predictions = predict()

In [28]:
# Sanity check: length of the first output column.
predictions[:, 0].shape

(16237,)

In [32]:
# ROC-AUC over the whole test set, scored with the second output column
# (index 1).
roc_auc_score(test_df['is_bad'], predictions[:, 1])

0.8541475033497887

In [33]:
# Hard class labels: index of the larger of the two outputs in each row.
preds = predictions.argmax(axis=1)

In [34]:
# Sanity check: one predicted label per test row.
preds.shape

(16237,)

### Check metrics

In [35]:
from sklearn.metrics import accuracy_score

In [36]:
# Share of test rows whose predicted label equals `is_bad`.
accuracy_score(y_true=test_df['is_bad'], y_pred=preds)

0.7997166964340703

### Check within categories

In [37]:
from sklearn.metrics import roc_auc_score

In [38]:
# (rows, columns) of the prepared test frame.
test_df.shape

(16237, 8)

In [39]:
# Distinct category values, in order of first appearance in test_df.
categories = test_df.loc[:, 'category'].unique()

In [40]:
def test_wrt_categories():
    
    roc_auc_scores = []
    
    for category in categories:
        current_category_entries = test_df[test_df['category'] == category]
        cur_data = current_category_entries['text_data']
        cur_labels = current_category_entries['is_bad']
        
        cur_tokens = tokenizer.batch_encode_plus(cur_data.tolist(),
                                         max_length = 100,
                                         pad_to_max_length=True,
                                         truncation=True)
        
        seq = torch.tensor(cur_tokens['input_ids'])
        mask = torch.tensor(cur_tokens['attention_mask'])
        cur_y = torch.tensor(cur_labels.tolist())
        
        batch_size = 32
        
        #wrap tensors
        data = TensorDataset(seq, mask, cur_y)

        #sampler for sampling the data during validation
        sampler = SequentialSampler(data)

        #dataloader for validation set
        dataloader = DataLoader(data,
                                sampler=sampler,
                                batch_size=batch_size)
        
        model.eval()
    
        cur_preds = []

        # predict
        for batch in dataloader:
            batch = [t.to(device) for t in batch]

            sent_id, mask, labels = batch

            with torch.no_grad():
                preds = model(sent_id, mask)

                preds = preds.detach().cpu().numpy()

                cur_preds.append(preds)

        cur_preds = np.concatenate(cur_preds, axis=0)
        
        cur_roc_auc = roc_auc_score(cur_labels, cur_preds.T[1])
        
        cur_preds_binary = np.argmax(cur_preds, axis=1)
        
        cur_accuracy = accuracy_score(cur_labels, cur_preds_binary)
        
        roc_auc_scores.append(cur_roc_auc)
        print("ROC-AUC for " + str(category) + ": " + str(cur_roc_auc))
        print("Accuracy for " + str(category) + ": " + str(cur_accuracy) + '\n')
    
    mean_roc_auc = np.mean(roc_auc_scores)
    print("Mean ROC-AUC: " + str(mean_roc_auc))

In [41]:
# load weights of best model
# NOTE(review): this is a different checkpoint file from 'saved_weights_v1.2.pt',
# which is loaded earlier in the notebook for the overall metrics — confirm
# which one is intended, otherwise the two sets of numbers describe different models.
path = 'saved_weights.pt'
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU also loads on a machine without CUDA.
model.load_state_dict(torch.load(path, map_location=device))

test_wrt_categories()

ROC-AUC for Транспорт: 0.9421223439226712
Accuracy for Транспорт: 0.9092232126614517

ROC-AUC for Для бизнеса: 0.6999836547891468
Accuracy for Для бизнеса: 0.8546712802768166

ROC-AUC for Для дома и дачи: 0.8022317585418708
Accuracy for Для дома и дачи: 0.8169968717413972

ROC-AUC for Личные вещи: 0.7048453671133161
Accuracy for Личные вещи: 0.7767705382436261

ROC-AUC for Услуги: 0.7393583383064193
Accuracy for Услуги: 0.6711309523809523

ROC-AUC for Бытовая электроника: 0.7393918854642314
Accuracy for Бытовая электроника: 0.8712338593974175

ROC-AUC for Недвижимость: 0.721879018535597
Accuracy for Недвижимость: 0.6630620375640296

ROC-AUC for Хобби и отдых: 0.7419159161961644
Accuracy for Хобби и отдых: 0.739021329987453

ROC-AUC for Работа: 0.6802852122195189
Accuracy for Работа: 0.6011029411764706

ROC-AUC for Животные: 0.8177777777777778
Accuracy for Животные: 0.7095238095238096

Mean ROC-AUC: 0.7589791272866713
