In [31]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import torch
import torch.nn as nn
import pandas as pd
import glob
import numpy as np
from matplotlib import pyplot as plt

In [32]:
# Load the preprocessed train and test frames from their parquet files.
df_train, df_test = (
    pd.read_parquet(path)
    for path in (
        'dataset/train_dataset_train_full_proc.parquet',
        'dataset/test_dataset_test_full_proc.parquet',
    )
)

In [33]:
# One-hot encode `category`. The encoder is fitted on the training split only and
# then applied to both splits; each row stores its encoding as a plain Python list.
embedding_categories = OneHotEncoder(handle_unknown='ignore').fit(df_train[['category']])
for frame in (df_train, df_test):
    frame['embedding_category'] = embedding_categories.transform(frame[['category']]).toarray().tolist()

In [34]:
# Serialize the fitted category encoder to disk.
with open('utils/embedding_category.pickle', 'wb') as sink:
    pickle.dump(embedding_categories, sink, pickle.HIGHEST_PROTOCOL)

In [35]:
# One-hot encode `document_id_1`. The encoder is fitted on the training split only and
# then applied to both splits; each row stores its encoding as a plain Python list.
embedding_document_id_1 = OneHotEncoder(handle_unknown='ignore').fit(df_train[['document_id_1']])
for frame in (df_train, df_test):
    frame['embedding_document_id_1'] = embedding_document_id_1.transform(frame[['document_id_1']]).toarray().tolist()


In [36]:
# Serialize the fitted document_id_1 encoder to disk.
with open('utils/embedding_document_id_1.pickle', 'wb') as sink:
    pickle.dump(embedding_document_id_1, sink, pickle.HIGHEST_PROTOCOL)

In [37]:
# One-hot encode `document_id_2`. The encoder is fitted on the training split only and
# then applied to both splits; each row stores its encoding as a plain Python list.
embedding_document_id_2 = OneHotEncoder(handle_unknown='ignore').fit(df_train[['document_id_2']])
for frame in (df_train, df_test):
    frame['embedding_document_id_2'] = embedding_document_id_2.transform(frame[['document_id_2']]).toarray().tolist()


In [38]:
# Serialize the fitted document_id_2 encoder to disk, using the same
# '<name>.pickle' naming as the other encoder artifacts.
# NOTE(review): earlier runs wrote this artifact as
# 'utils/embedding_document_id_2 .pickle' (stray space before the extension);
# update any loader that still expects that name.
with open('utils/embedding_document_id_2.pickle', 'wb') as handle:
    pickle.dump(embedding_document_id_2, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [39]:
# One-hot encode `document_id_3`. The encoder is fitted on the training split only and
# then applied to both splits; each row stores its encoding as a plain Python list.
embedding_document_id_3 = OneHotEncoder(handle_unknown='ignore').fit(df_train[['document_id_3']])
for frame in (df_train, df_test):
    frame['embedding_document_id_3'] = embedding_document_id_3.transform(frame[['document_id_3']]).toarray().tolist()


In [40]:
# Serialize the fitted document_id_3 encoder to disk, using the same
# '<name>.pickle' naming as the other encoder artifacts.
# NOTE(review): earlier runs wrote this artifact without an extension
# ('utils/embedding_document_id_3'); update any loader that still expects that name.
with open('utils/embedding_document_id_3.pickle', 'wb') as handle:
    pickle.dump(embedding_document_id_3, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [41]:
# One-hot encode `document_id_4`. The encoder is fitted on the training split only and
# then applied to both splits; each row stores its encoding as a plain Python list.
embedding_document_id_4 = OneHotEncoder(handle_unknown='ignore').fit(df_train[['document_id_4']])
for frame in (df_train, df_test):
    frame['embedding_document_id_4'] = embedding_document_id_4.transform(frame[['document_id_4']]).toarray().tolist()


In [42]:
# Serialize the fitted document_id_4 encoder to disk.
with open('utils/embedding_document_id_4.pickle', 'wb') as sink:
    pickle.dump(embedding_document_id_4, sink, pickle.HIGHEST_PROTOCOL)

In [43]:
# One-hot encode `document_id_5`. The encoder is fitted on the training split only and
# then applied to both splits; each row stores its encoding as a plain Python list.
embedding_document_id_5 = OneHotEncoder(handle_unknown='ignore').fit(df_train[['document_id_5']])
for frame in (df_train, df_test):
    frame['embedding_document_id_5'] = embedding_document_id_5.transform(frame[['document_id_5']]).toarray().tolist()


In [44]:
# Serialize the fitted document_id_5 encoder to disk.
with open('utils/embedding_document_id_5.pickle', 'wb') as sink:
    pickle.dump(embedding_document_id_5, sink, pickle.HIGHEST_PROTOCOL)

In [45]:
# Bag-of-words counts for `tags`. The vocabulary is learned from the training
# split only (min_df=12) and then applied to both splits.
vectorizer_tag = CountVectorizer(min_df=12)
vectorizer_tag.fit(df_train['tags'].tolist())
# Transform each split in one batched call rather than one transform() call per
# row; the per-row count lists are the same, and this matches how the title and
# full_text cells build their columns.
df_train['embedding_tags'] = vectorizer_tag.transform(df_train['tags'].tolist()).toarray().tolist()
df_test['embedding_tags'] = vectorizer_tag.transform(df_test['tags'].tolist()).toarray().tolist()

In [46]:
# Serialize the fitted tags vectorizer to disk.
with open('utils/embedding_tags_vectorize.pickle', 'wb') as sink:
    pickle.dump(vectorizer_tag, sink, pickle.HIGHEST_PROTOCOL)

In [47]:
# Bag-of-words counts for `authors`. The vocabulary is learned from the training
# split only (min_df=12) and then applied to both splits.
vectorizer_author = CountVectorizer(min_df=12)
vectorizer_author.fit(df_train['authors'].tolist())
# Transform each split in one batched call rather than one transform() call per
# row; the per-row count lists are the same, and this matches how the title and
# full_text cells build their columns.
df_train['embedding_authors'] = vectorizer_author.transform(df_train['authors'].tolist()).toarray().tolist()
df_test['embedding_authors'] = vectorizer_author.transform(df_test['authors'].tolist()).toarray().tolist()


In [48]:
# Serialize the fitted authors vectorizer to disk.
with open('utils/embedding_authors_vectorize.pickle', 'wb') as sink:
    pickle.dump(vectorizer_author, sink, pickle.HIGHEST_PROTOCOL)

In [49]:
# Bag-of-words counts for `title`. The vocabulary is learned from the training
# split (min_df=12) and reused unchanged for the test split.
vectorizer_titles = CountVectorizer(min_df=12)
train_title_counts = vectorizer_titles.fit_transform(df_train['title'].tolist())
test_title_counts = vectorizer_titles.transform(df_test['title'].tolist())
df_train['embedding_title'] = train_title_counts.toarray().tolist()
df_test['embedding_title'] = test_title_counts.toarray().tolist()

In [50]:
# Serialize the fitted title vectorizer to disk.
with open('utils/embedding_titles_vectorize.pickle', 'wb') as sink:
    pickle.dump(vectorizer_titles, sink, pickle.HIGHEST_PROTOCOL)

In [51]:
# Bag-of-words counts for `full_text`. The vocabulary is learned from the training
# split (min_df=200) and reused unchanged for the test split.
vectorizer_fulltext = CountVectorizer(min_df=200)
train_fulltext_counts = vectorizer_fulltext.fit_transform(df_train['full_text'].tolist())
test_fulltext_counts = vectorizer_fulltext.transform(df_test['full_text'].tolist())
df_train['embedding_full_text'] = train_fulltext_counts.toarray().tolist()
df_test['embedding_full_text'] = test_fulltext_counts.toarray().tolist()


In [52]:
# Serialize the fitted full_text vectorizer to disk.
with open('utils/embedding_fulltext_vectorize.pickle', 'wb') as sink:
    pickle.dump(vectorizer_fulltext, sink, pickle.HIGHEST_PROTOCOL)

In [53]:
class CFG:
    """Shared settings for the RuBERT-based feature extractor."""

    # Model identifier handed to the Auto* `from_pretrained` loaders.
    model = "DeepPavlov/rubert-base-cased"
    # Token length every text is padded / truncated to by RBKDataset.
    max_len = 512
    # First CUDA device when one is available, CPU otherwise.
    device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [54]:
class RBKDataset(Dataset):
    """Tokenized article texts, optionally paired with regression targets.

    All texts in ``df.full_text_original`` are tokenized once in ``__init__``
    (padded / truncated to ``CFG.max_len``); ``__getitem__`` only wraps the
    pre-computed ids in tensors.

    Args:
        df: DataFrame with a ``full_text_original`` column and, unless
            ``inference_only`` is set, ``views``, ``depth`` and
            ``full_reads_percent`` columns.
        inference_only: When True no target columns are read and items carry
            no ``'target'`` entry.
    """

    def __init__(self, df, inference_only=False):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(f"{CFG.model}")
        self.df = df
        self.inference_only = inference_only
        self.text = df.full_text_original.tolist()

        if not self.inference_only:
            # Convert through NumPy instead of handing torch.tensor a pandas Series:
            # a Series treated as a generic sequence is indexed by label, which can
            # fail (KeyError) when the frame's index is not 0..n-1.
            self.views = torch.tensor(df.views.to_numpy(), dtype=torch.float32)
            self.depth = torch.tensor(df.depth.to_numpy(), dtype=torch.float32)
            self.full_reads_percent = torch.tensor(
                df.full_reads_percent.to_numpy(), dtype=torch.float32
            )

        # Tokenize everything up front; every row ends up with exactly
        # CFG.max_len ids plus a matching attention mask.
        self.encoded = self.tokenizer.batch_encode_plus(
            self.text,
            padding='max_length',
            max_length=CFG.max_len,
            add_special_tokens=True,
            truncation=True,
            return_attention_mask=True
        )

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the model inputs for row ``index`` (positional).

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` tensors and, when
            not in inference mode, a ``'target'`` tensor stacked as
            ``[views, depth, full_reads_percent]``.
        """
        item = {
            'input_ids': torch.tensor(self.encoded['input_ids'][index]),
            'attention_mask': torch.tensor(self.encoded['attention_mask'][index]),
        }
        if not self.inference_only:
            item['target'] = torch.stack(
                [self.views[index], self.depth[index], self.full_reads_percent[index]],
                dim=0,
            )
        return item

In [55]:
class RBKModel(nn.Module):
    """Pretrained encoder with attention pooling and a 3-output regression head.

    The encoder's ``last_hidden_state`` is pooled into one vector per sample by
    a learned attention (softmax over dim=1), projected to 100 features by
    ``fc1`` and mapped to 3 outputs by ``fc2``.

    NOTE(review): ``attention_mask`` is only passed to the encoder; the pooling
    softmax also spans padded positions. Masking it would change the outputs of
    checkpoints trained with this definition, so it is left as is.
    """

    def __init__(self):
        super().__init__()
        self.config = AutoConfig.from_pretrained(f"{CFG.model}")
        self.rubert = AutoModel.from_pretrained(f"{CFG.model}", config=self.config)
        # Width taken from the encoder config (previously a literal 768) so the
        # pooling layer always matches the encoder, as fc1 already does.
        hidden_size = self.config.hidden_size
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 128),
            nn.Tanh(),
            nn.Linear(128, 1),
            nn.Softmax(dim=1)
        )

        self.fc1 = nn.Linear(hidden_size, 100)
        self.dropout = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(100, 3)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and the regression head.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            dict with ``'logits'`` (output of ``fc2``) and ``'context_vector'``.
            Note that ``'context_vector'`` holds the 100-feature output of
            ``fc1`` (before dropout / ReLU), not the raw pooled encoder vector.
        """
        rubert_output = self.rubert(input_ids=input_ids,
                                    attention_mask=attention_mask)
        hidden_states = rubert_output['last_hidden_state']
        # One weight per position, normalized over dim=1, then a weighted sum
        # over that same dimension.
        weights = self.attention(hidden_states)
        context_vector = torch.sum(weights * hidden_states, dim=1)
        fc1 = self.fc1(context_vector)
        logits = self.fc2(self.relu(self.dropout(fc1)))
        return {'logits': logits,
                'context_vector': fc1}

In [56]:
def get_predict_BERT(df):
    """Average the 100-feature ``'context_vector'`` output over all saved checkpoints.

    Every ``models_bert/*.pth`` state dict is loaded into a fresh ``RBKModel``
    and run over ``df`` one row at a time; the per-checkpoint features are then
    averaged.

    Args:
        df: DataFrame accepted by ``RBKDataset`` in inference mode.

    Returns:
        ``numpy.ndarray`` of shape ``(len(df), 100)``.

    Raises:
        FileNotFoundError: If no checkpoint matches ``models_bert/*.pth``.
    """
    # Sorted so checkpoints are visited in a reproducible order.
    model_paths = sorted(glob.glob('models_bert/*.pth'))
    if not model_paths:
        raise FileNotFoundError("no checkpoints found matching 'models_bert/*.pth'")

    loader = DataLoader(
        RBKDataset(df.reset_index(), inference_only=True),
        batch_size=1,
        shuffle=False
    )
    # One slab per checkpoint actually found: a fixed-size buffer would dilute the
    # mean with all-zero slabs (too few checkpoints) or overflow (too many).
    # The trailing 100 must match the width of RBKModel.fc1's output.
    prediction = np.zeros((len(model_paths), len(df), 100))
    for number_model, model_path in enumerate(model_paths):
        model = RBKModel().to(CFG.device)
        # map_location lets checkpoints saved on another device load here.
        model.load_state_dict(torch.load(model_path, map_location=CFG.device))
        model.eval()
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            for index, batch in enumerate(loader):
                pred = model(
                    batch['input_ids'].to(CFG.device),
                    batch['attention_mask'].to(CFG.device)
                )['context_vector']
                prediction[number_model, index, :] = pred.detach().cpu().numpy()
    return np.mean(prediction, axis=0)

In [57]:
%%time
df_train['bert_embedding'] = get_predict_BERT(df_train).tolist()

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: 

CPU times: user 6min 45s, sys: 1.44 s, total: 6min 46s
Wall time: 6min 47s


In [58]:
# Sanity check: length of the first stored BERT feature vector.
len(df_train['bert_embedding'].iat[0])

100

In [59]:
%time
df_test['bert_embedding'] = get_predict_BERT(df_test).tolist()

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.01 µs


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: 

In [60]:
# Write both frames, now carrying the embedding columns, back to the parquet
# files they were loaded from (the files are overwritten in place).
for frame, path in (
    (df_train, 'dataset/train_dataset_train_full_proc.parquet'),
    (df_test, 'dataset/test_dataset_test_full_proc.parquet'),
):
    frame.to_parquet(path)