In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification, AdamW, PretrainedConfig
from transformers import get_linear_schedule_with_warmup
import re
from nltk.tokenize import word_tokenize
import nltk
import random
from pymorphy2 import MorphAnalyzer
from torch import nn
import string
import time
import datetime
from collections import Counter
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
data = pd.read_csv('./data/cleancorrectionw.csv',sep=';')
#data = data.drop(columns=['Unnamed: 3', 'Unnamed: 4'])
data = data.dropna()
# data["text"] = data["query"] + '. ' + data["text"]


# queries = list(set(data["query"].values))
# data = data.drop(columns=['query'])
data = data.rename({'class': 'label'}, axis='columns')
data.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\feodor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\feodor\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,label,text
0,0,7000 руб за 24 часа! Первые деньги тотчас же ...
1,0,"Хочу сделать, что-то новое, но не знаю, что им..."
2,0,Приветствую!✌🏻 Меня зовут Евгения Приглашаю ...
3,0,ВНИМАНИЕ!!! СКИДКА 5000 РУБЛЕЙ НА КУРС ПАРИКМА...
4,0,С целью профилактики преступности в молодежной...


## Clear data

In [3]:
def standardize_text(df, text_field):
    df[text_field] = df[text_field].str.replace(r"http\S+", "", regex=True)
    df[text_field] = df[text_field].str.replace(r"http", "", regex=True)
    df[text_field] = df[text_field].str.replace(r"@\S+", "", regex=True)
    df[text_field] = df[text_field].str.replace(r"#(\w+)", "", regex=True)
    df[text_field] = df[text_field].str.replace(r"[^A-Za-zА-Яа-яё0-9\.\!\?\...]", " ", regex=True)
    df[text_field] = df[text_field].str.replace(r"[a-zа-я0-9]+\.[a-zа-я0-9]+\.*[a-zа-я0-9]*", " ", regex=True)
    df[text_field] = df[text_field].str.replace(r"id\w+", " ", regex=True)
    df[text_field] = df[text_field].str.replace(r"@", "at", regex=True)
    df[text_field] = df[text_field].str.replace(r'\s+', ' ', regex=True)
    return df

clear_data = standardize_text(data.copy(), "text")
clear_data.head()

Unnamed: 0,label,text
0,0,7000 руб за 24 часа! Первые деньги тотчас же ...
1,0,Хочу сделать что то новое но не знаю что именн...
2,0,Приветствую! Меня зовут Евгения Приглашаю Вас ...
3,0,ВНИМАНИЕ!!! СКИДКА 5000 РУБЛЕЙ НА КУРС ПАРИКМА...
4,0,С целью профилактики преступности в молодежной...


In [4]:
# clear_data.text[0]

In [5]:
# data.text[0]

In [6]:
# Counter(y)

In [7]:
# del lat text
no_lat_inds = []
for ind, seq in enumerate(clear_data.text.values):
    is_lat = re.findall(r"[A-Za-z]\w+", seq)
    words = re.findall(r"[А-Яа-я]\w+", seq)
    if len(is_lat) < len(words):
        no_lat_inds.append(ind)

X = clear_data.text.values[no_lat_inds]
y = clear_data.label.values[no_lat_inds]
# y[list(y).index(11)] = 1
# y[list(y).index(4)] = 1

## Лемматизация и токенизация

In [8]:
pymorphy2_analyzer = MorphAnalyzer()
russina_stop_words = stopwords.words('russian')
usa_stop_words = stopwords.words('english')
vord_dict = []
new_X = []
for seq in tqdm(X):
    new_seq = ["[CLS]"]
    for word in word_tokenize(seq):
        if word == '.':
            new_seq.append('[SEP]')
        elif '.' in word or len(re.findall(r'[0-9]+', word)) > 0:
            pass
        elif word not in russina_stop_words and word not in usa_stop_words:
            new_seq.append(pymorphy2_analyzer.parse(word)[0].normal_form)
            
    if new_seq[-1] != '[SEP]':
        new_seq.append('[SEP]')
    new_X.append(new_seq)
    vord_dict += new_seq

100%|████████████████████████████████████████████████████████████████████████████████| 132/132 [00:01<00:00, 67.84it/s]


In [9]:
# with open('vocab.txt', 'w') as f:
#     for item in set(vord_dict):
#         f.write(f'{item}\n')

In [10]:
tokenizer = BertTokenizer.from_pretrained('./vocab.txt')
MAX_LEN = 128
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in new_X],
            maxlen=MAX_LEN, dtype="long", truncating="post", padding="post", value=3)



In [11]:

attention_masks = []
# For each sentence...
for sent in input_ids:
    
    # Create the attention mask.
    #   - If a token ID is 3, then it's padding, set the mask to 0.
    #   - If a token ID not 3, then it's a real token, set the mask to 1.
    att_mask = [int(token_id != 3) for token_id in sent]
    
    # Store the attention mask for this sentence.
    attention_masks.append(att_mask)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(input_ids, y,
                                                    random_state=2021, test_size=0.2)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
print(Counter(y_train), Counter(y_test))

(105, 128) (105,) (27, 128) (27,)
Counter({0: 97, 1: 8}) Counter({0: 26, 1: 1})


In [13]:
train_masks, test_masks, _, _ = train_test_split(np.array(attention_masks), y,
                                             random_state=2018, test_size=0.2)
print(train_masks.shape, test_masks.shape)

(105, 128) (27, 128)


In [14]:
X_train = torch.tensor(X_train)
X_test = torch.tensor(X_test)
y_train = torch.tensor(y_train).type(torch.LongTensor)
y_test = torch.tensor(y_test).type(torch.LongTensor)
train_masks = torch.tensor(train_masks)
test_masks = torch.tensor(test_masks)

In [15]:
batch_size = 32

train_data = TensorDataset(X_train, train_masks, y_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(X_test, test_masks, y_test)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


In [16]:
config = BertConfig.from_json_file('./bert_structv2/config.json')
model = BertForSequenceClassification(config)
#model.cuda()


In [17]:
load_path = './bert_structv2'
bert = BertForSequenceClassification.from_pretrained(str(load_path))

In [18]:
params = list(bert.named_parameters())
print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')
for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== First Transformer ====\n')
for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== Output Layer ====\n')
for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (54194, 768)
bert.embeddings.position_embeddings.weight                (128, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [19]:
optimizer = AdamW(bert.parameters(),
                  lr = 2e-5, 
                  eps = 1e-8 # adam_epsilon
                )

In [20]:
epochs = 2
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [21]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [22]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
#torch.cuda.manual_seed_all(seed_val)

<torch._C.Generator at 0x2617f7b2e10>

In [23]:
full_train_losses = []
full_val_losses = []

full_train_acc = []
full_val_acc = []

for epoch_i in range(epochs):
    t0 = time.time()
    train_losses = []
    val_losses = []

    train_rates = np.zeros(4)
    val_rates = np.zeros(4)

    train_correct = 0
    val_correct = 0
    
    bert.train()
    
    for step, batch in enumerate(tqdm(train_dataloader, desc='Train')):
          
        b_input_ids = batch[0]#.cuda()
        b_input_mask = batch[1]#.cuda()
        b_labels = batch[2]#.cuda()
    
        
        bert.zero_grad()
        
        outputs = bert(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)

        loss = outputs[0]
        logits = outputs[1]
        
        train_losses.append(loss.item())
        
        train_correct += flat_accuracy(logits.detach().cpu().numpy(), b_labels.detach().cpu().numpy())
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(bert.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        
#         avg_train_loss = total_loss / len(train_dataloader)
#         loss_values.append(avg_train_loss)
        
        
        t0 = time.time()
        bert.eval()
        
        
    for batch in tqdm(validation_dataloader, desc="Test"):
#         batch = tuple(t.cuda() for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = bert(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)

        logits = outputs[0]

        val_correct += flat_accuracy(logits.detach().cpu().numpy(), b_labels.detach().cpu().numpy())
            
    full_train_losses.append(np.mean(train_losses))
    full_val_losses.append(np.mean(val_losses))

    full_train_acc.append( (train_correct / len(train_dataloader)))
    full_val_acc.append( (val_correct / len(validation_dataloader)))
        
    print('Epoch : ',epoch_i+1, '\t', 'train_loss :', full_train_losses[-1].item(),
              'train_acc :', full_train_acc[-1].item(), 'val_acc :', full_val_acc[-1].item())
    
    bert.save_pretrained('./bert_structEpoch'+str(epoch_i))



Train: 100%|█████████████████████████████████████████████████████████████████████████████| 4/4 [00:49<00:00, 12.31s/it]
Test: 100%|██████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.11s/it]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Epoch :  1 	 train_loss : 0.29578690230846405 train_acc : 0.90625 val_acc : 0.9629629629629629


Train: 100%|█████████████████████████████████████████████████████████████████████████████| 4/4 [00:49<00:00, 12.43s/it]
Test: 100%|██████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.22s/it]


Epoch :  2 	 train_loss : 0.2145996242761612 train_acc : 0.9453125 val_acc : 0.9629629629629629


In [24]:
 print('Epoch : ',epoch_i+1, '\t', 'train_loss :', full_train_losses[-1].item(),
              'train_acc :', full_train_acc[-1].item(), 'val_acc :', full_val_acc[-1].item())

Epoch :  2 	 train_loss : 0.2145996242761612 train_acc : 0.9453125 val_acc : 0.9629629629629629
