In [6]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification, AdamW, PretrainedConfig
from transformers import get_linear_schedule_with_warmup
import re
from nltk.tokenize import word_tokenize
import nltk
import random
from pymorphy2 import MorphAnalyzer
from torch import nn
import string
import time
import datetime
from collections import Counter
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [7]:
nltk.download('stopwords')
nltk.download('punkt')
data = pd.read_csv('./data/cleancorrectionw.csv',sep=';')
#data = data.drop(columns=['Unnamed: 3', 'Unnamed: 4'])
data = data.dropna()
# data["text"] = data["query"] + '. ' + data["text"]


# queries = list(set(data["query"].values))
# data = data.drop(columns=['query'])
data = data.rename({'class': 'label'}, axis='columns')
data.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\feodor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\feodor\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,label,text
0,0,7000 —Ä—É–± –∑–∞ 24 —á–∞—Å–∞! –ü–µ—Ä–≤—ã–µ –¥–µ–Ω—å–≥–∏ —Ç–æ—Ç—á–∞—Å –∂–µ ...
1,0,"–•–æ—á—É —Å–¥–µ–ª–∞—Ç—å, —á—Ç–æ-—Ç–æ –Ω–æ–≤–æ–µ, –Ω–æ –Ω–µ –∑–Ω–∞—é, —á—Ç–æ –∏–º..."
2,0,–ü—Ä–∏–≤–µ—Ç—Å—Ç–≤—É—é!‚úåüèª –ú–µ–Ω—è –∑–æ–≤—É—Ç –ï–≤–≥–µ–Ω–∏—è –ü—Ä–∏–≥–ª–∞—à–∞—é ...
3,0,–í–ù–ò–ú–ê–ù–ò–ï!!! –°–ö–ò–î–ö–ê 5000 –†–£–ë–õ–ï–ô –ù–ê –ö–£–†–° –ü–ê–†–ò–ö–ú–ê...
4,0,–° —Ü–µ–ª—å—é –ø—Ä–æ—Ñ–∏–ª–∞–∫—Ç–∏–∫–∏ –ø—Ä–µ—Å—Ç—É–ø–Ω–æ—Å—Ç–∏ –≤ –º–æ–ª–æ–¥–µ–∂–Ω–æ–π...


## Clear data

In [8]:
def standardize_text(df, text_field):
    df[text_field] = df[text_field].str.replace(r"http\S+", "", regex=True)
    df[text_field] = df[text_field].str.replace(r"http", "", regex=True)
    df[text_field] = df[text_field].str.replace(r"@\S+", "", regex=True)
    df[text_field] = df[text_field].str.replace(r"#(\w+)", "", regex=True)
    df[text_field] = df[text_field].str.replace(r"[^A-Za-z–ê-–Ø–∞-—è—ë0-9\.\!\?\...]", " ", regex=True)
    df[text_field] = df[text_field].str.replace(r"[a-z–∞-—è0-9]+\.[a-z–∞-—è0-9]+\.*[a-z–∞-—è0-9]*", " ", regex=True)
    df[text_field] = df[text_field].str.replace(r"id\w+", " ", regex=True)
    df[text_field] = df[text_field].str.replace(r"@", "at", regex=True)
    df[text_field] = df[text_field].str.replace(r'\s+', ' ', regex=True)
    return df

clear_data = standardize_text(data.copy(), "text")
clear_data.head()

Unnamed: 0,label,text
0,0,7000 —Ä—É–± –∑–∞ 24 —á–∞—Å–∞! –ü–µ—Ä–≤—ã–µ –¥–µ–Ω—å–≥–∏ —Ç–æ—Ç—á–∞—Å –∂–µ ...
1,0,–•–æ—á—É —Å–¥–µ–ª–∞—Ç—å —á—Ç–æ —Ç–æ –Ω–æ–≤–æ–µ –Ω–æ –Ω–µ –∑–Ω–∞—é —á—Ç–æ –∏–º–µ–Ω–Ω...
2,0,–ü—Ä–∏–≤–µ—Ç—Å—Ç–≤—É—é! –ú–µ–Ω—è –∑–æ–≤—É—Ç –ï–≤–≥–µ–Ω–∏—è –ü—Ä–∏–≥–ª–∞—à–∞—é –í–∞—Å ...
3,0,–í–ù–ò–ú–ê–ù–ò–ï!!! –°–ö–ò–î–ö–ê 5000 –†–£–ë–õ–ï–ô –ù–ê –ö–£–†–° –ü–ê–†–ò–ö–ú–ê...
4,0,–° —Ü–µ–ª—å—é –ø—Ä–æ—Ñ–∏–ª–∞–∫—Ç–∏–∫–∏ –ø—Ä–µ—Å—Ç—É–ø–Ω–æ—Å—Ç–∏ –≤ –º–æ–ª–æ–¥–µ–∂–Ω–æ–π...


In [9]:
clear_data.text[0]

' 7000 —Ä—É–± –∑–∞ 24 —á–∞—Å–∞! –ü–µ—Ä–≤—ã–µ –¥–µ–Ω—å–≥–∏ —Ç–æ—Ç—á–∞—Å –∂–µ –ø–æ –æ–∫–æ–Ω—á–∞–Ω–∏–∏ —Å—Ç–∞—Ä—Ç–∞ —Å–∏—Å—Ç–µ–º—ã –†–∞–∑—Ä–µ—à–∏—Ç–µ –º–Ω–µ –≤—Å–µ–≥–æ –ª–∏—à—å –¥–≤–µ –º–∏–Ω—É—Ç–∫–∏ –∏ —è –ø—Ä–æ–¥–µ–º–æ–Ω—Å—Ç—Ä–∏—Ä—É—é –∫–∞–∫ –í—ã —Å—É–º–µ–µ—Ç–µ —É—Å–ø–µ—à–Ω–æ –∑–∞—Ä–∞–±–æ—Ç–∞—Ç—å –æ—Ç 7 000 —Ä—É–± —É–∂–µ –≤ –±–ª–∏–∂–∞–π—à–µ–µ –≤—Ä–µ–º—è. –ó–∞—Ä–∞–±–æ—Ç–∞—Ç—å –ø–æ—Ç—Ä–∞—Ç–∏–≤ –Ω–∞ –≤—Å–µ —ç—Ç–æ –Ω–µ —Å–≤—ã—à–µ 40 –º–∏–Ω –ª–∏—á–Ω–æ–≥–æ –≤—Ä–µ–º–µ–Ω–∏! —Å–µ–º—å –¢—ã—â –∑–∞ 24 —á–∞—Å–∞? –ù–µ—Ä–µ–∞–ª—å–Ω–æ ! –¢—ã —Ç–∞–∫ —Å—á–∏—Ç–∞–µ—à—å? –ù–µ —Å–ø–µ—à–∏ —Å –æ—Ç–≤–µ—Ç–∞–º–∏. –¢–æ —á—Ç–æ –≤ –ò–Ω—Ç–µ—Ä–Ω–µ—Ç–µ –Ω–µ–ª—å–∑—è –∑–∞–∫–æ–ª–æ—Ç–∏—Ç—å –¥–æ–≤–æ–ª—å–Ω–æ –±–æ–ª—å—à–∏–µ —Å—Ä–µ–¥—Å—Ç–≤–∞ –º–∏—Ñ –∫–æ—Ç–æ—Ä—ã–π —Ä–∞—Å–ø—Ä–æ—Å—Ç—Ä–∞–Ω—è—é—Ç –ª–µ–Ω—Ç—è–∏ –ª–∏–±–æ –Ω–µ—É–¥–∞—á–Ω–∏–∫–∏. –ï—Å–ª–∏ —É–∂ —Ç—ã —Ç–æ—á–Ω–æ —Ö–æ—á–µ—à—å –∑–∞–Ω—è—Ç—å—Å—è —á–∞—Å—Ç–Ω—ã–º –±–∏–∑–Ω–µ—Å–æ–º –∫–æ–Ω—á–∞—Ç—å –≥–æ—Ä–±–∏—Ç—å —Å–ø–∏–Ω—É –Ω–∞ –¥—è–¥—é –æ—Å–≤–æ–±–æ–¥–∏—Ç—Å—è –æ—Ç –¥–æ–ª–∂–∫–æ–≤ –ø–æ–∑–

In [10]:
data.text[0]

' 7000 —Ä—É–± –∑–∞ 24 —á–∞—Å–∞! –ü–µ—Ä–≤—ã–µ –¥–µ–Ω—å–≥–∏ —Ç–æ—Ç—á–∞—Å –∂–µ –ø–æ –æ–∫–æ–Ω—á–∞–Ω–∏–∏ —Å—Ç–∞—Ä—Ç–∞ —Å–∏—Å—Ç–µ–º—ã –†–∞–∑—Ä–µ—à–∏—Ç–µ –º–Ω–µ –≤—Å–µ–≥–æ-–ª–∏—à—å –¥–≤–µ –º–∏–Ω—É—Ç–∫–∏, –∏ —è –ø—Ä–æ–¥–µ–º–æ–Ω—Å—Ç—Ä–∏—Ä—É—é –∫–∞–∫ –í—ã —Å—É–º–µ–µ—Ç–µ —É—Å–ø–µ—à–Ω–æ –∑–∞—Ä–∞–±–æ—Ç–∞—Ç—å –æ—Ç 7 000 —Ä—É–± —É–∂–µ –≤ –±–ª–∏–∂–∞–π—à–µ–µ –≤—Ä–µ–º—è.  –ó–∞—Ä–∞–±–æ—Ç–∞—Ç—å, –ø–æ—Ç—Ä–∞—Ç–∏–≤ –Ω–∞ –≤—Å–µ —ç—Ç–æ –Ω–µ —Å–≤—ã—à–µ 40 –º–∏–Ω –ª–∏—á–Ω–æ–≥–æ –≤—Ä–µ–º–µ–Ω–∏! ¬´—Å–µ–º—å –¢—ã—â –∑–∞ 24 —á–∞—Å–∞? –ù–µ—Ä–µ–∞–ª—å–Ω–æ¬ª! –¢—ã —Ç–∞–∫ —Å—á–∏—Ç–∞–µ—à—å? –ù–µ —Å–ø–µ—à–∏ —Å –æ—Ç–≤–µ—Ç–∞–º–∏. –¢–æ, —á—Ç–æ –≤ –ò–Ω—Ç–µ—Ä–Ω–µ—Ç–µ –Ω–µ–ª—å–∑—è –∑–∞–∫–æ–ª–æ—Ç–∏—Ç—å –¥–æ–≤–æ–ª—å–Ω–æ –±–æ–ª—å—à–∏–µ —Å—Ä–µ–¥—Å—Ç–≤–∞ ‚Äì –º–∏—Ñ, –∫–æ—Ç–æ—Ä—ã–π —Ä–∞—Å–ø—Ä–æ—Å—Ç—Ä–∞–Ω—è—é—Ç –ª–µ–Ω—Ç—è–∏ –ª–∏–±–æ –Ω–µ—É–¥–∞—á–Ω–∏–∫–∏. –ï—Å–ª–∏ —É–∂ —Ç—ã —Ç–æ—á–Ω–æ —Ö–æ—á–µ—à—å: - –∑–∞–Ω—è—Ç—å—Å—è —á–∞—Å—Ç–Ω—ã–º –±–∏–∑–Ω–µ—Å–æ–º - –∫–æ–Ω—á–∞—Ç—å –≥–æ—Ä–±–∏—Ç—å —Å–ø–∏–Ω—É –Ω–∞ ¬´–¥—è–¥—é¬ª - –æ—Å–≤–æ–±–æ–¥–∏—Ç—Å—è –æ—Ç

In [11]:
Counter(y)

NameError: name 'y' is not defined

In [12]:
# del lat text
no_lat_inds = []
for ind, seq in enumerate(clear_data.text.values):
    is_lat = re.findall(r"[A-Za-z]\w+", seq)
    words = re.findall(r"[–ê-–Ø–∞-—è]\w+", seq)
    if len(is_lat) < len(words):
        no_lat_inds.append(ind)

X = clear_data.text.values[no_lat_inds]
y = clear_data.label.values[no_lat_inds]
# y[list(y).index(11)] = 1
# y[list(y).index(4)] = 1

## –õ–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è –∏ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏—è

In [13]:
pymorphy2_analyzer = MorphAnalyzer()
russina_stop_words = stopwords.words('russian')
usa_stop_words = stopwords.words('english')
vord_dict = []
new_X = []
for seq in tqdm(X):
    new_seq = ["[CLS]"]
    for word in word_tokenize(seq):
        if word == '.':
            new_seq.append('[SEP]')
        elif '.' in word or len(re.findall(r'[0-9]+', word)) > 0:
            pass
        elif word not in russina_stop_words and word not in usa_stop_words:
            new_seq.append(pymorphy2_analyzer.parse(word)[0].normal_form)
            
    if new_seq[-1] != '[SEP]':
        new_seq.append('[SEP]')
    new_X.append(new_seq)
    vord_dict += new_seq

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 132/132 [00:01<00:00, 69.08it/s]


In [9]:
with open('vocab.txt', 'w') as f:
    for item in set(vord_dict):
        f.write(f'{item}\n')

In [14]:
tokenizer = BertTokenizer.from_pretrained('./vocab.txt')
MAX_LEN = 128
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in new_X],
            maxlen=MAX_LEN, dtype="long", truncating="post", padding="post", value=3)



In [32]:

attention_masks = []
# For each sentence...
for sent in input_ids:
    
    # Create the attention mask.
    #   - If a token ID is 3, then it's padding, set the mask to 0.
    #   - If a token ID not 3, then it's a real token, set the mask to 1.
    att_mask = [int(token_id != 3) for token_id in sent]
    
    # Store the attention mask for this sentence.
    attention_masks.append(att_mask)

In [33]:
X_train, X_test, y_train, y_test = train_test_split(input_ids, y,
                                                    random_state=2021, test_size=0.2)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
print(Counter(y_train), Counter(y_test))

(105, 128) (105,) (27, 128) (27,)
Counter({0: 97, 1: 8}) Counter({0: 26, 1: 1})


In [34]:
train_masks, test_masks, _, _ = train_test_split(np.array(attention_masks), y,
                                             random_state=2018, test_size=0.2)
print(train_masks.shape, test_masks.shape)

(105, 128) (27, 128)


In [35]:
X_train = torch.tensor(X_train)
X_test = torch.tensor(X_test)
y_train = torch.tensor(y_train).type(torch.LongTensor)
y_test = torch.tensor(y_test).type(torch.LongTensor)
train_masks = torch.tensor(train_masks)
test_masks = torch.tensor(test_masks)

In [36]:
batch_size = 32

train_data = TensorDataset(X_train, train_masks, y_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(X_test, test_masks, y_test)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


In [37]:
config = BertConfig.from_json_file('./bert_structv2/config.json')
model = BertForSequenceClassification(config)
#model.cuda()


In [38]:
load_path = './bert_structv2'
bert = BertForSequenceClassification.from_pretrained(str(load_path))

In [39]:
params = list(bert.named_parameters())
print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')
for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== First Transformer ====\n')
for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== Output Layer ====\n')
for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (54194, 768)
bert.embeddings.position_embeddings.weight                (128, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [40]:
optimizer = AdamW(bert.parameters(),
                  lr = 2e-5, 
                  eps = 1e-8 # adam_epsilon
                )

In [41]:
epochs = 2
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [42]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [43]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
#torch.cuda.manual_seed_all(seed_val)

<torch._C.Generator at 0x1597d79b350>

In [44]:
full_train_losses = []
full_val_losses = []

full_train_acc = []
full_val_acc = []

for epoch_i in range(epochs):
    t0 = time.time()
    train_losses = []
    val_losses = []

    train_rates = np.zeros(4)
    val_rates = np.zeros(4)

    train_correct = 0
    val_correct = 0
    
    bert.train()
    
    for step, batch in enumerate(tqdm(train_dataloader, desc='Train')):
          
        b_input_ids = batch[0]#.cuda()
        b_input_mask = batch[1]#.cuda()
        b_labels = batch[2]#.cuda()
    
        
        bert.zero_grad()
        
        outputs = bert(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)

        loss = outputs[0]
        logits = outputs[1]
        
        train_losses.append(loss.item())
        
        train_correct += flat_accuracy(logits.detach().cpu().numpy(), b_labels.detach().cpu().numpy())
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(bert.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        
#         avg_train_loss = total_loss / len(train_dataloader)
#         loss_values.append(avg_train_loss)
        
        
        t0 = time.time()
        bert.eval()
        
        
    for batch in tqdm(validation_dataloader, desc="Test"):
#         batch = tuple(t.cuda() for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = bert(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)

        logits = outputs[0]

        val_correct += flat_accuracy(logits.detach().cpu().numpy(), b_labels.detach().cpu().numpy())
            
    full_train_losses.append(np.mean(train_losses))
    full_val_losses.append(np.mean(val_losses))

    full_train_acc.append( (train_correct / len(train_dataloader)))
    full_val_acc.append( (val_correct / len(validation_dataloader)))
        
    print('Epoch : ',epoch_i+1, '\t', 'train_loss :', full_train_losses[-1].item(),
              'train_acc :', full_train_acc[-1].item(), 'val_acc :', full_val_acc[-1].item())
    
    bert.save_pretrained('./bert_structEpoch'+str(epoch_i))



Train: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:48<00:00, 12.17s/it]
Test: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:04<00:00,  4.01s/it]


Epoch :  1 	 train_loss : 0.29578690230846405 train_acc : 0.90625 val_acc : 0.9629629629629629


Train: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:48<00:00, 12.04s/it]
Test: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:03<00:00,  3.95s/it]


Epoch :  2 	 train_loss : 0.2145996242761612 train_acc : 0.9453125 val_acc : 0.9629629629629629


In [45]:
 print('Epoch : ',epoch_i+1, '\t', 'train_loss :', full_train_losses[-1].item(),
              'train_acc :', full_train_acc[-1].item(), 'val_acc :', full_val_acc[-1].item())

Epoch :  2 	 train_loss : 0.2145996242761612 train_acc : 0.9453125 val_acc : 0.9629629629629629


In [None]:
full_train_losses = []
full_val_losses = []

full_train_acc = []
full_val_acc = []

for epoch_i in range(epochs):
    t0 = time.time()
    train_losses = []
    val_losses = []

    train_rates = np.zeros(4)
    val_rates = np.zeros(4)

    train_correct = 0
    val_correct = 0
    
    model.train()
    
    for step, batch in enumerate(tqdm(train_dataloader, desc='Train')):
          
        b_input_ids = batch[0]#.cuda()
        b_input_mask = batch[1]#.cuda()
        b_labels = batch[2]#.cuda()
    
        
        model.zero_grad()
        
        outputs = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)

        loss = outputs[0]
        logits = outputs[1]
        
        train_losses.append(loss.item())
        
        train_correct += flat_accuracy(logits.detach().cpu().numpy(), b_labels.detach().cpu().numpy())
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        
#         avg_train_loss = total_loss / len(train_dataloader)
#         loss_values.append(avg_train_loss)
        
        
        t0 = time.time()
        model.eval()
        
        
    for batch in tqdm(validation_dataloader, desc="Test"):
#         batch = tuple(t.cuda() for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)

        logits = outputs[0]

        val_correct += flat_accuracy(logits.detach().cpu().numpy(), b_labels.detach().cpu().numpy())
            
    full_train_losses.append(np.mean(train_losses))
    full_val_losses.append(np.mean(val_losses))

    full_train_acc.append( (train_correct / len(train_dataloader)))
    full_val_acc.append( (val_correct / len(validation_dataloader)))
        
    print('Epoch : ',epoch_i+1, '\t', 'train_loss :', full_train_losses[-1].item(),
              'train_acc :', full_train_acc[-1].item(), 'val_acc :', full_val_acc[-1].item())




Train:  10%|‚ñâ         | 11/115 [05:17<48:57, 28.25s/it]

In [36]:
bert.save_pretrained('./bert_structv3/')

In [None]:
model.

In [23]:
load_path = './bert_struct'
bert = BertForSequenceClassification.from_pretrained(str(load_path))

In [24]:
bert

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(54348, 768, padding_idx=3)
      (position_embeddings): Embedding(128, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [4]:
stopwords

<WordListCorpusReader in '/Users/feodor/nltk_data/corpora/stopwords'>