# Проект 3. Решить задачу DaNetQA / BoolQ

Можно решить как задачу для русского, так и для английского.

Либо провести эксперименты с многоязычной моделью

https://russiansuperglue.com/ru/tasks/task_info/DaNetQA

## Описание
Причинно-следственная связь, логический вывод, Natural Language Inference

DaNetQA - это набор да/нет вопросов с ответами и фрагментом текста, содержащим ответ. Все вопросы были написаны авторами без каких-либо искусственных ограничений.

Каждый пример представляет собой триплет (вопрос, фрагмент текста, ответ) с заголовком страницы в качестве необязательного дополнительного контекста.

Настройка классификации текстовых пар аналогична существующим задачам логического вывода (NLI)

### Тип задачи
Логика, Commonsense, Знания о мире. Бинарная классификация: true/false

## Подготовка данных

### Imports

In [1]:
import pandas as pd
import unicodedata
import numpy as np

import nltk
from nltk.stem.snowball import SnowballStemmer 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\leysh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Загрузка данных

In [3]:
def loadJSONL(path, name):
    df = pd.read_json(path, lines=True)
    #df = df.set_index('idx')
    print(name)
    display(df.head())
    if (df.columns.values == 'label').any():
        s = np.unique(df['label'].to_numpy(), return_counts=True)[1]
        print(f"True answer: {s[1]}")
        print(f"False answer: {s[0]}")
        print("")
    return df

In [4]:
df_train = loadJSONL("DaNetQA/raw_train.jsonl", "Train set")
df_validation = loadJSONL("DaNetQA/raw_val.jsonl", "Validation set")
df_test = loadJSONL("DaNetQA/raw_test.jsonl", "Test set:")

Train set


Unnamed: 0,question,passage,label,idx
0,Вднх - это выставочный центр?,«Вы́ставочный центр» — станция Московского мон...,True,0
1,Вднх - это выставочный центр?,"Вы́ставка достиже́ний наро́дного хозя́йства ,...",True,1
2,Был ли джиган в black star?,Вместе с этим треком они выступили на церемони...,True,2
3,Xiaomi конкурент apple?,"Xiaomi — китайская компания, основанная в 2010...",True,3
4,Был ли автомат калашникова в вов?,Отметив некоторые недостатки и в целом удачную...,False,4


True answer: 1061
False answer: 688

Validation set


Unnamed: 0,question,passage,label,idx
0,Есть ли вода на марсе?,Гидросфера Марса — это совокупность водных зап...,True,0
1,Состоит ли англия в евросоюзе?,В полночь с 31 января на 1 февраля 2020 года п...,False,1
2,Действительно ли в ссср не было адвокатов?,Семён Львович Ария — советский и российский ю...,False,2
3,Была ли чума в оране?,"Чума — это и абсурд, что осмысливается как фор...",True,3
4,Был ли кетчуп в читосе?,Текущий каталог продукции размещен на сайте пр...,True,4


True answer: 412
False answer: 409

Test set:


Unnamed: 0,question,passage,idx
0,Полезна ли ртуть с градусника?,"Отравления ртутью — расстройства здоровья, св...",0
1,Являются ли сапрофаги хищниками?,Фауна лесных почв — совокупность видов животны...,1
2,Водятся ли в индии крокодилы?,"Болотный крокодил, или магер — пресмыкающееся...",2
3,Есть ли в батате крахмал?,"Клубневидно вздутые корни весят до 15 кг, сод...",3
4,Был ли человек в железной маске?,Остров Сент-Маргерит — крупнейший из Лерински...,4


### Очистка данных

In [18]:
class DataCleaner:
    def __init__(self) -> None:
        self.flag_verbose = True

        self.stop_words = stopwords.words('russian')
        self.stemmer = SnowballStemmer('russian')

        self.count_removed_symbols = dict()
        self.count_removed_words = dict()

        self.count_replaced_symbols = dict()
        self.dict_replaced_symbols = dict()

        self.count_replaced_words = dict()
        self.dict_replaced_words = dict()

        self.char_to_remove = ['«', '»', '—', ',', '.', '-', '/', ':', '!', "?", "(", ")", "{", "}", "[", "]", "@", "#", "$", "%", "^", "&", "*", "=", "|", "\\", ">", "<"]
        self.char_to_replace = [['ё', 'е']]

    # функция подсчета количества измененных слов
    def addReplacedWord(self, s_from, s_to = ' '):
        if not self.count_replaced_words.keys().__contains__(s_from):
            self.count_replaced_words[s_from] = 0
        self.count_replaced_words[s_from] += 1
        self.dict_replaced_words[s_from] = s_to

    # функция подсчета количества удаленных слов
    def addRemovedWord(self, w):
        if w == ' ':
            if not self.count_removed_symbols.keys().__contains__(w):
                self.count_removed_symbols[w] = 0
            self.count_removed_symbols[w] += 1

    # функция подсчета количества удаленных символов
    def addReplacedSymbol(self, s_from, s_to = ' '):
        if s_to == ' ':
            if not self.count_removed_symbols.keys().__contains__(s_from):
                self.count_removed_symbols[s_from] = 0
            self.count_removed_symbols[s_from] += 1
        else:
            if not self.count_replaced_symbols.keys().__contains__(s_from):
                self.count_replaced_symbols[s_from] = 0
            self.count_replaced_symbols[s_from] += 1
            self.dict_replaced_symbols[s_from] = s_to

    # удаление знаков ударения и прочих символов unicode
    def unicodeToAscii(self, s):
        tmp = []
        for c in unicodedata.normalize('NFD', s):
            if unicodedata.category(c) != 'Mn':
                tmp.append(c)
            else:
                self.addReplacedSymbol(c)
        return ''.join(tmp)

    # если нужно удалить, то заменяем на пробел чтоб не потерят разделения слов
    def replaceChar(self, s):
        tmp = []
        for i, c in enumerate(s):
            if self.char_to_remove.__contains__(c):
                self.addReplacedSymbol(c, s[i])
                tmp.append(' ')
            else:
                tmp.append(c)
        s = "".join(tmp)

        for s_from, s_to in self.char_to_replace:
            if c == s_from:
                s[i] = s_to
                self.addReplacedSymbol(s_from, s_to)
        return s

    # удаляем лишние пробелы
    def trimSpaces(self, s):
        while s.__contains__('  '):
            s = s.replace('  ', ' ')
        s = s.strip()
        return s

    # удаляем слва из stopwords
    def removeStopWords(self, s):
        tmp = []
        for word in word_tokenize(s):
            if word not in self.stop_words:
                tmp.append(word)
            else:
                self.addRemovedWord(word)
        return " ".join(tmp)

    # удаляем слва из stopwords
    def StemmWords(self, s):
        tmp = []
        for word in word_tokenize(s):
            wordStemmed = self.stemmer.stem(word)
            tmp.append(wordStemmed)
            if word != wordStemmed:
                self.addReplacedWord(word, wordStemmed)
        return " ".join(tmp)

    def clean(self, df, column):
        for i in range(len(df)):
            df[column][i] = self.unicodeToAscii(df[column][i])
            df[column][i] = df[column][i].lower()
            df[column][i] = self.replaceChar(df[column][i])
            df[column][i] = self.removeStopWords(df[column][i])
            df[column][i] = self.StemmWords(df[column][i])
            df[column][i] = self.trimSpaces(df[column][i])
        return df

    # прокси для выключения вывода на экран summary
    def print(self, vals):
        if self.flag_verbose == True:
            print(vals)

    # прокси для выключения вывода на экран summary
    def display(self, vals):
            if self.flag_verbose == True:
                display(vals)

    # сбор лога в dataframe, опциональный вывод на экран 
    def summary(self, verbose = True):
        self.flag_verbose = verbose
        dfs = []

        self.print("===================================")
        self.print("===        Removed Chars        ===")
        self.print("===================================")
        
        cols = ["symbol", "count_removed"]
        dfRemoved = pd.DataFrame(columns=cols)
        for c in self.count_removed_symbols:
            current_df = pd.DataFrame([[c, self.count_removed_symbols[c]]], columns=cols) 
            dfRemoved = pd.concat([dfRemoved, current_df], ignore_index=True)
        self.display(dfRemoved)
        dfs.append(['Removed Chars', dfRemoved])

        self.print("===================================")
        self.print("===        Removed Words        ===")
        self.print("===================================")
        
        cols = ["word", "count_removed"]
        dfRemoved = pd.DataFrame(columns=cols)
        for c in self.count_removed_words:
            current_df = pd.DataFrame([[c, self.count_removed_words[c]]], columns=cols) 
            dfRemoved = pd.concat([dfRemoved, current_df], ignore_index=True)
        self.display(dfRemoved)
        dfs.append(['Removed Words', dfRemoved])

        self.print("===================================")
        self.print("===        Replaced Chars       ===")
        self.print("===================================")
        
        cols = ["symbol_from", "symbol_to", "count_replaced"]
        dfRemoved = pd.DataFrame(columns=cols)
        for c in self.dict_replaced_symbols:
            current_df = pd.DataFrame([[ c, self.dict_replaced_symbols[c], self.count_replaced_symbols[c]]], columns=cols) 
            dfRemoved = pd.concat([dfRemoved, current_df], ignore_index=True)
        self.display(dfRemoved)
        dfs.append(['Replaced Chars', dfRemoved])

        self.print("===================================")
        self.print("===        Stemmed Words        ===")
        self.print("===================================")
        
        cols = ["word_from", "word_to", "count_replaced"]
        dfRemoved = pd.DataFrame(columns=cols)
        for c in self.dict_replaced_words:
            current_df = pd.DataFrame([[ c, self.dict_replaced_words[c], self.count_replaced_words[c]]], columns=cols) 
            dfRemoved = pd.concat([dfRemoved, current_df], ignore_index=True)
        self.display(dfRemoved)
        dfs.append(['Stemmed Words', dfRemoved])

        return dfs

In [19]:
t = DataCleaner()
df_train = t.clean(df_train, 'passage')
df_test = t.clean(df_test, 'passage')
df_validation = t.clean(df_validation, 'passage')
df_train = t.clean(df_train, 'question')
df_test = t.clean(df_test, 'question')
df_validation = t.clean(df_validation, 'question')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column][i] = self.unicodeToAscii(df[column][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column][i] = df[column][i].lower()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column][i] = self.replaceChar(df[column][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column][i] = self.removeStop

In [20]:
dfs = t.summary()

===        Removed Chars        ===


Unnamed: 0,symbol,count_removed


===        Removed Words        ===


Unnamed: 0,word,count_removed


===        Replaced Chars       ===


Unnamed: 0,symbol_from,symbol_to,count_replaced


===        Stemmed Words        ===


Unnamed: 0,word_from,word_to,count_replaced
0,компримирова,компримиров,3
1,бута,бут,2
2,использов,использ,123
3,переоборудов,переоборуд,2
4,оборудов,оборуд,16
...,...,...,...
681,распостра,распостр,1
682,всказыв,всказ,1
683,свидетельствов,свидетельств,1
684,мариху,марих,1


In [None]:
df_train.to_json("DaNetQA/train_c.jsonl", force_ascii=False, lines=True, orient='records')
df_test.to_json("DaNetQA/test_c.jsonl", force_ascii=False, lines=True, orient='records')
df_validation.to_json("DaNetQA/val_c.jsonl", force_ascii=False, lines=True, orient='records')

In [None]:
df=dfs[3]

In [None]:
df

Unnamed: 0,word_from,word_to,count_replaced
0,выставочныи,выставочны,7
1,станция,станц,7
2,московского,московск,26
3,монорельса,монорельс,2
4,расположена,располож,21
...,...,...,...
50960,мусульманину,мусульманин,1
50961,христианке,христианк,1
50962,колоннады,колоннад,1
50963,ростральных,ростральн,1


In [None]:
df.sort_values(by=['count_replaced'],ascending=False)

Unnamed: 0,word_from,word_to,count_replaced
32,года,год,1802
102,году,год,1101
140,также,такж,984
159,является,явля,709
152,время,врем,689
...,...,...,...
32624,триплоидная,триплоидн,1
32623,аутосом,аутос,1
16557,туман,тума,1
16559,подробностеи,подробн,1


## Random Number Generator

In [21]:
from sklearn.metrics import accuracy_score

In [23]:
df_vaidation = pd.read_json("DaNetQA/val_v1.jsonl", lines=True)
df_vaidation.head()

Unnamed: 0,question,passage,label,idx
0,вод марс,гидросфер марс эт совокупн водн запас планет м...,True,0
1,состо англ евросоюз,полноч 31 январ 1 феврал 2020 год центральноев...,False,1
2,деиствительн ссср адвокат,сем львович ар советск россииск юрист крупнеиш...,False,2
3,чум оран,чум эт абсурд осмыслива форм существован зла э...,True,3
4,кетчуп читос,текущ каталог продукц размещ са производител к...,True,4


In [24]:
validation_ft = df_vaidation['label'].to_numpy()
print(validation_ft[0:10])

[ True False False  True  True  True  True  True  True False]


In [25]:
rng_score = []
for _ in range(5):
    validation_pred = [(True if b == 1 else False) for b in np.random.randint(2, size=( len(validation_ft)))]
    rng_score.append(accuracy_score(validation_ft, validation_pred))

In [26]:
rng_score

[0.48721071863581,
 0.5115712545676004,
 0.4920828258221681,
 0.4713763702801462,
 0.5164433617539586]

## TF-IDF + LogisticRegression

In [28]:
import codecs
import json
from sklearn.linear_model import LogisticRegression
import pickle
import joblib

### Model Define

In [29]:
def build_feature_DaNetQA(row):
    res = str(row["question"]).strip()
    label = row.get("label")
    return res, label

In [30]:
def build_features_DaNetQA(path, vect):
    with codecs.open(path, encoding='utf-8-sig') as reader:
        lines = reader.read().split("\n")
        lines = list(map(json.loads, filter(None, lines)))
    res = list(map(build_feature_DaNetQA, lines))
    texts = list(map(lambda x: x[0], res))
    labels = list(map(lambda x: x[1], res))
    ids = [x["idx"] for x in lines]
    return (vect.transform(texts), labels), ids

In [31]:
def fit_DaNetQA(train, labels):
    clf = LogisticRegression()
    return clf.fit(train, labels)

In [32]:
def eval_DaNetQA(train_path, val_path, test_path, vect):
    train, _ = build_features_DaNetQA(train_path, vect)
    val, _ = build_features_DaNetQA(val_path, vect)
    test, ids = build_features_DaNetQA(test_path, vect)
    clf = fit_DaNetQA(*train)
    try:
        test_score = clf.score(*test)
    except ValueError:
        test_score = None
    test_pred = clf.predict(test[0])
    return clf, {
        "train": clf.score(*train),
        "val": clf.score(*val),
        "test": test_score,
        "test_pred": [{"idx": idx, "label": str(label).lower()} for idx, label in zip(ids, test_pred)]
    }

### Load Pre-Trained TF-IDF

In [None]:
!wget https://russiansuperglue.com/tasks/tf_idf
!unzip tf_idf_baseline.zip
!rm tf_idf_baseline.zip

In [6]:
vect = joblib.load("tfidf.pkl")

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


### Score Model

#### On Raw Data

In [7]:
train_path = "DaNetQA/train.jsonl"
val_path = "DaNetQA/val.jsonl"
test_path = "DaNetQA/test.jsonl"

In [8]:
_, DaNetQA_scores = eval_DaNetQA(train_path, val_path, test_path, vect)
print(f'Accuracy on train data = {DaNetQA_scores["train"]}')
print(f'Accuracy on validation data = {DaNetQA_scores["val"]}')

Accuracy on train data = 0.8010291595197255
Accuracy on validation data = 0.5907429963459196


#### On Pre-Cleaned Data

In [9]:
train_path = "DaNetQA/train_c.jsonl"
val_path = "DaNetQA/val_c.jsonl"
test_path = "DaNetQA/test_c.jsonl"

In [10]:
_, DaNetQA_Cleared_scores = eval_DaNetQA(train_path, val_path, test_path, vect)
print(f'Accuracy on train data = {DaNetQA_Cleared_scores["train"]}')
print(f'Accuracy on validation data = {DaNetQA_Cleared_scores["val"]}')


Accuracy on train data = 0.7004002287021155
Accuracy on validation data = 0.5371498172959805


# Fine tune

### Impot

In [None]:
if 0:
    !pip install tensorflow
    !pip install pandas
    !pip install scipy
    !pip install transformers
    !pip install sklearn

In [33]:
import random
import os
import pandas as pd
import numpy as np
import json

import torch
print(f"Cuda is available: {torch.cuda.is_available()}")

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.utils import clip_grad_norm_ as clip_grad_norm 

from transformers import BertTokenizer, BertConfig
from transformers.optimization import AdamW
from transformers import BertForSequenceClassification as BertModel

from scipy.special import expit
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

Cuda is available: True


In [34]:
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [35]:
SEED = 128
MAX_LEN = 256

BATCH_SIZE = 16
BATCH_SIZE_LOADER = 8
EPOCHS_LIMIT = 25
LEARNING_RATE = 3e-5
MAX_GRAD_NORM = 1.0

#### Set Seed

In [36]:
random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

### Utils

In [37]:
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

In [110]:
def collectAttentionMask(seq):
    return [float(i > 0) for i in seq]

In [115]:
def collectTokenType(row, sepTokenIdx):
    row = np.array(row)
    mask = row == sepTokenIdx

    whereMask = np.where(mask)[0]
    idx = whereMask[0]
    idx1 = whereMask[1]

    token_type_row = np.zeros(row.shape[0], dtype=np.int32)
    token_type_row[idx + 1:idx1 + 1] = 1
    return token_type_row

In [116]:
def encode_text_pairs(tokenizer, sentences):
    ENCODE_BATCH_SIZE = 20000
    input_ids, attention_masks, token_type_ids = [], [], []
    
    clsTokenText = '[CLS]'
    sepTokenText = '[SEP]'
    sepTokenIdx = tokenizer.convert_tokens_to_ids(sepTokenText)

    TEXT1_MAX = int(MAX_LEN*.75) # выделяет 75% размера слов для контекста
    TEXT2_MAX = MAX_LEN - TEXT1_MAX # остальные слова это вопрос
    for _, i in enumerate(range(0, len(sentences), ENCODE_BATCH_SIZE)):
        # обрезаем предложение слов больше чем MAX_LEN
        tokenized_texts = []
        for sentence_context, sentence_question  in sentences[i:i + ENCODE_BATCH_SIZE]:
            p1 = [clsTokenText] + tokenizer.tokenize(sentence_context)
            p2 = [sepTokenText] + tokenizer.tokenize(sentence_question) + [sepTokenText]
            final_tokens = p1[:TEXT1_MAX] + p2[:TEXT2_MAX]
            tokenized_texts.append(final_tokens)

        # токенизируем
        b_input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
        b_input_ids = pad_sequences(
            b_input_ids, 
            maxlen=MAX_LEN, 
            dtype='long', 
            truncating='post', 
            padding='post')
        input_ids.append(b_input_ids)

        # маска внимания
        b_attention_masks = [collectAttentionMask(seq) for seq in b_input_ids]
        attention_masks.append(b_attention_masks)

        # тип токена
        b_token_type_ids = [collectTokenType(row, sepTokenIdx) for row in b_input_ids]
        token_type_ids.append(b_token_type_ids)
        
    return np.vstack(input_ids), np.vstack(attention_masks), np.vstack(token_type_ids)

### Model

In [None]:
!wget "http://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_pt.tar.gz"
!tar -xvzf rubert_cased_L-12_H-768_A-12_pt.tar.gz
!rm rubert_cased_L-12_H-768_A-12_pt.tar.gz

/bin/bash: /home/leysh/miniconda3/envs/tf/lib/libtinfo.so.6: no version information available (required by /bin/bash)
/bin/bash: /home/leysh/miniconda3/envs/tf/lib/libtinfo.so.6: no version information available (required by /bin/bash)
/bin/bash: /home/leysh/miniconda3/envs/tf/lib/libtinfo.so.6: no version information available (required by /bin/bash)
wget: /home/leysh/miniconda3/envs/tf/lib/libuuid.so.1: no version information available (required by wget)
--2022-10-27 23:55:14--  http://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_pt.tar.gz
Распознаётся files.deeppavlov.ai (files.deeppavlov.ai)… 178.63.27.41
Подключение к files.deeppavlov.ai (files.deeppavlov.ai)|178.63.27.41|:80... соединение установлено.
HTTP-запрос отправлен. Ожидание ответа… 301 Moved Permanently
Адрес: https://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_pt.tar.gz [переход]
--2022-10-27 23:55:14--  https://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12

In [42]:
print(base_path := os.path.abspath(''))
print(out_dir := os.path.join(base_path, 'out'))
print(model_path := os.path.join(base_path, 'rubert_cased_L-12_H-768_A-12_pt/'))
#print(base_path := os.path.abspath( os.path.join('', '..') ))

d:\New folder\New folder\DSnML_Innopolis2022\00_Final_Attestation
d:\New folder\New folder\DSnML_Innopolis2022\00_Final_Attestation\out
d:\New folder\New folder\DSnML_Innopolis2022\00_Final_Attestation\rubert_cased_L-12_H-768_A-12_pt/


#### Read Dataset

In [161]:
parts = ['train_v1', 'val_v1']
#parts = ['train', 'val', 'test']
print(data_path := os.path.join(base_path, 'DaNetQA'))

d:\New folder\New folder\DSnML_Innopolis2022\00_Final_Attestation\DaNetQA


In [162]:
text1_id, text2_id, label_id, index_id = 'passage', 'question', 'label', 'idx'
l2i = {False: 0, True:1}
part2indices = {p:set() for p in parts}

all_ids, all_sentences, all_labels = [], [], []
for p in parts:
    fname = '{}.jsonl'.format(p)
    df = pd.read_json(os.path.join(data_path, fname), lines=True)
    ids = df[index_id].to_numpy()
    all_ids.extend(ids)
    part2indices[p] = ids
    all_labels.extend(df[label_id].to_numpy())
    all_sentences.extend(
        np.array(
            np.column_stack([df[text1_id].to_numpy(), 
            df[text2_id].to_numpy()])
        ).tolist()
    )

all_ids = np.array(all_ids)

In [164]:
print ('len(total)', len(all_sentences))
i2l = {l2i[l]:l for l in l2i}
print ( 'len(l2i)', len(l2i) )

len(total) 2570
len(l2i) 2


#### One-Hot Encode

In [163]:
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path = os.path.join(base_path, model_path),
    do_lower_case=True,
    max_length=MAX_LEN
)

In [165]:
input_ids, attention_masks, token_type_ids = encode_text_pairs(tokenizer, all_sentences)

In [166]:
label_indices = np.array([l2i[l] for l in all_labels])
labels = np.zeros((input_ids.shape[0], len(l2i)))
for _, i in enumerate(label_indices):
    labels[_, i] = 1

#### Prepeare Data Loaders

In [167]:
def createDataLoader(set_ids, all_ids, input_ids, attention_masks, token_type_ids, all_labels):
    mask = np.array([sid in set_ids for sid in all_ids])
    set_ids = all_ids[mask]

    inputs = input_ids[mask], 
    masks = attention_masks[mask], 
    type_ids_dev = token_type_ids[mask]
    labels = all_labels[mask]

    t_inputs = torch.tensor(inputs)
    t_masks = torch.tensor(masks)
    t_type_ids_dev = torch.tensor(type_ids_dev)
    t_labels = torch.tensor(labels)

    t_dataset = TensorDataset(
        t_inputs, 
        t_masks, 
        t_type_ids_dev, 
        t_labels)
    t_sampler = SequentialSampler(t_dataset)

    return DataLoader(
        t_dataset, 
        sampler=t_sampler, 
        batch_size=BATCH_SIZE_LOADER, 
        worker_init_fn=seed_worker)

In [169]:
test_dataloader = createDataLoader(part2indices['val_v1'], 
    all_ids, input_ids, attention_masks, token_type_ids, labels)
train_dataloader = createDataLoader(part2indices['train_v1'], 
    all_ids, input_ids, attention_masks, token_type_ids, labels)
validate_dataloader = createDataLoader(part2indices['val_v1'], 
    all_ids, input_ids, attention_masks, token_type_ids, labels)

In [None]:
print (f'Training set shape: {input_ids_train.shape}')
print (f'Validation set shape: {input_ids_dev.shape}')

#### Load Pre-Trained BERT model

##### Load config

In [None]:
config_path = os.path.join(base_path, model_path, 'bert_config.json')
conf = BertConfig.from_json_file(config_path)
conf.num_labels = len(l2i)

##### Load weights

In [None]:
output_model_file = os.path.join( base_path, model_path, 'pytorch_model.bin' )

##### Init CUDA model

In [38]:
model = BertModel(conf)

model.load_state_dict(torch.load(output_model_file), strict=False)
model = model.cuda()

NameError: name 'BertModel' is not defined


##### Limit learning for BERT layers

In [None]:
param_optimizer = list(model.named_parameters())

no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
]

#### Optimizer & Scheduler
Задаем гиперпараметры для цикла обучения

In [None]:
nStep = len(train_dataloader)

In [None]:
# This variable contains all of the hyperparemeter information our training loop needs
optimizer = AdamW(
    optimizer_grouped_parameters, 
    lr=LEARNING_RATE, 
    correct_bias=False)
    
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, 
    max_lr=LEARNING_RATE, 
    steps_per_epoch=nStep, 
    epochs=EPOCHS_LIMIT)



#### Train Loop

In [None]:
cv_res = 0
best_dev_score = -1

In [None]:
train_loss = []
iEpoch = 2
for iEpoch in range(EPOCHS_LIMIT):
    pass

Decompose

cycle

In [None]:
model.train() 
torch.cuda.empty_cache()

tr_loss = 0
nb_tr_examples = 0
nb_tr_steps = 0

for step, batch in enumerate(train_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_token_type_ids, b_labels = batch
    
    optimizer.zero_grad()

    outputs = model(
        b_input_ids,
        token_type_ids = b_token_type_ids, 
        attention_mask = b_input_mask, 
        labels = b_labels
        )
    loss, logits = outputs[:2]

    train_loss.append(loss.item())
    loss.backward()
    clip_grad_norm(model.parameters(), MAX_GRAD_NORM)

    optimizer.step()
    scheduler.step()

    epochLoss = loss.item()
    tr_loss += epochLoss
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1
    
    print(f"Step {step} of {nStep}, loss = {epochLoss}")
avg_train_loss = tr_loss/nb_tr_steps

Step 0 of 161, loss = 0.5834347093477845
Step 1 of 161, loss = 0.6012287070043385
Step 2 of 161, loss = 0.4539189119823277
Step 3 of 161, loss = 0.38429474097210914
Step 4 of 161, loss = 0.41969503299333155
Step 5 of 161, loss = 0.7280048234388232
Step 6 of 161, loss = 0.296478355769068
Step 7 of 161, loss = 0.5101183207007125
Step 8 of 161, loss = 0.5604927400127053
Step 9 of 161, loss = 0.44689390575513244
Step 10 of 161, loss = 0.6486014351248741
Step 11 of 161, loss = 0.4079419504851103
Step 12 of 161, loss = 0.6168186126742512
Step 13 of 161, loss = 0.5110041471198201
Step 14 of 161, loss = 0.5091695527080446
Step 15 of 161, loss = 0.48808223905507475
Step 16 of 161, loss = 0.5731303720385768
Step 17 of 161, loss = 0.5737245150376111
Step 18 of 161, loss = 0.3986828844062984
Step 19 of 161, loss = 0.4671868961304426
Step 20 of 161, loss = 0.6433931519277394
Step 21 of 161, loss = 0.6100153351435438
Step 22 of 161, loss = 0.3719248389825225
Step 23 of 161, loss = 0.318731798324734


In [None]:
### val
model.eval()

predictions = []
tr_loss = 0
nb_tr_steps = 0
for step, batch in enumerate(prediction_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_token_type_ids, b_labels = batch

    with torch.no_grad():
        outputs = model(
        b_input_ids,
        token_type_ids = b_token_type_ids, 
        attention_mask = b_input_mask, 
        labels = b_labels
        )
        loss, logits = outputs[:2]
        tr_loss += loss.item()
        nb_tr_steps += 1

    logits = logits.detach().cpu().numpy()
    predictions.append(logits)
predictions = expit(np.vstack(predictions))
edev_loss = tr_loss/nb_tr_steps

y_indices, pred = np.argmax(labels_dev, axis=1), np.argmax(predictions, axis=1)
dev_acc = accuracy_score(y_indices, pred)*100
print(f'Epoch {iEpoch} average train_loss: {avg_train_loss:.6f} dev_loss: {edev_loss:.6f} dev_acc {dev_acc:.2f}%')

Epoch 2 average train_loss: 0.529285 dev_loss: 0.383752 dev_acc 87.45%


In [None]:
if dev_acc>best_dev_score: # compute result for test part and store to out file, if we found better model
    best_dev_score = dev_acc
    cv_res = best_dev_score

    predictions, true_labels = [], []
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_token_type_ids, b_labels = batch
        with torch.no_grad():
            outputs = model( b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask, labels=b_labels )
        
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.append(logits)
        true_labels.append(label_ids)
    predictions = expit(np.vstack(predictions))
    true_labels = np.concatenate(true_labels)
    assert len(true_labels) == len(predictions)
    recs = []
    for idx, l, row in zip(test_ids, true_labels, predictions):
        gt = i2l[np.argmax(l)]
        pred = i2l[np.argmax(row)]
        recs.append( (idx, gt, pred) )

In [None]:
dev_acc = cv_res
print (f'\scores: {dev_acc:.2f}')