In [1]:
# from src.model.data_loader import ParagraphDataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from torch.utils.data import Dataset, DataLoader
import pickle
from src.model.loss import ParagraphLoss
from transformers import AdamW
from tqdm import tqdm
import math
from src.model.logger import Logger
import os
import pandas as pd
import rouge
# from src.model.generate_utils  import generate_paragraph
# from src.model.eval_utils import evaluate_doc_model
from src.model.model import GPT2BaseModel


ModuleNotFoundError: No module named 'transformers.modeling_gpt2'

In [None]:
class ParagraphDataset(Dataset):
    """Fixed-length token rows built from a tab-separated paragraph file.

    Each usable line of ``data_file`` has exactly 7 tab-separated columns. The
    columns read here are:

    * column 0 - an identifier whose last ``_``-separated part is an integer
      paragraph index;
    * column 2 - keywords separated by the literal ``[SEP]``;
    * column 4 - an integer; a paragraph whose index equals ``column4 - 1`` is
      tagged as the concluding one (presumably the paragraph count of the
      document - confirm against the data preparation script);
    * column 5 - the target text (``<o>`` markers are stripped).

    The first line of the file is never used (the scan starts at line 1), which
    presumably skips a header row.

    Every item is a row of ``n_ctx + n_gen`` token ids laid out as::

        [0]                 bos token
        [1 .. n_ctx-2]      keywords, each followed by `_kw_`; the last `_kw_`
                            is replaced by `_endkw_`; remaining slots stay 0
        [n_ctx-1]           discourse tag (`_i_` / `_b_` / `_c_`) or cls token
        [n_ctx ..]          target tokens followed by the eos token, then 0

    Args:
        data_file: path to the tab-separated file.
        encoder: tokenizer exposing ``encode``, ``convert_tokens_to_ids`` and
            the ``bos_token_id`` / ``eos_token_id`` / ``cls_token_id`` attributes.
        max_size: keep only the first ``max_size`` usable rows (``None`` = all).
        n_ctx: size of the context part, including the bos and delimiter slots.
        n_gen: size of the generation part, including the eos slot.
        include_neigh: load one pickled record per file line from a side file
            and return its third element as the third item component.
        include_discourse_type: use the `_i_`/`_b_`/`_c_` tag as delimiter
            instead of the cls token.
        include_kw: write the keywords into the context part.
        dim: length of the all-zero placeholder returned when
            ``include_neigh`` is false.
        debug_mode: read the side file ``*_gpt.pkl`` instead of ``*_gpt2.pkl``.
    """

    def __init__(self, data_file, encoder, max_size=None, n_ctx=102, n_gen=401, include_neigh=False,
                 include_discourse_type=True, include_kw=True, dim=0, debug_mode=False):
        with open(data_file, "rb") as f:
            self.data = f.readlines()

        if include_neigh:
            self.prev = []
            stem = ".".join(data_file.split(".")[:-1])
            fn = stem + ("_gpt.pkl" if debug_mode else "_gpt2.pkl")
            # SECURITY: pickle.load can execute arbitrary code; only point this
            # at side files produced by this project.
            with open(fn, 'rb') as fp:
                for k in range(len(self.data)):
                    temp = pickle.load(fp)
                    # Each record must line up with the k-th line of the data file.
                    assert temp[0] == k and temp[1] == self.data[k].decode('utf-8', 'ignore').split("\t")[-1].replace(
                        "<o>", "").strip()
                    self.prev.append(temp[2])
        else:
            self.prev = None

        # Indices (into self.data) of the lines that can be turned into items.
        self.dids = []
        for d in range(1, len(self.data)):
            t = self.data[d].decode("utf-8", "ignore").strip().split('\t')
            if len(t) != 7 or t[5].replace("<o>", "").strip() == "":
                continue
            try:
                # Both fields are parsed again in __getitem__; reject rows
                # where they are not integers instead of failing mid-epoch.
                int(t[0].split("_")[-1])
                int(t[4])
            except ValueError:
                continue
            self.dids.append(d)

        if max_size is not None:
            self.dids = self.dids[:max_size]
        self.encoder = encoder
        self.ctx = n_ctx - 2  # slots available for keywords
        self.gen = n_gen - 1  # slots available for target tokens
        self.dim = dim
        self.len = len(self.data)  # number of raw lines, NOT len(self)
        self.include_neigh = include_neigh
        self.include_discourse_type = include_discourse_type
        self.include_kw = include_kw

    def __getitem__(self, index):
        """Return ``(token_ids, mask, neighbour_vector)`` for the ``index``-th usable row."""
        idx = self.dids[index]
        csv_data = self.data[idx].decode("utf-8", "ignore").strip().split('\t')
        kws = csv_data[2].split("[SEP]")
        tgt_phrase = self.encoder.encode(csv_data[5].replace("<o>", ""), add_special_tokens=False)[:self.gen]

        # Resolve only the delimiter that is actually needed, so a tokenizer
        # without a cls token still works when discourse tags are enabled.
        if self.include_discourse_type:
            par_idx = int(csv_data[0].split("_")[-1])
            if par_idx == 0:
                tag = '_i_'
            elif par_idx == int(csv_data[4]) - 1:
                tag = '_c_'
            else:
                tag = '_b_'
            delimiter = self.encoder.convert_tokens_to_ids(tag)
        else:
            delimiter = self.encoder.cls_token_id

        total = self.ctx + self.gen + 3
        pad_output = torch.zeros(total).long()
        mask_output = torch.zeros(total).long()

        pad_output[0] = self.encoder.bos_token_id
        if self.include_kw:
            kw_id = self.encoder.convert_tokens_to_ids('_kw_')
            endkw_id = self.encoder.convert_tokens_to_ids('_endkw_')
            i = 1  # next free slot in the keyword area
            for k in kws:
                if i - 1 >= self.ctx:
                    break
                # Truncate so the keyword plus its separator fit in the area.
                enck = self.encoder.encode(k.strip(), add_special_tokens=False)[:self.ctx - i]
                pad_output[i:i + len(enck)] = torch.LongTensor(enck)
                pad_output[i + len(enck)] = kw_id
                i += len(enck) + 1
            # The separator written last becomes the end-of-keywords marker.
            pad_output[i - 1] = endkw_id
            mask_output[0:i] = 1

        tgt_start = self.ctx + 2
        pad_output[self.ctx + 1] = delimiter
        pad_output[tgt_start:tgt_start + len(tgt_phrase)] = torch.LongTensor(tgt_phrase)
        pad_output[tgt_start + len(tgt_phrase)] = self.encoder.eos_token_id

        # Delimiter, target tokens and eos are all attended to.
        mask_output[self.ctx + 1:self.ctx + 1 + len(tgt_phrase) + 2] = 1

        if self.include_neigh:
            n = torch.FloatTensor(self.prev[idx].flatten())
        else:
            # NOTE(review): float64 here vs. float32 in the branch above - kept
            # as in the original; confirm which dtype the model expects.
            n = torch.zeros(self.dim, dtype=torch.float64)
        return pad_output, mask_output, n

    def __len__(self):
        """Number of usable rows (after filtering and ``max_size``)."""
        return len(self.dids)

In [None]:
# Tokenizer for the ruGPT-3 small checkpoint.
encoder = GPT2Tokenizer.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2", add_prefix_space=True)

# NOTE(review): the registration below is disabled, yet ParagraphDataset looks up
# '_kw_', '_endkw_', '_i_', '_b_' and '_c_' via convert_tokens_to_ids. With the
# tokens unregistered they presumably all resolve to the same unknown-token id -
# confirm whether this block should be re-enabled.
# encoder.add_special_tokens({'bos_token':'_start_',
#                                      'cls_token':'_classify_',
#                                      'eos_token':'_end_',
#                                      'additional_special_tokens': ['_kw_','_endkw_', '_t_', '_i_', '_b_', '_c_']
#                                     })

# Only the keyword separator is registered as a special token.
encoder.add_special_tokens({'additional_special_tokens': ['[SEP]']
                                    })

In [None]:
# Settings shared by the dataset construction, optimizer and training loop below.
config = {
    # Dataset options (forwarded to ParagraphDataset).
    "include_kw": True,
    "include_discourse_type": True,
    "max_size": 10,  # keep only the first 10 usable rows of each split
    "n_ctx": 102,
    # NOTE(review): ParagraphDataset's own default for n_gen is 401 and
    # GPT2BaseModel is later built with gen_len=401 - confirm 400 is intended.
    "gen_len": 400,
    "include_neigh": False,
    "dim": 768,
    # AdamW hyper-parameters: learning rate, betas and epsilon.
    "lr": 6.25e-5,
    "b1": 0.9,
    "b2": 0.999,
    "e": 1e-8,
    "num_epochs": 1
}

In [None]:
# Both splits are built with exactly the same options; only the file differs.
dataset_kwargs = dict(
    max_size=config["max_size"],
    n_ctx=config["n_ctx"],
    n_gen=config["gen_len"],
    include_neigh=config["include_neigh"],
    include_discourse_type=config["include_discourse_type"],
    include_kw=config["include_kw"],
    dim=config["dim"],
)

train_dataset = ParagraphDataset('dataset/plot/train_encoded.csv', encoder, **dataset_kwargs)

# Built from the validation file; the name `test_dataset` is kept for later cells.
test_dataset = ParagraphDataset('dataset/plot/val_encoded.csv', encoder, **dataset_kwargs)

In [None]:
# Training batches of 2, reshuffled every epoch; an incomplete last batch is dropped.
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, num_workers=0, drop_last=True)
# Validation is read one sample at a time, in file order.
val_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=0, drop_last=True)

In [None]:
# Sanity check: decode the token row of every validation sample, keeping
# special tokens visible so the layout can be inspected.
for batch in val_loader:
    # print(batch[0][0].size())
    print(encoder.decode(batch[0][0], skip_special_tokens=False))

In [None]:
model = GPT2LMHeadModel.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2", output_hidden_states=True)

In [None]:
def run_batch(model, args, device, compute_loss_fct):
    """Run one forward pass and return the mean loss for the batch.

    Args:
        model: callable invoked as ``model(*args)`` with the moved batch.
        args: sequence of batch components; entries may be ``None``. The first
            two entries are also handed to the loss function.
        device: target device passed to each component's ``.to``.
        compute_loss_fct: callable invoked as
            ``compute_loss_fct(output, args[0], args[1])``.

    Returns:
        ``.mean()`` of whatever ``compute_loss_fct`` returns.
    """
    # `.to` returns a new object, so the results must be collected; rebinding
    # the loop variable (as before) left the originals on their old device.
    moved = [arg.to(device) if arg is not None else None for arg in args]

    output = model(*moved)

    allloss = compute_loss_fct(output, moved[0], moved[1])

    return allloss.mean()

In [None]:
def run_epoch(bestloss, start_iter, running_loss, model, compute_loss_fct, model_opt, train_loader, val_loader, train_log_interval, val_log_interval, device, beam, gen_len, k,p, decoding_strategy, accum_iter, desc_str, save_dir, logger, text_encoder, show_progress=False, summary_loss=None, my_local_dir='checkpoints_local'):
    '''
    Run a single training epoch with gradient accumulation and log the training loss.

    The iteration counter starts at ``start_iter`` and keeps counting across
    epochs. The optimizer steps whenever the counter is a multiple of
    ``accum_iter``; the accumulated loss is logged and reset every
    ``train_log_interval`` optimizer steps.

    Validation and checkpointing are currently disabled (see the TODO blocks),
    so ``bestloss`` is returned unchanged and ``val_loader``,
    ``val_log_interval``, ``beam``, ``gen_len``, ``k``, ``p``,
    ``decoding_strategy``, ``save_dir``, ``text_encoder``, ``summary_loss`` and
    ``my_local_dir`` are accepted but unused.

    Returns:
        ``(next_start_iter, running_loss, bestloss, num_updates)``. When
        ``train_loader`` yields no batches, ``next_start_iter`` equals
        ``start_iter`` and nothing is trained.
    '''
    train_bar = tqdm(iterable=train_loader, desc=desc_str) if show_progress else train_loader

    # Number of batches covered by one logging window.
    window = train_log_interval * accum_iter

    # Defaults for an empty loader; without them the return statement below
    # would reference unbound names.
    i = start_iter - 1
    num_updates = max(i, 0) // accum_iter

    for i, batchargs in enumerate(train_bar, start_iter):

        num_updates = i // accum_iter
        model.train()
        loss = run_batch(model, batchargs, device, compute_loss_fct)

        loss.backward()

        running_loss += float(loss.detach().item())
        if show_progress:
            if num_updates % train_log_interval == 0 and num_updates != 0:
                seen = window
            else:
                # i % window is 0 only for i == 0; avoid dividing by zero there.
                seen = max(i % window, 1)
            train_bar.set_postfix(loss=running_loss / seen)

        if i % accum_iter == 0:
            model_opt.step()
            model_opt.zero_grad()
            torch.cuda.empty_cache()
        if num_updates % train_log_interval == 0 and i % accum_iter == 0:
            logger.scalar_summary("Training", num=running_loss, denom=window, step=num_updates)
            print("training loss %.2f" % (running_loss/float(window)))
            running_loss = 0

        # TODO: periodic validation + checkpointing, disabled for now.
        # if num_updates % 1000 == 0 and i % accum_iter == 0:
        #     val_loss, scores = evaluate(val_loader, train_log_interval, model, text_encoder, device, beam, gen_len, k, p, decoding_strategy, compute_loss_fct, min_len=args.min_len)

        #     logger.scalar_summary("Validation", num=val_loss, denom=len(val_loader), step=num_updates)
        #     # if sum(val_loss) < bestloss or bestloss == -1:
        #     lv = get_loss_value(val_loss, len(val_loader))
        #     if (not math.isnan(lv)) and (bestloss == -1 or lv < bestloss):
        #         bestloss = lv
        #         save_checkpoint(i + 1, running_loss, model.state_dict(), model_opt.state_dict(), save_dir, my_local_dir)


    # TODO: end-of-epoch validation + checkpointing, disabled for now.
    # val_loss, scores = evaluate(val_loader, train_log_interval, model, text_encoder, device, beam, gen_len, k, p, decoding_strategy, compute_loss_fct, min_len=args.min_len)
    # for key, value in scores.items():
    #     for key2, value2 in value.items():
    #         logger.rouge_summary("{}/{}".format(key, key2), value2, num_updates)
    # print("Validation rouge: " + str(scores.items()))
    # logger.scalar_summary("Validation", num=val_loss, denom=len(val_loader), step=num_updates)
    # lv = get_loss_value(val_loss, len(val_loader))
    # if (not math.isnan(lv)) and (bestloss == -1 or lv < bestloss):
    #     bestloss = lv
    #     save_checkpoint(i + 1, running_loss, model.state_dict(), model_opt.state_dict(), save_dir, my_local_dir)


    torch.cuda.empty_cache()
    return i + 1, running_loss, bestloss, num_updates # , lv

In [None]:
# Names and locations used by the training run.
output_dir = 'savedir'
experiment_name = 'gpt3'
desc = "Desc"
data_dir = 'dataset/plot'

experiment_dir = os.path.join(output_dir, experiment_name)
save_dir = os.path.join(experiment_dir, "checkpoints")
log_dir = os.path.join(experiment_dir, "logs")
save_dir_local = "checkpoints_local"

print("Creating directories")
# makedirs creates missing parents, so output_dir and the experiment directory
# come into existence together with their sub-directories.
for directory in (log_dir, save_dir, save_dir_local):
    os.makedirs(directory, exist_ok=True)

In [None]:
# Run-level constants handed to run_epoch.
train_log_interval = 4  # log the training loss every 4 optimizer updates
val_log_interval = 4
# Decoding settings; run_epoch only uses them in its disabled validation code.
beam = 0
p = 90  # NOTE(review): presumably top-p as a percentage (model.generate below uses 0.90) - confirm
k = 0
decoding_strategy = 0
accum_iter = 4  # batches accumulated per optimizer step
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger = Logger(log_dir)

In [None]:
# Per-element losses (no reduction); ParagraphLoss receives this criterion.
criterion = torch.nn.CrossEntropyLoss(reduction="none")

# Optimize only the parameters that require gradients.
# NOTE(review): AdamW is imported from `transformers` at the top of the notebook;
# that class is deprecated upstream in favour of torch.optim.AdamW - confirm
# against the installed transformers version.
model_opt = AdamW(filter(lambda p : p.requires_grad, model.parameters()),
                        lr=config['lr'],
                        betas=(config['b1'], config['b2']),
                        eps=config['e'])



lm_loss = ParagraphLoss(criterion, n_ctx=config["n_ctx"], gen_len=config["gen_len"])

In [None]:
bestloss = -1
start_iter, running_loss = 1, 0
# Kept for the validation-based early stopping that will return once run_epoch
# reports a validation loss again.
prevloss = 1000

for i in range(config['num_epochs']):
    # run_epoch returns four values; validation is disabled inside it, so there
    # is no validation loss to unpack or to stop early on.
    start_iter, running_loss, bestloss, updates = run_epoch(
        bestloss, start_iter, running_loss, model, lm_loss, model_opt,
        train_loader, val_loader, train_log_interval, val_log_interval, device,
        beam, config['gen_len'], k, p, decoding_strategy, accum_iter,
        "FT Training Epoch [{}/{}]".format(i + 1, config['num_epochs']),
        save_dir, logger, encoder, show_progress=True,
        # Pass the directory created earlier, not the literal variable name.
        my_local_dir=save_dir_local)
    print("Epoch {} done, optimizer updates so far: {}".format(i + 1, updates))
    # TODO: restore `if val_loss > prevloss or math.isnan(val_loss): break`
    # when validation is re-enabled in run_epoch.

# Generation

In [None]:
df = pd.read_csv('dataset/plot/val_encoded.csv', sep='\t')
print(df.head())

In [None]:
context = " ".join(df['[KEYWORDS]'][0].split('[SEP]'))

In [None]:
input_ids = encoder.encode(context, return_tensors='pt')
print(context)
print(input_ids.size())

In [None]:
beam_output = model.generate(
    input_ids, 
    max_length=400, 
    num_beams=5, 
    no_repeat_ngram_size=2,
    early_stopping=True
)

print("Output:\n" + 100 * '-')
print(encoder.decode(beam_output[0], skip_special_tokens=True))

In [None]:
pad_output, attention_mask, n = next(iter(val_loader))

In [None]:
# Split the padded row into the prompt (first n_ctx columns) and the target.
input_toks = pad_output[:, :config['n_ctx']] # includes delimiter
target_toks = pad_output[:, config['n_ctx']:]
# Keep only the target columns that are unmasked in the FIRST row of the batch
# (val_loader uses batch_size=1, so that is the only row).
target_toks = target_toks[:, attention_mask[:, config['n_ctx']:][0] == 1]

# XMB is the same slice as input_toks; `mask` is the matching attention mask.
XMB = pad_output[:, :config['n_ctx']]
mask = attention_mask[:, :config['n_ctx']]

In [None]:
print(encoder.decode(XMB[0], skip_special_tokens=True, clean_up_tokenization_spaces=False))
print(encoder.decode(target_toks[0], skip_special_tokens=True))

In [None]:
# Beam-search continuation of the keyword prompt, returning two sequences.
# NOTE(review): XMB carries 0-padding between the keywords and the delimiter
# (see ParagraphDataset); `mask` marks those slots as not attended.
beam_output = model.generate(
    XMB, 
    attention_mask=mask,
    min_length = 100,
    max_length=512, 
    num_beams=5, 
    no_repeat_ngram_size=2,
    early_stopping=True, 
    eos_token_id=encoder.eos_token_id,
    num_return_sequences=2    
)

print("Context:\n" + 100 * '-')
print(encoder.decode(XMB[0], skip_special_tokens=True))
print()
print("Output:\n" + 100 * '-')
# Drop the first n_ctx tokens (the prompt) and decode the first returned sequence.
print(encoder.decode(beam_output[:, config['n_ctx']:][0], skip_special_tokens=True))

In [None]:
refs = encoder.decode(XMB[0], skip_special_tokens=True)
hyps1 = encoder.decode(beam_output[:, config['n_ctx']:][0], skip_special_tokens=True)
hyps2 = encoder.decode(beam_output[:, config['n_ctx']:][1], skip_special_tokens=True)

print("Context:\n" + 100 * '-')
print(refs)
print("Output:\n" + 100 * '-')
print(hyps1)
print(hyps2)

In [None]:
def get_average_scores(hyps, refs):
    """Score ``hyps`` against ``refs`` with a fresh ``rouge.Rouge`` scorer.

    Both arguments are forwarded unchanged to ``Rouge.get_scores`` with
    ``avg=True``; its result is returned as-is.
    """
    return rouge.Rouge().get_scores(hyps, refs, avg=True)

In [None]:
print(get_average_scores(hyps2, refs))
# scores = rouge_scorer.get_scores(hyps, hyps, avg=True)
# print(scores)

In [None]:
# Nucleus-sampling continuation of the same prompt (top_p=0.90, top_k disabled).
# With num_return_sequences left commented out, sample_output holds one sequence
# per prompt row.
sample_output = model.generate(
    XMB, 
    attention_mask=mask,
    max_length=512, 
    do_sample=True,
    top_p=0.90, 
    top_k=0,
    eos_token_id=encoder.eos_token_id,
    min_length = 100,
    # num_return_sequences=2, 
    temperature=0.7, # not used in PlotMachines
    # no_repeat_ngram_size=2, # not used in PlotMachines
    repetition_penalty = 1.5
)

In [None]:
XMB[0].size()

In [None]:
refs = encoder.decode(XMB[0], skip_special_tokens=False)
hyps1 = encoder.decode(sample_output[:, config['n_ctx']:][0], skip_special_tokens=False)

print("Context:\n" + 100 * '-')
print(refs)
print("Output1:\n" + 100 * '-')
print(hyps1)

In [None]:
refs = encoder.decode(XMB[0], skip_special_tokens=True)
hyps1 = encoder.decode(sample_output[:, config['n_ctx']:][0], skip_special_tokens=True)
hyps2 = encoder.decode(sample_output[:, config['n_ctx']:][1], skip_special_tokens=True)

print("Context:\n" + 100 * '-')
print(refs)
print("Output1:\n" + 100 * '-')
print(hyps1)
print("Output2:\n" + 100 * '-')
print(hyps2)

In [None]:
print("hyps1:", get_average_scores( hyps1, refs))
print("hyps2:", get_average_scores( hyps2, refs))

In [None]:
from src.model.generate_utils import toks_to_str

In [None]:
encoder._convert_id_to_token()

In [None]:
import codecs

In [None]:
from tokenizers.decoders import ByteLevel
decoder = ByteLevel()


In [None]:
# Manually rebuild the text of the first sampled sequence token by token,
# printing each id next to its byte-level-decoded form along the way.
str_rep = []
# NOTE(review): '_end_' is only registered in the commented-out
# add_special_tokens call near the top, so this lookup presumably does not
# return the real end-of-sequence id - confirm (encoder.eos_token_id?).
end_tok = encoder.convert_tokens_to_ids('_end_')

for token in sample_output[0]:
    print(token.item(), repr(decoder.decode([ encoder.convert_ids_to_tokens(token.item(), skip_special_tokens=True)])))
    if token.item() == end_tok : #or token.item() == 0:# or x.item() == end_idx:
        break        
    str_rep.append(encoder.convert_ids_to_tokens(token.item()))

str_rep = encoder.convert_tokens_to_string(str_rep)

# This makes sure rouge scorers doesn't complain about no sentences
if not str_rep:
    str_rep = "unk."
elif "." not in str_rep:
    str_rep += "."

# Compare the tokenizer's own decoding with the manual reconstruction.
print(encoder.decode(sample_output[0], skip_special_tokens=False, clean_up_tokenization_spaces=False))
print("-"*50)
print(str_rep)

In [None]:
toks_to_str(sample_output[0], encoder)

# Evaluate the document model

In [None]:
class Config:
    repeattheta = 1.5
    output_attentions = True

In [None]:
args = Config()

In [None]:
vocab = len(encoder)

In [None]:
doc_model = GPT2BaseModel(args, vocab=vocab, n_ctx=config['n_ctx'], gen_len=401, lastidx=encoder.eos_token_id, includeprev=False, device='cpu')

In [None]:
# NOTE(review): `evaluate_doc_model` is never imported - its import from
# src.model.eval_utils is commented out in the first cell - so this call raises
# NameError on a fresh kernel.
evaluate_doc_model(model=doc_model, val_loader=val_loader, text_encoder=encoder, device='cpu', beam=0, gen_len=401, k=0, p=90, save_file='out', max_len=512, gen_dir=None, tgt_dir=None, min_len=100)

In [None]:
import json

In [None]:
with open('text.txt', 'w', encoding='utf-8') as f:
    json.dump("Моя строка", f, ensure_ascii=False)

In [None]:
df = pd.read_csv('generated/test.gens.tsv', sep='\t', header=None, names=['id', 'plot', 'context', 'part', 'text'])
df.head()

In [None]:
df.text[0]

# Rake

In [None]:
from rake_nltk import Rake
import os
import re
from nltk.corpus import stopwords

In [None]:
print(stopwords.words('russian'))

In [None]:
def sorting(lst):
    """Return a new list with the items of ``lst`` ordered by ascending length.

    The sort is stable, so items of equal length keep their input order;
    ``lst`` itself is left untouched.
    """
    return sorted(lst, key=len)

In [None]:
def clean_top_features(keywords, top=10):
    """Drop keywords that are prefixes of the previously kept one; keep at most ``top``.

    Keywords are visited from longest to shortest. A keyword is skipped when the
    most recently kept keyword starts with it; otherwise it is kept.

    Args:
        keywords: list of keyword strings (not modified).
        top: maximum number of keywords to return.

    Returns:
        The kept keywords, longest first. An empty input gives an empty list
        (previously this raised ``IndexError``).
    """
    if not keywords:
        return []

    # Longest first; ties end up in reverse input order, exactly as when the
    # ascending stable sort was walked backwards.
    by_length_desc = sorted(keywords, key=len)[::-1]

    newkeys = [by_length_desc[0]]
    for candidate in by_length_desc[1:]:
        if newkeys[-1].startswith(candidate):
            continue
        newkeys.append(candidate)

    return newkeys[:top]

In [None]:
files = os.listdir('dataset/raw') 
print(len(files))

In [None]:
# NOTE(review): stopwords.words() with no argument returns the stop words of
# every language NLTK ships, not only Russian - confirm this is intended
# (stopwords.words('russian') is what the earlier cell prints).
r = Rake(language='russian', stopwords=stopwords.words())
topK = 10

# NOTE(review): files[0] depends on os.listdir order, which is arbitrary.
# Read the file, then release the handle before any processing.
with open(os.path.join('dataset/raw', files[0]), 'r', encoding='utf-8') as f:
    text = f.read()
print(repr(text))

# Split into paragraphs, then into sentences by treating '?', '!' and ':' as '.'.
pars = text.strip().split('\n')
sentences = []
for par in pars:
    # Raw string: "\?" in a normal string literal is an invalid escape sequence.
    sentences.extend(re.sub(r"[?!:]", ".", par).strip(".").split("."))
print('\n', sentences)

r.extract_keywords_from_sentences(sentences)
top_features = r.get_ranked_phrases()
print('\n', top_features)
if len(top_features) > topK:
    top_features = top_features[:topK]
print('\n', top_features)
# top_features = clean_top_features(top_features, 10)
# print('\n', top_features)

# Raw Dataset

temperature = 0.9, top_k = 0, top_p = 0.95

In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from torch.utils.data import Dataset, DataLoader
from src.model.data_full import RawFilesDataset
import os
from sklearn.model_selection import train_test_split

In [3]:
corpus1_path = 'dataset/raw'
corpus2_path = 'dataset/raw_other'
# os.listdir returns entries in arbitrary order; sort them so the file lists
# (and every split derived from them) are the same on every machine and run.
corpus1_files = [os.path.join(corpus1_path, name) for name in sorted(os.listdir(corpus1_path))]
corpus2_files = [os.path.join(corpus2_path, name) for name in sorted(os.listdir(corpus2_path))]

In [4]:
# Fixed seed: without it every run draws a different split, so a checkpoint
# trained in one session could be evaluated on files it was trained on.
SPLIT_SEED = 42

# 60% train / 20% validation / 20% test over the first corpus.
train, val_test = train_test_split(corpus1_files, test_size=0.4, random_state=SPLIT_SEED)
val, test = train_test_split(val_test, test_size=0.5, random_state=SPLIT_SEED)

# The second corpus is used for training only.
train.extend(corpus2_files)

In [6]:
# Tokenizer for the raw-text experiments, with explicit sequence boundaries.
encoder = GPT2Tokenizer.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2", add_prefix_space=True)

# The call reports 1 newly added token (see the cell output); '<s>' and '</s>'
# encode to ids 1 and 2 in a later cell, so presumably only '[SEP]' is new.
# NOTE(review): a model used with this tokenizer presumably needs
# resize_token_embeddings(len(encoder)) - confirm.
encoder.add_special_tokens({'bos_token': '<s>',                                     
                                     'eos_token': '</s>',
                                     'additional_special_tokens': ['[SEP]']
                                    })


1

In [13]:
# Raw-file datasets for the train/validation splits; both use the same
# positional size argument (2048) and n_ctx=70.
# NOTE(review): these names shadow the ParagraphDataset-based
# train_dataset/train_loader/val_loader defined earlier in the notebook.
train_dataset = RawFilesDataset(train, encoder, 2048, n_ctx=70)
train_loader = DataLoader(train_dataset, 4, shuffle=True)

val_dataset = RawFilesDataset(val, encoder, 2048, n_ctx=70)
val_loader = DataLoader(val_dataset, 4, shuffle=False)

In [16]:
print(len(val_dataset))

72


In [14]:
print(next(iter(train_loader)))

{'sample': tensor([[   1,  262, 2133,  ...,    2,    2,    2],
        [   1, 2936,  315,  ...,    2,    2,    2],
        [   1,  436,  272,  ...,    2,    2,    2],
        [   1,  298, 2147,  ...,    2,    2,    2]]), 'mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'label': tensor([[   1,  262, 2133,  ..., -100, -100, -100],
        [   1, 2936,  315,  ..., -100, -100, -100],
        [   1,  436,  272,  ..., -100, -100, -100],
        [   1,  298, 2147,  ..., -100, -100, -100]])}


In [12]:
text = "<s> какой то текст </s>"

encoded_ids = encoder.encode(text)
print(encoded_ids)

[1, 1851, 502, 6529, 2]


In [2]:
# from transformers.models import GPT2LMHeadModel

# Load the saved baseline checkpoint object as-is into `model_2`.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source; no map_location is given, so a checkpoint saved on GPU
# presumably needs CUDA to load - confirm.
with open('savedir/baseline/checkpoints/checkpoint.pt', 'rb') as f:
    # model.load_state_dict(torch.load(f))
    model_2 = torch.load(f)