In [None]:
from structure_aware_text_generation.structure_extractor import StructureExtractor, StructureSet
from structure_aware_text_generation.generator import Generator
from structure_aware_text_generation.evaluator import Evaluator
import json

# Number of documents kept from the start of the corpus for this demo run.
N_TEXTS = 200

# JSON is UTF-8 by specification; pass the encoding explicitly so the read
# does not depend on the platform's locale default.
with open('intelligencer_texts.json', encoding='utf-8') as f:
    texts = json.load(f)[:N_TEXTS]

# Preview one document (the displayed value is a list of sentence strings).
texts[2]

['If the president in question was anyone else, the suggestion from a prominent supporter, Jerry Falwell Jr., that he was owed an extension of his term as reparations for the Mueller investigation could be dismissed as mere rhetorical excess combined with an effort to taunt liberals for their interest in reparations for the descendants of slaves.',
 'But because Donald Trump has repeatedly fanned claims that he regards anything that gets in his way say, an adverse 2020 election as an extra-constitutional coup, its not so easy to laugh this off: The president retweeted this little stink bomb and added his own pithy characterization of the investigation his own administration began: Mueller aside, Trump has often defended his own chronic assaults on the rule of law by accusing those seeking to restrain him, via lawsuits, congressional investigations, or the constitutionally sanctioned method of impeachment, of defying democratic norms by trying to reverse the results of the 2016 election

#### Извлечение структур

In [None]:
# Build the extractor and run it over the loaded corpus.
extractor = StructureExtractor()

# NOTE(review): the option meanings below are inferred from the displayed
# output, not from the library source -- confirm against StructureExtractor.
#   n=3                      each displayed 'struct' has three entries
#   max_per_theme_start=5    five structures are listed per leading theme
#   topics_number_range      (20, 50, 2); presumably start/stop/step of the
#                            candidate topic counts
extraction_options = dict(
    n=3,
    max_per_theme_start=5,
    topics_number_range=(20, 50, 2),
)
filtered_structure_set = extractor(texts, **extraction_options)

# Display the extracted structures as the cell output.
extracted_structures = filtered_structure_set.get_structures()
extracted_structures

100%|██████████| 200/200 [00:15<00:00, 13.03it/s]
100%|██████████| 15/15 [00:21<00:00,  1.45s/it]


{'0': [{'count': 8, 'struct': [0, 0, 0]},
  {'count': 8, 'struct': [0, 6, 16]},
  {'count': 7, 'struct': [0, 3, 0]},
  {'count': 7, 'struct': [0, 6, 6]},
  {'count': 6, 'struct': [0, 16, 16]}],
 '1': [{'count': 69, 'struct': [1, 1, 1]},
  {'count': 10, 'struct': [1, 13, 1]},
  {'count': 9, 'struct': [1, 6, 1]},
  {'count': 9, 'struct': [1, 1, 17]},
  {'count': 9, 'struct': [1, 1, 6]}],
 '10': [{'count': 18, 'struct': [10, 10, 10]},
  {'count': 8, 'struct': [10, 13, 13]},
  {'count': 6, 'struct': [10, 10, 17]},
  {'count': 5, 'struct': [10, 3, 3]},
  {'count': 5, 'struct': [10, 17, 10]}],
 '11': [{'count': 3, 'struct': [11, 3, 11]},
  {'count': 46, 'struct': [11, 11, 11]},
  {'count': 3, 'struct': [11, 11, 17]},
  {'count': 3, 'struct': [11, 17, 11]},
  {'count': 3, 'struct': [11, 19, 12]}],
 '12': [{'count': 31, 'struct': [12, 12, 12]},
  {'count': 11, 'struct': [12, 19, 12]},
  {'count': 8, 'struct': [12, 12, 19]},
  {'count': 6, 'struct': [12, 17, 17]},
  {'count': 5, 'struct': [12, 

#### Генерация

In [None]:
from transformers import GPT2Tokenizer, GPT2PreTrainedModel, GPT2Model, AdamW, get_linear_schedule_with_warmup
from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
from sentence_transformers import SentenceTransformer
from transformers import GPT2Tokenizer, GPT2PreTrainedModel, GPT2Model, GPT2LMHeadModel

# Sentence encoder; later cells call `model.encode(...)` on it.
model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

# Pretrained GPT-2 language model and its matching tokenizer.
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Positional argument order: language model, tokenizer, sentence encoder.
generator = Generator(
    gpt2_model,
    gpt2_tokenizer,
    model,
)

**Пример генерации с опорой на вектор (генератор стремится приблизить вектор предложения к заданному)**

In [None]:
# Encode a reference sentence; its embedding is the target vector.
sentence = 'This text is really simple.'
vector = model.encode(sentence, convert_to_tensor=True)

# Continue the prompt while passing the target vector to the generator.
prompt = 'This text'
generated_text = generator.generate_with_vector(prompt, vector)
generated_text

'This text seems somewhat appropriate -- though also not completely important here in these ways."'

In [None]:
import torch

# Cosine similarity along the last tensor dimension; `cos` is reused by a
# later cell, so it stays a notebook-level name.
cos = torch.nn.CosineSimilarity(dim=-1)

# Embed the vector-guided generation and compare it with the target vector.
generated_vector = model.encode(generated_text, convert_to_tensor=True)
cos(generated_vector, vector)

tensor(0.6642, device='cuda:0')

**Проведем такое же сравнение с простой генерацией (без опоры на вектор)**

In [None]:
# Baseline: generate from the same prompt without passing a target vector.
baseline_prompt = 'This text'
simple_generated_text = generator.simple_generate(baseline_prompt)
simple_generated_text

'This text provides guidance where most UMass researchers could put those methods into more sophisticated simulations—i-direction machines are particularly fast but do come much beyond 3MHz or 12 to 22 million digits on GACES\'s TDS network[21e,32d–43]."'

In [None]:
# Embed the baseline generation and compare it with the same target vector
# that was used for the vector-guided generation.
simple_generated_vector = model.encode(simple_generated_text, convert_to_tensor=True)
cos(simple_generated_vector, vector)

tensor(0.0667, device='cuda:0')