# Exp13: Control Text Generation with a locally running LLM

In [30]:
import torch
from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import numpy as np
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))
import config
import random
sys.path.append('../src')
import models

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

Load the model and its tokenizer. The generation step further below samples candidate sentences (three by default, each terminated by an end-of-sentence token) and keeps one of them; this re-ranking is based on the grammar classifiers.

In [3]:
# Causal LM used for generation: half-precision weights, layer placement
# delegated to `device_map="auto"`, files cached under config.CACHE_DIR.
model = AutoModelForCausalLM.from_pretrained(
    config.GENERATION_MODEL,
    cache_dir=config.CACHE_DIR,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Tokenizer belonging to the same checkpoint, cached in the same directory.
tokenizer = AutoTokenizer.from_pretrained(
    config.GENERATION_MODEL,
    cache_dir=config.CACHE_DIR,
)

Load the grammar classifiers from the previous step, together with the CEFR-leveled texts that later serve as story prompts.

In [4]:
# Table handed to `models.load_model` for every level.
df = pd.read_json('../dat/egp_merged.json')

# One entry per CEFR level, keyed by the level label, in ascending order.
level_models = dict(
    (cefr_level, models.load_model(cefr_level, df))
    for cefr_level in ("A1", "A2", "B1", "B2", "C1", "C2")
)

# Leveled texts; their `text` column is later used to build story prompts.
cefr_texts = pd.read_csv("../dat/cefr_leveled_texts.csv")

Generate candidates and rank them using the classifiers.

In [33]:
def generate_candidate(input_ids, max_token_sentence = 64, tok_k=10, eos_chars = (".", "!", "?")):
    """Sample one sentence-sized continuation of `input_ids` with top-k sampling.

    Tokens are drawn one at a time from the `tok_k` most probable next tokens
    (their probabilities renormalised to sum to one). Sampling stops as soon as

    * the text of the sampled token contains one of `eos_chars`,
    * the tokenizer's end-of-sequence token is sampled, or
    * `max_token_sentence` tokens have been generated.

    Relies on the notebook-level `model` and `tokenizer`.

    Args:
        input_ids: Prompt token ids of shape (1, prompt_len). Only a batch of
            one is supported.
        max_token_sentence: Upper bound on the number of sampled tokens.
        tok_k: Number of highest-probability tokens to sample from.
        eos_chars: Characters that mark the end of a sentence.

    Returns:
        The decoded continuation, without the prompt and without special tokens.
    """
    # Keep the running continuation in the prompt's dtype and on its device so
    # the concatenation below never mixes integer types or devices.
    generated_tokens = input_ids.new_empty((1, 0))
    eos_token_id = getattr(tokenizer, "eos_token_id", None)

    with torch.no_grad():
        for _ in range(max_token_sentence):
            # The whole sequence is re-encoded at every step (no KV cache).
            logits = model(torch.cat([input_ids, generated_tokens], dim=1)).logits
            # Softmax in float32: half-precision logits lose resolution in the tail.
            probs = torch.nn.functional.softmax(logits[:, -1, :].float(), dim=-1)

            # topk raises if asked for more entries than the vocabulary holds.
            k = min(tok_k, probs.shape[-1])
            top_k_probs, top_k_indices = torch.topk(probs, k)
            renormalized_top_k_probs = top_k_probs / top_k_probs.sum(dim=-1, keepdim=True)
            top_k_id = torch.multinomial(renormalized_top_k_probs, num_samples=1).item()
            next_token_id = int(top_k_indices[0, top_k_id])

            # The model signalled that it is done: stop instead of sampling
            # past the end of the sequence until punctuation happens to appear.
            if eos_token_id is not None and next_token_id == eos_token_id:
                break

            generated_tokens = torch.cat(
                [generated_tokens, input_ids.new_tensor([[next_token_id]])], dim=1
            )

            next_token = tokenizer.decode(next_token_id)
            if any(eos_char in next_token for eos_char in eos_chars):
                break

    return tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

def write_story(level, story, num_candidates=3, max_len = 1024, add_info=False):
    """Extend `story` sentence by sentence, steering it towards a CEFR level.

    Each round samples `num_candidates` continuations with
    `generate_candidate`, scores them with the classifiers stored in
    `level_models[level]` and appends the candidate with the highest mean
    score. Writing stops once the story has at least `max_len` characters, or
    earlier when the model produces no text at all.

    Relies on the notebook-level `tokenizer`, `device`, `models`,
    `level_models` and `generate_candidate`.

    Args:
        level: CEFR level label; must be a key of `level_models`.
        story: Text to continue.
        num_candidates: Number of continuations sampled per round.
        max_len: Target story length in characters (may be overshot by the
            last appended sentence).
        add_info: If true, the level's entry in `description` is added to the
            instruction.

    Returns:
        The extended story.
    """
    # NOTE(review): `description` is not defined in any visible cell of this
    # notebook; it has to exist in the kernel namespace (indexable by level)
    # before this function is called with add_info=True.
    info = f", which is described as {description[level]}" if add_info else ""
    prompt = f"<s>[INST] Continue the writing on CEFR level {level}{info}. Do not talk about the CEFR level. [/INST] "

    while len(story) < max_len:
        # The prompt already spells out the BOS token ("<s>"); letting the
        # tokenizer add its own special tokens on top would duplicate it.
        inputs = tokenizer(prompt + story, return_tensors="pt", add_special_tokens=False).to(device)

        candidates = [generate_candidate(inputs.input_ids) for _ in range(num_candidates)]
        # Drop continuations without any text: they add nothing to the story.
        candidates = [candidate for candidate in candidates if candidate.strip()]
        if not candidates:
            # Nothing was generated in this round, so the model has finished;
            # retrying would only pad the story with spaces up to max_len.
            break

        if len(candidates) == 1:
            # A single candidate always wins the ranking; skip the classifiers.
            best = 0
        else:
            scores = models.get_scores(level_models[level], candidates)
            # Assumes one row of classifier scores per candidate — TODO confirm
            # against models.get_scores.
            mean_scores = torch.mean(scores.float(), dim=1)
            best = int(torch.argmax(mean_scores))

        # NOTE(review): candidates are always joined with a space, which splits
        # a word when the model continues the last word of the story (the
        # sample output shows "poison ous").
        story += " " + candidates[best]
    return story

In [None]:
# Number of story prompts that the generation loop continues.
num_stories = 5
# Candidates sampled per sentence; with a single candidate the classifier
# ranking has nothing to choose between.
num_candidates = 1
# Minimum prompt length in characters: each prompt is cut at the first space
# at or after this index.
min_length = 50
# CSV the generated stories are meant to be appended to (the write itself is
# currently commented out in the generation loop).
file_path = "../dat/controlled_generated_texts_mistral_prompt.csv"

In [None]:
def _leading_prompt(text, min_chars):
    """Return the start of `text`, cut at the first space at or after `min_chars`."""
    cut = text.find(' ', min_chars)
    # str.find returns -1 when there is no such space (short text, or no space
    # after `min_chars`); slicing with -1 would silently drop the last
    # character, so the whole text is kept instead.
    head = text if cut == -1 else text[:cut]
    return head.strip().lstrip('\ufeff')

# Unique story openings, in random order.
storyPrompts = cefr_texts.text.apply(lambda text: _leading_prompt(text, min_length)).unique()
random.shuffle(storyPrompts)

In [34]:
# Continue each selected prompt once per CEFR level and print the result.
for story in storyPrompts[:num_stories]:
    print("_" * 100, story, sep="\n")
    for level in level_models:
        print(level)
        text = write_story(level, story, num_candidates=num_candidates, add_info=True)
        print(text)
        # Persisting the result is currently disabled:
        #new_row = {"label": level, "story": story, "text": text}
        #pd.DataFrame([new_row]).to_csv(file_path, mode='a', index=False, header=not os.path.exists(file_path))

____________________________________________________________________________________________________
A man believed to have been exposed to the deadly poison
A1
A man believed to have been exposed to the deadly poison ous gas walked into the hospital, panting and coughing. The doctors rushed to examine him, fearing the worst. Nurses quickly set up an isolation unit, preparing for a potentially dangerous situation. 

The man, identified as John, explained that he had accidentally entered a building filled with the harmful substance. John's condition worsened, and the doctors worked tirelessly to save him. As they struggled to understand the extent of his injuries, they administered oxygen and intravenous fluids. 

Despite their efforts, John's health deteriorated rapidly. Fear gripped the hospital staff as they realized the gravity of the situation. They knew that if they failed to save John, they might not get a second chance – for themselves or for any other potential victims. The doc

KeyboardInterrupt: 