In [1]:
import pandas as pd
import torch
from transformers import pipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

## Define checkpoints

In [2]:
# Hugging Face Hub identifiers of the two models compared in this notebook.
# NOTE(review): confirm that the Dolly repo id below still resolves on the Hub.
checkpoint_dolly = "databricks/dolly-v2-2-8b"
checkpoint_t5 = "google/flan-t5-large"

# Local directory handed to ``from_pretrained(..., cache_dir=...)`` further down.
cache_dir = "./cache_mod"


In [3]:
def generate_text(model, tokenizer, prompt, is_pipeline=False, max_new_tokens=100):
    """Generate text for ``prompt`` with a pipeline or a model/tokenizer pair.

    Parameters
    ----------
    model
        If ``is_pipeline`` is true: a callable invoked as
        ``model(prompt, max_new_tokens=...)`` that returns a sequence whose
        first item is a mapping with a ``'generated_text'`` key.
        Otherwise: an object exposing ``generate(**inputs, max_new_tokens=...)``.
    tokenizer
        Callable used as ``tokenizer(prompt, return_tensors="pt")`` and
        providing ``batch_decode``. Not used when ``is_pipeline`` is true.
    prompt : str
        Full prompt text (instruction plus source sentence).
    is_pipeline : bool, default False
        Selects the pipeline branch instead of the tokenizer/model branch.
    max_new_tokens : int, default 100
        Upper bound on newly generated tokens; applied in both branches.

    Returns
    -------
    str
        The pipeline's ``'generated_text'`` value, or the decoded sequences
        joined with newlines.
    """
    if is_pipeline:
        # Forward the length budget so the pipeline branch honours the same
        # ``max_new_tokens`` argument as the model branch instead of silently
        # falling back to the pipeline's own default.
        return model(prompt, max_new_tokens=max_new_tokens)[0]['generated_text']

    inputs = tokenizer(prompt, return_tensors="pt")

    # Keep the encoded inputs on the same device as the model; otherwise
    # ``generate`` fails as soon as the model has been moved off the CPU.
    device = getattr(model, "device", None)
    if device is not None and hasattr(inputs, "to"):
        inputs = inputs.to(device)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return "\n".join(result)

## Read sample data file

In [4]:
# Load the sample text-simplification file and preview its first two rows.
sample_csv = "./Text_Simplification/trial_smpl_medium.csv"
df = pd.read_csv(sample_csv)
df.head(2)

Unnamed: 0,data_source,source_level_og,target_level_og,Unnamed: 3,source,target,source_level_og.1,target_level_og.1,data_source.1,data_type,source_level_cefr,target_level_cefr,id
0,BreakingNewsEnglish,2.0,1.0,1587,Donald Trump is interested in buying Greenland...,Donald Trump is interested in buying Greenland...,2.0,1.0,BreakingNewsEnglish,text_simplification,,,TS000001588
1,BreakingNewsEnglish,2.0,1.0,1749,Everyone knows that children don't like eating...,Everyone knows children don't like eating gree...,2.0,1.0,BreakingNewsEnglish,text_simplification,,,TS000001750


In [5]:
# First ten rows of the sample frame.
# NOTE(review): ``df1`` is not referenced by the other cells visible in this
# notebook, which all operate on the full ``df`` — confirm which one is intended.
df1 = df.iloc[:10]

## Run simplification for various prompts

In [None]:
%%time

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

prompts = ["Simplify","Simplify to elementary level","Simplify to CEFR A1","Simplify to intermediate level", "Simplify to CEFR B1"]

model_checkpoints = {
    'dolly': checkpoint_dolly,
    'flant5': checkpoint_t5
}

    
for model_name, checkpoint in model_checkpoints.items():
    if checkpoint == checkpoint_dolly:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = pipeline(model=checkpoint, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto")
        is_pipeline = True
        print("Running dolly...")
    else:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, cache_dir=cache_dir)
        is_pipeline = False
        print("Running t5...")

    for inst in prompts:
        df[f"{model_name}-{inst}"] = df['source'].apply(lambda x: generate_text(model, tokenizer, inst + ': ' + x, is_pipeline=is_pipeline))
        
    print("Done!\n")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Running dolly...


In [None]:
# Persist the frame (source data plus any generated model/prompt columns) as CSV.
# ``index`` is left at its default, so the row index is written as well.
output_csv = "./Text_Simplification/simplified_df_new.csv"
df.to_csv(output_csv)