In [1]:
import os
import time
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
print("Imports completed")

Imports completed


In [2]:
#########
INPUT_FILE = "../data_preproc-ed/data_subset_en_5000_from_ew-sew_08_24_18h"
#########
if not os.path.exists(INPUT_FILE):
    print("File not found. Files in the directory:")
    print(os.listdir(os.path.dirname(INPUT_FILE)))

In [14]:
prompts = [
    "Simplify the following new complex sentence. Here's an example: Complex: Hitherto, smoking was prohibited in the facilities. Simple: Smoking was forbidden here until now. New complex sentence: ",
    "Rewrite the following sentence to be as simple and concise as possible, while keeping all the essential information intact: ",
    "Make this sentence easier to understand: ",
    "Simplify this original sentence. The result should be a sentence with a easy sentence structure and should include only easy words. Original sentence: ",
    "Simplify this original sentence by creating an alternative version with a simpler sentence structure and fewer difficult words. Original sentence: ",
    "Make an alternative version of this original sentence which is shorter, exhibits a simpler structure and easier words.  For example, \"Hitherto, smoking was prohibited in the facilities.\"  could become \"Smoking was forbidden here until now.\". Original sentence: ",
    "Explain the following to a five year old: ",
    "Simplify this sentence. Change it as much as possible without altering the meaning. Sentence: "

]


language = INPUT_FILE.split('_')[2]
num_sents = INPUT_FILE.split('_')[3]
source_dataset = INPUT_FILE.split('_')[5]
preprocessing_month = INPUT_FILE.split('_')[6]
preprocessing_day = INPUT_FILE.split('_')[7]
preprocessing_hour = INPUT_FILE.split('_')[8][-2:-1]

execution_start_time = time.time()

print(len(prompts), " prompts available")
print(language, num_sents, source_dataset, preprocessing_month, preprocessing_day, preprocessing_hour)

8  prompts available
subset en from ew-sew 08 2


In [4]:
cache_dir = os.environ.get('HUGGINGFACE_HUB_CACHE')
print(f"HUGGINGFACE_HUB_CACHE is set to: {cache_dir}")

# Verify if the directory exists
cache_dir = os.environ.get('HUGGINGFACE_HUB_CACHE')
if cache_dir and os.path.isdir(cache_dir):
    print(f"The cache directory {cache_dir} exists.")
else:
    print(f"The cache directory {cache_dir} does not exist or is not accessible.")

HUGGINGFACE_HUB_CACHE is set to: /data/huggingface
The cache directory /data/huggingface exists.


In [5]:
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-xxl")

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [15]:
with open(INPUT_FILE, "r") as file:
    data = [tuple(line.strip().split('|')) for line in file if line.strip()]

print(data[:5])

sentences = [sentence for _, sentence in data]

counter = 0

instruction_prompt_used = prompts[-1]
print('_____\n', instruction_prompt_used, "\n", '_____')

for sent_num, sent in enumerate(sentences):

    
    input_text = instruction_prompt_used + sent

    # Tokenize the input
    inputs = tokenizer(input_text , return_tensors="pt")
    input_length = len(inputs["input_ids"][0])
    dyn_max_length = input_length + 50
    
    # Perform inference
    outputs = model.generate(**inputs, max_length=input_length)
    # Decode and print the result
    generated_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if counter % 5 == 0:
        print("___________________")
        print("Inference on sentence ", sent_num+1, "/", len(sentences), "after ", time.strftime("%H:%M:%S", time.gmtime(time.time() - execution_start_time)))
        print("Using dyn_max_length of ", dyn_max_length)
        print(" Input: ", sent)
        print("Output: ", generated_output)


[('57224', 'Toxicofera (Greek for "those who bear toxins") , is a hypothetical clade which represents about 4600 species (nearly 60 %) of extant squamates (scaled lizards.)'), ('117533', 'The islands of the Caribbean Sea, collectively known as the West Indies, are sorted by size and location into the Bahamas (or Lucayan archipelago, which includes the Turks and Caicos Islands) , the Lesser Antilles, and the Greater Antilles.'), ('3709', 'Some websites do not allow typographic quotation marks or apostrophes in posts (one such example being YouTube) .'), ('50928', 'Ecological yield is the harvestable population growth of an ecosystem.'), ('126524', "When he was 16-years-old, Davey's mother left with him to England in 1931.")]
_____
 Simplify this sentence. Change it as much as possible without altering the meaning. Sentence:  
 _____
___________________
Inference on sentence  1 / 5000 after  00:01:06
Using dyn_max_length of  129
 Input:  Toxicofera (Greek for "those who bear toxins") , i

KeyboardInterrupt: 

In [None]:
execution_elapsed_time = time.time() - execution_start_time
print("Code execution completed after ", time.strftime("%H:%M:%S", time.gmtime(execution_elapsed_time)) + f".{int((execution_elapsed_time % 1) * 1000):03d}")