In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW

# Define your dataset class
class SimplificationDataset(Dataset):
    def __init__(self, complex_texts, simplified_texts, tokenizer, max_len=128):
        self.complex_texts = complex_texts
        self.simplified_texts = simplified_texts
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.complex_texts)
    
    def __getitem__(self, index):
        complex_text = self.complex_texts[index]
        simplified_text = self.simplified_texts[index]
        
        # Tokenize inputs and labels
        inputs = self.tokenizer.encode_plus(
            complex_text, 
            max_length=self.max_len, 
            padding='max_length', 
            truncation=True, 
            return_tensors="pt"
        )
        labels = self.tokenizer.encode_plus(
            simplified_text, 
            max_length=self.max_len, 
            padding='max_length', 
            truncation=True, 
            return_tensors="pt"
        )
        
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        label_ids = labels['input_ids'].squeeze()
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': label_ids
        }

# Function to load dataset (example for LexMTurk)
def load_lexmturk_data():
    # Load LexMTurk dataset (complex and simplified sentences)
    # Example pairs; in practice, load from dataset files
    complex_sentences = [
        "The researcher extrapolated a comprehensive analysis of the convoluted data."
    ]
    simplified_sentences = [
        "The researcher made a thorough analysis of the complex data."
    ]
    return complex_sentences, simplified_sentences

# Fine-tuning function
def fine_tune_t5(model, tokenizer, dataset, epochs=3, batch_size=8, learning_rate=1e-4):
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    # Use AdamW optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    
    model.train()  # Set the model to training mode
    
    for epoch in range(epochs):
        total_loss = 0
        for batch in dataloader:
            optimizer.zero_grad()
            
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            
            # Backward pass
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader)}")

# Main script
if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load a pre-trained T5 model and tokenizer
    model_name = "t5-small"  # Can choose "t5-base" or "t5-large" for larger models
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

    # Load the dataset
    complex_texts, simplified_texts = load_lexmturk_data()
    dataset = SimplificationDataset(complex_texts, simplified_texts, tokenizer)
    
    # Fine-tune the model
    fine_tune_t5(model, tokenizer, dataset, epochs=3, batch_size=4, learning_rate=1e-4)

    # Save the fine-tuned model
    model.save_pretrained('./fine_tuned_t5')
    tokenizer.save_pretrained('./fine_tuned_t5')


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x3ffad636bc0>>
Traceback (most recent call last):
  File "/home/jovyan/.local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Error in cpuinfo: processor architecture is not supported in cpuinfo
Error in cpuinfo: processor architecture is not supported in cpuinfo


Epoch 1/3, Loss: 13.400545120239258
Epoch 2/3, Loss: 16.733417510986328
Epoch 3/3, Loss: 15.273792266845703


In [2]:
print("Helloe")

Helloe


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW

# Define your dataset class
class SimplificationDataset(Dataset):
    def __init__(self, complex_texts, simplified_texts, tokenizer, max_len=128):
        self.complex_texts = complex_texts
        self.simplified_texts = simplified_texts
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.complex_texts)
    
    def __getitem__(self, index):
        complex_text = self.complex_texts[index]
        simplified_text = self.simplified_texts[index]
        
        # Tokenize inputs and labels
        inputs = self.tokenizer.encode_plus(
            complex_text, 
            max_length=self.max_len, 
            padding='max_length', 
            truncation=True, 
            return_tensors="pt"
        )
        labels = self.tokenizer.encode_plus(
            simplified_text, 
            max_length=self.max_len, 
            padding='max_length', 
            truncation=True, 
            return_tensors="pt"
        )
        
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        label_ids = labels['input_ids'].squeeze()
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': label_ids
        }

# Function to load BenchLS dataset from file
def load_benchls_data(file_path):
    complex_sentences = []
    simplified_sentences = []
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Each line contains a complex sentence and simplified sentence separated by a tab
            parts = line.strip().split('\t')
            if len(parts) == 2:
                complex_sentences.append(parts[0])
                simplified_sentences.append(parts[1])
    
    return complex_sentences, simplified_sentences

# Fine-tuning function
def fine_tune_t5(model, tokenizer, dataset, epochs=3, batch_size=8, learning_rate=1e-4):
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    # Use AdamW optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    
    model.train()  # Set the model to training mode
    
    for epoch in range(epochs):
        total_loss = 0
        for batch in dataloader:
            optimizer.zero_grad()
            
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            
            # Backward pass
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader)}")


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load a pre-trained T5 model and tokenizer
model_name = "t5-small"  # Can choose "t5-base" or "t5-large" for larger models
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

In [6]:
# Load the BenchLS dataset from the text file
file_path = 'BenchLS.txt'  # Path to your BenchLS.txt file
complex_texts, simplified_texts = load_benchls_data(file_path)

# Prepare the dataset
dataset = SimplificationDataset(complex_texts, simplified_texts, tokenizer)


In [7]:
# Fine-tune the model
fine_tune_t5(model, tokenizer, dataset, epochs=3, batch_size=4, learning_rate=1e-4)

# Save the fine-tuned model
model.save_pretrained('./fine_tuned_t5')
tokenizer.save_pretrained('./fine_tuned_t5')

ValueError: num_samples should be a positive integer value, but got num_samples=0

In [8]:
complex_texts, simplified_texts = load_benchls_data(file_path)
print(f"Loaded {len(complex_texts)} sentence pairs from BenchLS.txt")


Loaded 0 sentence pairs from BenchLS.txt


In [9]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW

# Dataset class that extracts sentences and replaces the complex word with a synonym
class BenchLSDataset(Dataset):
    def __init__(self, sentences, complex_words, simplified_words, tokenizer, max_len=128):
        self.sentences = sentences
        self.complex_words = complex_words
        self.simplified_words = simplified_words
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, index):
        sentence = self.sentences[index]
        complex_word = self.complex_words[index]
        simplified_word = self.simplified_words[index]
        
        # Replace the complex word in the sentence with the simplified word
        simplified_sentence = sentence.replace(complex_word, simplified_word)
        
        # Tokenize inputs and labels
        inputs = self.tokenizer.encode_plus(
            sentence, 
            max_length=self.max_len, 
            padding='max_length', 
            truncation=True, 
            return_tensors="pt"
        )
        labels = self.tokenizer.encode_plus(
            simplified_sentence, 
            max_length=self.max_len, 
            padding='max_length', 
            truncation=True, 
            return_tensors="pt"
        )
        
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        label_ids = labels['input_ids'].squeeze()
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': label_ids
        }

# Function to load BenchLS dataset from file
def load_benchls_data(file_path):
    sentences = []
    complex_words = []
    simplified_words = []
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) >= 5:  # Ensure correct number of fields
                sentence = parts[0]
                complex_word = parts[1]
                # Extract the best possible simplified synonym (in this case, use the first synonym as an example)
                synonyms = parts[3:]  # synonyms are listed after the complex word and position
                best_synonym = synonyms[0].split(":")[1]  # Get the first synonym
                
                sentences.append(sentence)
                complex_words.append(complex_word)
                simplified_words.append(best_synonym)
            else:
                print(f"Skipping invalid line: {line}")
    
    return sentences, complex_words, simplified_words

# Fine-tuning function remains the same
def fine_tune_t5(model, tokenizer, dataset, epochs=3, batch_size=8, learning_rate=1e-4):
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    # Use AdamW optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    
    model.train()  # Set the model to training mode
    
    for epoch in range(epochs):
        total_loss = 0
        for batch in dataloader:
            optimizer.zero_grad()
            
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            
            # Backward pass
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader)}")



In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load a pre-trained T5 model and tokenizer
model_name = "t5-small"  # Can choose "t5-base" or "t5-large" for larger models
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

# Load the BenchLS dataset from the text file
file_path = 'BenchLS.txt'  # Path to your BenchLS.txt file
sentences, complex_words, simplified_words = load_benchls_data(file_path)

# Prepare the dataset
dataset = BenchLSDataset(sentences, complex_words, simplified_words, tokenizer)

# Fine-tune the model
fine_tune_t5(model, tokenizer, dataset, epochs=3, batch_size=4, learning_rate=1e-4)

# Save the fine-tuned model
model.save_pretrained('./fine_tuned_t5')
tokenizer.save_pretrained('./fine_tuned_t5')

Skipping invalid line: they locate food by smell , using sensors in the tip of their snout , and regularly feast on ants and termites .	snout	13	1:nose

Skipping invalid line: the latter means basic or radical change ; whereas reform may be no more than fine tuning , or at most redressing serious wrongs without altering the fundamentals of the system .	altering	25	1:changing

Skipping invalid line: the band is known for its large line-up , which consists of nine members ; including a vocalist , two guitarists , a bassist , two percussionists in addition to a primary drummer , a sampler , and a turntablist .	vocalist	17	1:singer

Skipping invalid line: during the final immunity challenge , Sandra fell out early , allowing Jon to try to convince Lil to take him to the final two .	final	23	1:last

Skipping invalid line: Italy purchased the city in 1905 and made Mogadishu the capital of Italian Somaliland .	purchased	1	1:bought

Skipping invalid line: he aided the Doukhobors in migrating t



Epoch 1/3, Loss: 1.0408601090971348


In [1]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Function to generate predictions using the fine-tuned model
def simplify_text(model, tokenizer, text, max_len=128):
    model.eval()  # Set the model to evaluation mode
    
    # Prepare the input
    input_text = f"simplify: {text}"
    inputs = tokenizer.encode_plus(
        input_text, 
        max_length=max_len, 
        padding='max_length', 
        truncation=True, 
        return_tensors="pt"
    )
    
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    
    # Generate output
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids, 
            attention_mask=attention_mask, 
            max_length=max_len,
            num_beams=4,  # You can adjust the number of beams for better quality
            early_stopping=True
        )
    
    # Decode the generated text
    simplified_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return simplified_text

# Load the fine-tuned model and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = './fine_tuned_t5'  # Path to the fine-tuned model
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)

# Example to test the model
if __name__ == "__main__":
    # Original text containing complex words
    original_text = "Escapologists escape from handcuffs, straitjackets, coffins, and other perils."
    
    # Get the simplified text using the fine-tuned model
    simplified_text = simplify_text(model, tokenizer, original_text)
    
    print(f"Original text: {original_text}")
    print(f"Simplified text: {simplified_text}")


Error in cpuinfo: processor architecture is not supported in cpuinfo
Error in cpuinfo: processor architecture is not supported in cpuinfo


Original text: Escapologists escape from handcuffs, straitjackets, coffins, and other perils.
Simplified text: simplify: Escapologists escape from handcuffs, straitjackets, coffins, and other perils.


In [10]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm  # Progress bar

# Function to simplify text using the fine-tuned model
def simplify_text(model, tokenizer, text, max_len=128):
    model.eval()  # Set the model to evaluation mode

    # Prepare the input
    input_text = f"simplify: {text}"
    inputs = tokenizer.encode_plus(
        input_text,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Generate output
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_len,
            num_beams=4,  # You can adjust the number of beams for better quality
            early_stopping=True
        )

    # Decode the generated text
    simplified_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return simplified_text

# Function to calculate accuracy and BLEU score
def evaluate_model(model, tokenizer, sentences, complex_words, simplified_words):
    exact_match_count = 0
    bleu_scores = []
    smoothing_function = SmoothingFunction().method1  # Smoothing for BLEU score
    c=0
    for i in tqdm(range(25)):
        # c+=1
        # if c==20:
        #     break
        original_sentence = sentences[i]
        correct_simplified_word = simplified_words[i]  # Ground truth simplified word
        complex_word = complex_words[i]

        # Simplify the sentence using the model
        predicted_sentence = simplify_text(model, tokenizer, original_sentence)

        # Extract the simplified word from the predicted sentence
        # (Assuming we replaced the complex word in the sentence)
        if complex_word in original_sentence:
            predicted_simplified_word = predicted_sentence.replace(original_sentence.replace(complex_word, ''), '').strip()
        else:
            predicted_simplified_word = predicted_sentence

        # Check for exact match
        if predicted_simplified_word == correct_simplified_word:
            exact_match_count += 1

        # Compute BLEU score
        reference = [correct_simplified_word.split()]  # Ground truth as reference
        candidate = predicted_simplified_word.split()  # Predicted as candidate
        bleu_score = sentence_bleu(reference, candidate, smoothing_function=smoothing_function)
        bleu_scores.append(bleu_score)

    # Calculate exact match accuracy
    exact_match_accuracy = exact_match_count / len(sentences)

    # Calculate average BLEU score
    average_bleu_score = sum(bleu_scores) / len(bleu_scores)

    return exact_match_accuracy, average_bleu_score

# Load the fine-tuned model and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = './fine_tuned_t5'  # Path to the fine-tuned model
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)

# Load the dataset for testing
file_path = 'BenchLS.txt'  # Path to your BenchLS.txt file
sentences, complex_words, simplified_words = load_benchls_data(file_path)

# Evaluate the model
exact_match_accuracy, average_bleu_score = evaluate_model(model, tokenizer, sentences, complex_words, simplified_words)

print(f"Exact Match Accuracy: {exact_match_accuracy * 100:.2f}%")
print(f"Average BLEU Score: {average_bleu_score:.4f}")


100%|██████████| 25/25 [00:51<00:00,  2.08s/it]

Exact Match Accuracy: 0.00%
Average BLEU Score: 0.0000





In [6]:
def load_benchls_data(file_path):
    sentences = []
    complex_words = []
    simplified_words = []

    with open(file_path, 'r') as file:
        for line in file:
            parts = line.strip().split('\t')
            
            # Ensure the line has at least 3 parts: sentence, complex word, and simplified options
            if len(parts) >= 3:
                sentence = parts[0]
                complex_word = parts[1]
                
                # Get the first simplified word as ground truth (you can modify this if needed)
                simplified_word = parts[3].split(':')[1]  # Taking the first simplified word
                
                sentences.append(sentence)
                complex_words.append(complex_word)
                simplified_words.append(simplified_word)

    return sentences, complex_words, simplified_words
