# Text Processing with GPT-2 and T5
Install the required packages first.

In [None]:
!pip install transformers torch nltk

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, T5ForConditionalGeneration, T5Tokenizer
import nltk
# sent_tokenize needs the Punkt sentence-splitting data. Older NLTK releases
# load it from the 'punkt' package, newer ones from 'punkt_tab', so request
# both; a package unknown to the installed NLTK is reported, not raised.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

In [None]:
class GPT2Generator:
    """Sample continuations of a prompt with a GPT-2 language model.

    Instance attributes (all set in ``__init__``):
        tokenizer: ``GPT2Tokenizer`` loaded from ``model_name``.
        model: ``GPT2LMHeadModel`` loaded from ``model_name``, moved to ``device``.
        device: ``'cuda'`` when ``torch.cuda.is_available()``, otherwise ``'cpu'``.
    """

    def __init__(self, model_name='gpt2-medium'):
        """Load tokenizer and model for ``model_name`` and place the model on the device."""
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model.to(self.device)

    def generate_false_statements(self, text, num_samples=3, max_length=50):
        """Return sampled generations for ``text``, dropping any equal to ``text``.

        Args:
            text: Prompt string fed to the model.
            num_samples: Number of sequences requested from ``generate``
                (``num_return_sequences``).
            max_length: Forwarded to ``generate`` as ``max_length``. Per the
                Transformers documentation this bound includes the prompt
                tokens, not only the newly generated ones.

        Returns:
            A list of decoded strings (special tokens skipped). It may hold
            fewer than ``num_samples`` items because outputs identical to
            ``text`` are filtered out, and it is empty for an empty ``text``.
        """
        # An empty prompt encodes to zero tokens, which generate() cannot
        # extend; there is nothing to produce variations of.
        if not text:
            return []

        input_ids = self.tokenizer.encode(text, return_tensors='pt').to(self.device)
        # A single, unpadded prompt: every position is a real token.
        attention_mask = torch.ones_like(input_ids)

        outputs = self.model.generate(
            input_ids,
            attention_mask=attention_mask,
            # Pass the padding id explicitly (EOS is the conventional choice
            # for GPT-2) instead of relying on generate()'s warned fallback.
            pad_token_id=self.tokenizer.eos_token_id,
            do_sample=True,
            num_return_sequences=num_samples,
            max_length=max_length,
            temperature=0.9,
            top_k=50,
            top_p=0.95,
            no_repeat_ngram_size=2,
        )

        decoded = (
            self.tokenizer.decode(sequence, skip_special_tokens=True)
            for sequence in outputs
        )
        # Avoid exact matches with the prompt.
        return [candidate for candidate in decoded if candidate != text]

In [None]:
class T5Generator:
    """Generate reworded candidates for a sentence with a T5 model.

    Instance attributes (all set in ``__init__``):
        tokenizer: ``T5Tokenizer`` loaded from ``model_name``.
        model: ``T5ForConditionalGeneration`` loaded from ``model_name``,
            moved to ``device``.
        device: ``'cuda'`` when ``torch.cuda.is_available()``, otherwise ``'cpu'``.
    """

    def __init__(self, model_name='t5-base'):
        """Load tokenizer and model for ``model_name`` and place the model on the device."""
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model.to(self.device)

    def generate_false_statement(self, sentence, num_candidates=3):
        """Return generated candidates for ``sentence`` that differ from it.

        Despite the method name, the model is prompted with the
        ``"paraphrase: "`` task prefix, so outputs are whatever the model
        produces for that prompt.
        NOTE(review): whether the loaded checkpoint was trained on a
        "paraphrase:" task cannot be told from this file - confirm.

        Args:
            sentence: Input sentence; the task prefix is prepended to it.
            num_candidates: Number of sequences requested from ``generate``
                (``num_return_sequences``).

        Returns:
            A list of decoded strings (special tokens skipped). It may hold
            fewer than ``num_candidates`` items because outputs identical to
            ``sentence`` are filtered out, and it is empty for an empty
            ``sentence``.
        """
        # Nothing to reword; skip the model call entirely.
        if not sentence:
            return []

        input_text = "paraphrase: " + sentence
        input_ids = self.tokenizer(input_text, return_tensors='pt').input_ids.to(self.device)

        outputs = self.model.generate(
            input_ids,
            num_return_sequences=num_candidates,
            do_sample=True,
            max_length=128,
            top_k=120,
            top_p=0.95,
            temperature=0.8,
            # generate() rejects num_return_sequences > num_beams, so widen
            # the beam when more than the default 4 candidates are requested.
            num_beams=max(4, num_candidates),
        )

        decoded = (
            self.tokenizer.decode(sequence, skip_special_tokens=True)
            for sequence in outputs
        )
        # Avoid exact matches with the input sentence.
        return [candidate for candidate in decoded if candidate != sentence]

In [None]:
# Smoke-test each generator on one short sentence
test_text = "The Earth revolves around the Sun."

# Instantiate the generators (each constructor loads its pretrained model)
gpt2_gen = GPT2Generator()
t5_gen = T5Generator()

# GPT-2: sample from the prompt and list the numbered results
print("GPT-2 Generated Statements:")
gpt2_results = gpt2_gen.generate_false_statements(test_text)
for number, statement in enumerate(gpt2_results, start=1):
    print("%d. %s" % (number, statement))

# T5: same listing for the T5 candidates
print("\nT5 Generated Statements:")
t5_results = t5_gen.generate_false_statement(test_text)
for number, statement in enumerate(t5_results, start=1):
    print("%d. %s" % (number, statement))

In [None]:
# Process longer text
def process_text(text, generator, sentences_per_statement=3):
    """Generate variations for every sufficiently long sentence in ``text``.

    Args:
        text: Input text; split into sentences with ``sent_tokenize``.
        generator: Object exposing ``generate_false_statements`` (as
            ``GPT2Generator`` does) or ``generate_false_statement`` (as
            ``T5Generator`` does). The plural method is used when present.
        sentences_per_statement: Forwarded as the second positional argument
            of the generator method, i.e. the number of variations requested
            per sentence.

    Returns:
        A list of ``{'original': sentence, 'variations': [...]}`` dicts, one
        per sentence with more than 5 whitespace-separated words, in order.
    """
    # Only process sentences with more than 5 words
    candidates = [s for s in sent_tokenize(text) if len(s.split()) > 5]
    if not candidates:
        return []

    # Dispatch on the method the object offers rather than on its class:
    # re-running the class-definition cell in a notebook rebinds the class
    # name, so isinstance() checks fail for instances created beforehand.
    generate = getattr(generator, 'generate_false_statements', None)
    if generate is None:
        generate = generator.generate_false_statement

    return [
        {
            'original': sentence,
            'variations': generate(sentence, sentences_per_statement),
        }
        for sentence in candidates
    ]

In [None]:
# Run both generators over a short multi-sentence paragraph
test_paragraph = (
    "\n"
    "Artificial intelligence has transformed many aspects of modern life. \n"
    "Machine learning algorithms can now recognize patterns in vast amounts of data. \n"
    "Neural networks have revolutionized image and speech recognition tasks.\n"
)


def _print_variations(results):
    """Print each original sentence followed by its numbered variations."""
    for entry in results:
        print(f"\nOriginal: {entry['original']}")
        print("Variations:")
        for number, variation in enumerate(entry['variations'], start=1):
            print(f"{number}. {variation}")


print("Processing with GPT-2:")
gpt2_results = process_text(test_paragraph, gpt2_gen)
_print_variations(gpt2_results)

print("\nProcessing with T5:")
t5_results = process_text(test_paragraph, t5_gen)
_print_variations(t5_results)