In [11]:
import os
from pathlib import Path

import pandas as pd
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

In [12]:
# Pick the compute device: the GPU when PyTorch reports CUDA as available,
# otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [13]:
# Define directories
# The project root defaults to the original hard-coded checkout location.
# Set the RESEARCH_ASSISTANT_DIR environment variable to run the notebook
# from another machine or folder without editing this cell.
DEFAULT_PROJECT_DIR = '/home/abdellah-ennajari/Desktop/AI-Powered-Academic-Research-Assistant'
# `or` (rather than a .get() default) so an empty variable also falls back
# to the default instead of resolving to the current directory.
DATA_DIR = Path(os.environ.get('RESEARCH_ASSISTANT_DIR') or DEFAULT_PROJECT_DIR)
PROCESSED_DIR = DATA_DIR / 'Data/processed'

In [14]:
# Verify that the processed data path exists AND is a directory.
# exists() alone would also accept a regular file at that path, and the
# problem would only surface later when the CSV inside it is read.
if not PROCESSED_DIR.is_dir():
    raise FileNotFoundError(f"Directory not found: {PROCESSED_DIR}")


In [15]:
# Read the cleaned papers CSV from the processed-data directory into a DataFrame
cleaned_papers_csv = PROCESSED_DIR / 'cleaned_papers.csv'
papers_df = pd.read_csv(cleaned_papers_csv)

In [16]:
# Load the pretrained T5 tokenizer and model, placing the model on `device`
# NOTE(review): sentencepiece is not referenced directly in this cell;
# presumably it is imported so a missing install fails here — confirm.
import sentencepiece

model_name = "t5-small"
try:
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = (
        T5ForConditionalGeneration
        .from_pretrained(model_name)
        .to(device)
    )
except Exception as e:
    # Report the failure in the cell output, then re-raise so execution stops.
    print(f"Error loading model or tokenizer: {str(e)}")
    raise

  state_dict = torch.load(resolved_archive_file, map_location="cpu")


In [17]:
# Define the summarization function
def summarize_text(text, max_length=150):
    """
    Summarize a given text using the T5 model.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device`` objects
    created in earlier cells.

    Args:
        text (str): Input text to summarize. Values that are not strings
            (e.g. None, or NaN read from an empty CSV cell) and blank
            strings are rejected before the model is called.
        max_length (int): Forwarded as ``max_length`` to ``model.generate``
            to cap the length of the generated summary.

    Returns:
        str | None: Generated summary, or None when the input is unusable
        or when tokenization/generation raises (the error is printed).
    """
    # Guard up front: concatenating a non-string below would raise a
    # TypeError, and a blank string would make the model "summarize" nothing.
    if not isinstance(text, str) or not text.strip():
        print("Error in summarization: input text must be a non-empty string")
        return None

    try:
        # Prepend the task prefix used for T5 summarization
        input_text = "summarize: " + text

        # Tokenize; inputs beyond 512 tokens are truncated
        inputs = tokenizer(input_text,
                           return_tensors="pt",
                           max_length=512,
                           padding=True,
                           truncation=True).to(device)

        # Generate summary with beam search. The attention mask is passed
        # explicitly so generate() does not have to infer it from input_ids.
        summary_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs.get("attention_mask"),
            max_length=max_length,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

        # Decode the first (only) generated sequence back to text
        return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    except Exception as e:
        # Best-effort contract: report the problem and signal failure with
        # None rather than propagating, so callers can test the result.
        print(f"Error in summarization: {str(e)}")
        return None

In [18]:
# Example usage of the summarization function
# NOTE: the title is read from the first row of papers_df, but the text that
# gets summarized is a fixed sample passage, not that paper's own content.
# The demo runs in its own try block so that a problem with the sample row
# (empty DataFrame, missing 'title' column, ...) cannot prevent the model
# and tokenizer from being saved afterwards.
try:
    example_paper = papers_df.iloc[0]
    paper_title = example_paper['title']
    paper_text = "Machine learning is a method of data analysis that automates analytical model building. It is a branch of artificial intelligence based on the idea that systems can learn from data, identify patterns, and make decisions with minimal human intervention."

    # Display paper title and original text
    print(f"Paper Title: {paper_title}")
    print(f"Original Text: {paper_text}")

    # Generate summary; summarize_text returns None on failure
    summary = summarize_text(paper_text)
    if summary:
        print(f"Summary: {summary}")
    else:
        # Make a missing/empty summary visible instead of printing nothing
        print("No summary was generated.")

except Exception as e:
    print(f"Error in example usage: {str(e)}")

# Save the model and tokenizer (independent of whether the demo succeeded)
try:
    model.save_pretrained(PROCESSED_DIR / 'summarization_model')
    tokenizer.save_pretrained(PROCESSED_DIR / 'summarization_tokenizer')
    print("\nSummarization model and tokenizer saved successfully.")

except Exception as e:
    print(f"Error saving model or tokenizer: {str(e)}")

Paper Title: Uso de herramientas digitales matem\'aticas en la Educaci\'on Secundaria
Original Text: Machine learning is a method of data analysis that automates analytical model building. It is a branch of artificial intelligence based on the idea that systems can learn from data, identify patterns, and make decisions with minimal human intervention.


2025-02-09 16:59:42.868739: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739116782.917084    4068 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739116782.933494    4068 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-09 16:59:43.048287: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Summary: machine learning is based on the idea that systems can learn from data, identify patterns, and make decisions with minimal human intervention.

Summarization model and tokenizer saved successfully.
