<a href="https://colab.research.google.com/github/MHmi1/smart-text-analysis-dj/blob/master/EN_Text_Summarization_DeepModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW
from datasets import load_dataset

# Load the BART tokenizer and model
model_name = 'facebook/bart-base'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Load the CNN/Daily Mail dataset
dataset = load_dataset('cnn_dailymail', '3.0.0', split='train')

# Get the source texts and summaries from the dataset
source_texts = dataset['article'][:100]   # Using the first 100 samples for demonstration
summaries = dataset['highlights'][:100]   # Corresponding highlights are the abstractive summaries

# Tokenize the source texts and summaries
tokenized_inputs = tokenizer(source_texts, padding=True, truncation=True, return_tensors='pt')
tokenized_targets = tokenizer(summaries, padding=True, truncation=True, return_tensors='pt')

# Ensure inputs and targets have the same length
assert len(tokenized_inputs['input_ids']) == len(tokenized_targets['input_ids'])

# Prepare the DataLoader for faster training
from torch.utils.data import DataLoader, TensorDataset

train_dataset = TensorDataset(tokenized_inputs['input_ids'], tokenized_inputs['attention_mask'],
                              tokenized_targets['input_ids'], tokenized_targets['attention_mask'])
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# Fine-tuning loop with transfer learning
optimizer = AdamW(model.parameters(), lr=1e-7)
num_epochs = 2

model.train()

for epoch in range(num_epochs):
    total_loss = 0

    for batch in train_loader:
        batch_inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'decoder_input_ids': batch[2],
            'decoder_attention_mask': batch[3]
        }

        optimizer.zero_grad()

        outputs = model(**batch_inputs, labels=batch_inputs['decoder_input_ids'])
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs} - Average Loss: {average_loss:.4f}")

# Save the fine-tuned model
output_model_dir = './fine_tuned_model/'
model.save_pretrained(output_model_dir)
tokenizer.save_pretrained(output_model_dir)




Epoch 1/2 - Average Loss: 11.0308
Epoch 2/2 - Average Loss: 10.7514


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json')

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

model_save_name = 'model.h5'
path = F"/content/gdrive/My Drive/{model_save_name}"
torch.save(model.state_dict(), path)

Mounted at /content/gdrive


In [19]:
model_name = 'facebook/bart-large-cnn'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Function for summarization with tuned hyperparameters
def generate_summary(input_text, model, tokenizer, max_length=150, min_length=50, length_penalty=2.0, num_beams=4):
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=max_length, truncation=True)

    # Generate the summary using the model with tuned hyperparameters
    with torch.no_grad():
        summary_ids = model.generate(inputs, max_length=max_length, min_length=min_length,
                                     length_penalty=length_penalty, num_beams=num_beams, early_stopping=True)


    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Example usage on the website
user_input = input("Enter the passage to summarize:\n")
summary = generate_summary(user_input, model, tokenizer)
print("\nSummary:")
print(summary)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Enter the passage to summarize:

Summary:
Scientists have developed a new deep learning model that can accurately predict earthquakes. The model uses a combination of seismic data, satellite imagery, and other environmental factors to forecast seismic activity. It has demonstrated high accuracy in predicting the timing, location, and magnitude of earthquakes.
