# Install Packages

In [None]:
!pip install transformers datasets



# Import libraries

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer
from datasets import load_dataset
from torch.cuda.amp import GradScaler, autocast
import os
warnings.filterwarnings("ignore")

# Stage 1 : Data Collection

In [None]:
def load_data():
    dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:1000]")
    return dataset

# Stage 2: Data Preprocessing

In [None]:
class SummarizationDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_input_length=256, max_target_length=128):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        article = self.dataset[idx]["article"]
        summary = self.dataset[idx]["highlights"]

        article = "summarize: " + article

        input_encoding = self.tokenizer(
            article,
            max_length=self.max_input_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        target_encoding = self.tokenizer(
            summary,
            max_length=self.max_target_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": input_encoding["input_ids"].squeeze(),
            "attention_mask": input_encoding["attention_mask"].squeeze(),
            "labels": target_encoding["input_ids"].squeeze()
        }


# Stage 3 : Model Building

In [None]:
class CustomTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=256, nhead=4, num_encoder_layers=3, num_decoder_layers=3, dim_feedforward=1024, dropout=0.1):
        super(CustomTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = nn.Parameter(torch.zeros(1, 512, d_model))  # Increased to 512
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout
        )
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.d_model = d_model

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        src = self.embedding(src) * torch.sqrt(torch.tensor(self.d_model, dtype=torch.float))
        src = src + self.pos_encoder[:, :src.size(1), :]
        tgt = self.embedding(tgt) * torch.sqrt(torch.tensor(self.d_model, dtype=torch.float))
        tgt = tgt + self.pos_encoder[:, :tgt.size(1), :]

        output = self.transformer(src.transpose(0, 1), tgt.transpose(0, 1), src_mask, tgt_mask)
        output = self.fc_out(output)
        return output.transpose(0, 1)

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask



In [None]:
def load_model_and_tokenizer():
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model = CustomTransformer(vocab_size=tokenizer.vocab_size)
    return model, tokenizer

# Stage 4 : Model Training

In [None]:
def train_model(model, dataloader, tokenizer, epochs=1, device="cuda" if torch.cuda.is_available() else "cpu"):
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
    scaler = GradScaler()

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()

            decoder_input = labels[:, :-1]
            decoder_target = labels[:, 1:]

            tgt_mask = model.generate_square_subsequent_mask(decoder_input.size(1)).to(device)

            with autocast():
                output = model(input_ids, decoder_input, tgt_mask=tgt_mask)
                loss = criterion(output.reshape(-1, output.size(-1)), decoder_target.reshape(-1))

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(dataloader)}")

    # Save model and tokenizer
    torch.save(model.state_dict(), "summarizer_custom_model.pt")
    os.makedirs("summarizer_custom_model", exist_ok=True)
    tokenizer.save_pretrained("summarizer_custom_model")

# Stage 5 : Summarization

In [None]:
def summarize_text(model, tokenizer, text, max_length=128, min_length=30, device="cuda" if torch.cuda.is_available() else "cpu"):
    model.eval()
    model = model.to(device)

    text = "summarize: " + text

    encoding = tokenizer(
        text,
        max_length=256,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    input_ids = encoding["input_ids"].to(device)

    generated_ids = beam_search(model, tokenizer, input_ids, max_length, min_length, device)
    summary = tokenizer.decode(generated_ids, skip_special_tokens=True)
    return summary



In [None]:
def beam_search(model, tokenizer, input_ids, max_length, min_length, device, beam_size=4):
    model.eval()
    sequences = [(input_ids, 0.0)]
    for step in range(max_length):
        all_candidates = []
        for seq, score in sequences:
            decoder_input = seq[:, -1:].to(device) if step == 0 else seq[:, 1:].to(device)
            tgt_mask = model.generate_square_subsequent_mask(decoder_input.size(1)).to(device)
            output = model(input_ids, decoder_input, tgt_mask=tgt_mask)
            log_probs = torch.log_softmax(output[:, -1, :], dim=-1)
            topk_log_probs, topk_ids = log_probs.topk(beam_size)

            for i in range(beam_size):
                candidate_seq = torch.cat([seq, topk_ids[:, i].unsqueeze(1)], dim=1)
                candidate_score = score - topk_log_probs[0, i].item()
                all_candidates.append((candidate_seq, candidate_score))

        sequences = sorted(all_candidates, key=lambda x: x[1])[:beam_size]

        if step >= min_length and all(tokenizer.eos_token_id in seq[0] for seq, _ in sequences):
            break

    return sequences[0][0][0]

# Stage 6: Execution

In [49]:
def main():
    # Load data
    dataset = load_data()

    # Initialize model and tokenizer
    model, tokenizer = load_model_and_tokenizer()

    # Preprocess data
    train_dataset = SummarizationDataset(dataset, tokenizer)
    train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

    # Train model
    train_model(model, train_dataloader, tokenizer)

    # Test summarization
    sample_text = """
    The quick brown fox jumps over the lazy dog. This is a classic pangram used to test typewriters and keyboards.
    It contains every letter of the English alphabet. The fox is known for its agility and cunning, while the dog,
    in this case, is depicted as idle. This sentence has been used in various contexts to demonstrate text processing.
    The pangram is often employed in design and development to ensure that fonts and text rendering systems display
    all characters correctly. Its brevity and inclusivity make it a practical tool for testing.
    """
    summary = summarize_text(model, tokenizer, sample_text)
    print("Original Text:", sample_text)
    print("Original Text Word Count:", len(sample_text.split()))
    print("Summary:", summary)
    print("Summary Word Count:", len(summary.split()))

if __name__ == "__main__":
    main()

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-49-0f2141fe5d23>", line 30, in <cell line: 0>
    main()
  File "<ipython-input-49-0f2141fe5d23>", line 13, in main
    train_model(model, train_dataloader, tokenizer)
  File "<ipython-input-48-d6be2a9049b2>", line 26, in train_model
    scaler.scale(loss).backward()
  File "/usr/local/lib/python3.11/dist-packages/torch/_tensor.py", line 626, in backward
    torch.autograd.backward(
  File "/usr/local/lib/python3.11/dist-packages/torch/autograd/__init__.py", line 347, in backward
    _engine_run_backward(
  File "/usr/local/lib/python3.11/dist-packages/torch/autograd/graph.py", line 823, in _engine_run_backward
    return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

TypeError: object of type 'NoneType' has no len()

# Loading Custom Model

In [None]:
import torch
import torch.nn as nn
from transformers import T5Tokenizer
import os

class CustomTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=256, nhead=4, num_encoder_layers=3, num_decoder_layers=3, dim_feedforward=1024, dropout=0.1):
        super(CustomTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = nn.Parameter(torch.zeros(1, 256, d_model))  # Increased to 512
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout
        )
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.d_model = d_model

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        src = self.embedding(src) * torch.sqrt(torch.tensor(self.d_model, dtype=torch.float))
        src = src + self.pos_encoder[:, :src.size(1), :]
        tgt = self.embedding(tgt) * torch.sqrt(torch.tensor(self.d_model, dtype=torch.float))
        tgt = tgt + self.pos_encoder[:, :tgt.size(1), :]

        output = self.transformer(src.transpose(0, 1), tgt.transpose(0, 1), src_mask, tgt_mask)
        output = self.fc_out(output)
        return output.transpose(0, 1)

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask


def load_model_and_tokenizer(model_dir="summarizer_custom_model", device="cuda" if torch.cuda.is_available() else "cpu"):
    if not os.path.exists(model_dir):
        raise FileNotFoundError(
            f"Model directory '{model_dir}' not found. Ensure you have run the training script "
            "and the model was saved correctly."
        )

    try:
        tokenizer = T5Tokenizer.from_pretrained(model_dir)
        model = CustomTransformer(vocab_size=tokenizer.vocab_size)
        model.load_state_dict(torch.load('/content/summarizer_model.pt', map_location=device))
        model = model.to(device)
        model.eval()
        return model, tokenizer
    except Exception as e:
        raise RuntimeError(f"Error loading model or tokenizer: {str(e)}")

def summarize_text(model, tokenizer, text, max_length=128, min_length=30, device="cuda" if torch.cuda.is_available() else "cpu"):
    model.eval()
    model = model.to(device)

    # Add prefix
    text = "summarize: " + text

    # Tokenize input text
    encoding = tokenizer(
        text,
        max_length=256,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    input_ids = encoding["input_ids"].to(device)

    generated_ids = beam_search(model, tokenizer, input_ids, max_length, min_length, device)
    summary = tokenizer.decode(generated_ids, skip_special_tokens=True)
    return summary

def beam_search(model, tokenizer, input_ids, max_length, min_length, device, beam_size=4):
    model.eval()
    sequences = [(input_ids, 0.0)]
    for step in range(max_length):
        all_candidates = []
        for seq, score in sequences:
            decoder_input = seq[:, -1:].to(device) if step == 0 else seq[:, 1:].to(device)
            tgt_mask = model.generate_square_subsequent_mask(decoder_input.size(1)).to(device)
            output = model(input_ids, decoder_input, tgt_mask=tgt_mask)
            log_probs = torch.log_softmax(output[:, -1, :], dim=-1)
            topk_log_probs, topk_ids = log_probs.topk(beam_size)

            for i in range(beam_size):
                candidate_seq = torch.cat([seq, topk_ids[:, i].unsqueeze(1)], dim=1)
                candidate_score = score - topk_log_probs[0, i].item()
                all_candidates.append((candidate_seq, candidate_score))

        sequences = sorted(all_candidates, key=lambda x: x[1])[:beam_size]

        if step >= min_length and all(tokenizer.eos_token_id in seq[0] for seq, _ in sequences):
            break

    return sequences[0][0][0]


def main():
    try:
        model, tokenizer = load_model_and_tokenizer()
    except Exception as e:
        print(e)
        return

    sample_text = """
    ### **India: A Land of Diversity and Heritage**

#### **Introduction**

India, the seventh-largest country by land area and the second-most populous nation, is a land of immense diversity, rich history, and vibrant culture. Nestled in South Asia, India is bordered by Pakistan, China, Nepal, Bhutan, Bangladesh, and Myanmar, with the Indian Ocean to the south. It is known for its ancient civilization, vast landscapes, and contributions to global science, art, literature, and philosophy. From the towering Himalayas in the north to the serene backwaters of Kerala in the south, India is a nation that embodies unity in diversity.

This essay explores India's history, geography, culture, economy, governance, challenges, and its role on the global stage.

---

## **Historical Background**

### **Ancient India and Civilization**

India has one of the world's oldest civilizations, dating back to the Indus Valley Civilization (3300–1300 BCE). The cities of Harappa and Mohenjo-Daro exemplify advanced urban planning, with well-structured drainage systems and standardized brick sizes. The Vedic Age (1500–500 BCE) followed, introducing Sanskrit literature, early Hinduism, and the caste system.

During the Mauryan Empire (321–185 BCE), Emperor Ashoka played a significant role in spreading Buddhism across Asia. The Gupta Empire (4th–6th century CE) marked the golden age of Indian culture, advancing in mathematics (introduction of zero), astronomy, literature, and medicine.

### **Medieval India**

Medieval India witnessed the rise of regional kingdoms and the influence of Islam with the Delhi Sultanate (1206–1526) and later the Mughal Empire (1526–1857). The Mughal rulers, especially Akbar, encouraged religious tolerance and built architectural wonders like the Taj Mahal.

### **Colonial Era and Independence Struggle**

The British East India Company established control over India in the 18th century, exploiting its resources. The Revolt of 1857, also known as the First War of Independence, was the first major resistance against British rule. However, it was only in the 20th century, under the leadership of Mahatma Gandhi, Jawaharlal Nehru, Subhas Chandra Bose, and other freedom fighters, that India gained independence on **August 15, 1947**. The partition of India led to the creation of Pakistan, resulting in significant communal violence and mass migration.

---

## **Geography and Natural Resources**

### **Diverse Terrain**

India spans 3.28 million square kilometers and has a varied topography. The **Himalayas** form the northern boundary, acting as a natural defense and climate regulator. The **Gangetic plains** are the most fertile regions, supporting India's massive agricultural sector. The **Thar Desert** in Rajasthan and the **Deccan Plateau** in the south showcase the nation's geographic extremes.

### **Rivers and Climate**

Major rivers like the **Ganga, Yamuna, Brahmaputra, Godavari, Krishna, and Kaveri** are crucial for irrigation, hydroelectricity, and sustenance. India experiences diverse climatic conditions—tropical in the south, temperate in the north, and desert-like in the west.

### **Natural Resources**

India is rich in minerals such as coal, iron ore, bauxite, and petroleum. The country is also a leading producer of agricultural commodities like rice, wheat, spices, and cotton.

---

## **Cultural and Linguistic Diversity**

### **Languages**

India recognizes **22 official languages**, with Hindi and English serving as the primary means of communication. Over **1,600 dialects** are spoken, reflecting the nation’s linguistic diversity.

### **Religious Pluralism**

India is the birthplace of **Hinduism, Buddhism, Jainism, and Sikhism**. It is also home to large populations of **Muslims, Christians, Jews, and Zoroastrians**, demonstrating religious harmony despite occasional conflicts.

### **Festivals and Traditions**

Festivals like **Diwali, Holi, Eid, Christmas, Pongal, Baisakhi**, and **Navratri** symbolize India's cultural vibrancy. Weddings, traditional attire, and culinary diversity further enrich Indian heritage.

### **Art and Literature**

Indian classical music—**Hindustani and Carnatic**—along with dance forms like **Bharatanatyam, Kathak, Odissi, and Kathakali**, are globally recognized. Literature from ancient texts like the **Vedas, Upanishads, Mahabharata, and Ramayana** to modern works by **Rabindranath Tagore, R.K. Narayan, and Arundhati Roy** showcases India's intellectual richness.

---

## **Indian Economy: A Growing Global Power**

### **Agriculture**

India is one of the largest producers of **rice, wheat, sugarcane, pulses, and dairy products**. Despite its strong agrarian roots, challenges like small landholdings and unpredictable monsoons affect productivity.

### **Industrial Growth**

India's industrial sector, particularly **automobiles, textiles, steel, and pharmaceuticals**, has witnessed significant growth. The country is the world's largest vaccine producer and a hub for generic medicines.

### **Information Technology and Services**

India's IT industry, with giants like **TCS, Infosys, and Wipro**, contributes significantly to GDP. The outsourcing sector, particularly in **Bangalore, Hyderabad, and Pune**, has transformed India into a global IT powerhouse.

### **Infrastructure and Space Exploration**

India's infrastructure development includes **metros, highways, bullet trains, and smart cities**. The **Indian Space Research Organisation (ISRO)** has made remarkable strides, launching missions like **Chandrayaan (Moon) and Mangalyaan (Mars)** at record-low costs.

---

## **Government and Politics**

India is the world's **largest democracy**, following a **parliamentary system**. The **President** is the constitutional head, while the **Prime Minister** holds executive power.

### **Judiciary and Constitution**

The **Indian Constitution**, adopted in 1950, is the world's longest. It guarantees fundamental rights, secularism, and social justice. The Supreme Court ensures legal protection for all citizens.

### **Challenges in Governance**

* **Corruption**: A major issue, though initiatives like Digital India and Aadhaar have improved transparency.
* **Communalism and Casteism**: Though India is largely secular, caste-based and religious conflicts still occur.
* **Unemployment and Poverty**: Economic reforms have reduced poverty, but income disparity remains high.

---

## **Global Influence and Foreign Policy**

### **United Nations and International Relations**

India is a key player in global politics, advocating **peace, non-alignment, and South-South cooperation**. It is a member of the **G20, BRICS, and United Nations Security Council (aspiring for a permanent seat)**.

### **Defense and Nuclear Power**

India has a strong military force and is a **nuclear power**, with a “No First Use” policy. The country is enhancing defense capabilities through projects like **Make in India**.

### **Trade and Diplomacy**

India has trade relations with **the USA, European Union, China, Russia, and neighboring countries**. The Indian diaspora in the **Middle East, North America, and the UK** significantly contributes to foreign remittances.

---

## **Challenges and the Road Ahead**

### **Economic Inequality**

India, despite being an emerging economy, has significant **wealth disparity**. Policies promoting rural development and skill enhancement can bridge the gap.

### **Environmental Issues**

Deforestation, air pollution, and water scarcity threaten India's sustainability. Renewable energy initiatives like **solar and wind power** are promising solutions.

### **Education and Healthcare**

Improving **public healthcare and education** is essential. The **National Education Policy (NEP 2020)** and **Ayushman Bharat scheme** aim to transform these sectors.

---

## **Conclusion**

India, with its ancient legacy and modern aspirations, is on the path of progress. As a rapidly growing economy and a cultural powerhouse, India continues to play a crucial role on the world stage. By embracing **technology, education, sustainability, and social justice**, India can achieve unparalleled growth in the 21st century.

India’s journey—from an ancient civilization to a modern global leader—remains an inspiring tale of resilience and ambition. 🚀

---

Would you like me to add references or more details on a specific section? 😊

    """
    summary = summarize_text(model, tokenizer, sample_text)
    # print("Original Text:", sample_text)
    print("Summary Word Count:", len(sample_text.split()))
    print("Summary:", summary)
    print("Summary Word Count:", len(summary.split()))

if __name__ == "__main__":
    main()

Error loading model or tokenizer: Error(s) in loading state_dict for CustomTransformer:
	size mismatch for embedding.weight: copying a param with shape torch.Size([32100, 256]) from checkpoint, the shape in current model is torch.Size([32000, 256]).
	size mismatch for fc_out.weight: copying a param with shape torch.Size([32100, 256]) from checkpoint, the shape in current model is torch.Size([32000, 256]).
	size mismatch for fc_out.bias: copying a param with shape torch.Size([32100]) from checkpoint, the shape in current model is torch.Size([32000]).


# Fine Tuning

In [53]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import pickle
import os

def load_model_and_tokenizer(save_dir="t5_small_model", model_name="t5-small"):
    model_path = os.path.join(save_dir, "t5_small_model.pth")
    tokenizer_path = os.path.join(save_dir, "t5_small_tokenizer.pkl")

    if not (os.path.exists(model_path) and os.path.exists(tokenizer_path)):
        raise FileNotFoundError(f"Model or tokenizer not found in {save_dir}. Ensure t5_small_model.pth and t5_small_tokenizer.pkl exist.")

    model = T5ForConditionalGeneration.from_pretrained(model_name, local_files_only=False)
    model.load_state_dict(torch.load(model_path, map_location="cpu"))

    with open(tokenizer_path, "rb") as f:
        tokenizer = pickle.load(f)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    return model.to(device).eval(), tokenizer

def summarize_text(model, tokenizer, text, max_length=128, min_length=30, device="cuda" if torch.cuda.is_available() else "cpu"):
    text = "summarize: " + text
    encoding = tokenizer(
        text,
        max_length=256,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)
    generated_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        min_length=min_length,
        num_beams=4,
        length_penalty=1.0,
        early_stopping=True
    )
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

def main():
    try:
        model, tokenizer = load_model_and_tokenizer()
    except Exception as e:
        print(f"Error: {e}")
        return

    input_text = input("Enter text to summarize: ").strip()
    if not input_text:
        print("Error: Input text cannot be empty.")
        return

    summary = summarize_text(model, tokenizer, input_text)
    print("\nInput Text:", input_text)
    print("Summary:", summary)
    print("Summary Word Count:", len(summary.split()))

if __name__ == "__main__":
    main()

Enter text to summarize: ### **India: A Land of Diversity and Heritage**  #### **Introduction**  India, the seventh-largest country by land area and the second-most populous nation, is a land of immense diversity, rich history, and vibrant culture. Nestled in South Asia, India is bordered by Pakistan, China, Nepal, Bhutan, Bangladesh, and Myanmar, with the Indian Ocean to the south. It is known for its ancient civilization, vast landscapes, and contributions to global science, art, literature, and philosophy. From the towering Himalayas in the north to the serene backwaters of Kerala in the south, India is a nation that embodies unity in diversity.  This essay explores India's history, geography, culture, economy, governance, challenges, and its role on the global stage.  ---  ## **Historical Background**  ### **Ancient India and Civilization**  India has one of the world's oldest civilizations, dating back to the Indus Valley Civilization (3300–1300 BCE). The cities of Harappa and Moh