In [1]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com




In [2]:
from datasets import load_dataset

df = load_dataset("knkarthick/dialogsum")

In [3]:
df

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [4]:
df['train'][1]['dialogue']

"#Person1#: Hello Mrs. Parker, how have you been?\n#Person2#: Hello Dr. Peters. Just fine thank you. Ricky and I are here for his vaccines.\n#Person1#: Very well. Let's see, according to his vaccination record, Ricky has received his Polio, Tetanus and Hepatitis B shots. He is 14 months old, so he is due for Hepatitis A, Chickenpox and Measles shots.\n#Person2#: What about Rubella and Mumps?\n#Person1#: Well, I can only give him these for now, and after a couple of weeks I can administer the rest.\n#Person2#: OK, great. Doctor, I think I also may need a Tetanus booster. Last time I got it was maybe fifteen years ago!\n#Person1#: We will check our records and I'll have the nurse administer and the booster as well. Now, please hold Ricky's arm tight, this may sting a little."

In [5]:
df['train'][1]['summary']

'Mrs Parker takes Ricky for his vaccines. Dr. Peters checks the record and then gives Ricky a vaccine.'

In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import TrainingArguments, Trainer




In [7]:
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")

In [8]:
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base")

**DATA EXPLORATION**

1.   DIMENSION




In [9]:
import pandas as pd
train_df=pd.DataFrame(df['train'])
test_df=pd.DataFrame(df['test'])
print("shape of training:",train_df.shape)
print("shape of test:",test_df.shape)

shape of training: (12460, 4)
shape of test: (1500, 4)


2.)STATISTICAL SUMMARY

In [10]:
print(train_df.describe(include="all"))

                 id                                           dialogue  \
count         12460                                              12460   
unique        12460                                              12458   
top     train_12459  #Person1#: Any plans tonight? \n#Person2#: Not...   
freq              1                                                  2   

                                                 summary     topic  
count                                              12460     12460  
unique                                             12436      7434  
top     #Person1# thanks #Person2# for #Person2#'s help.  shopping  
freq                                                   2       174  


3.)CHECK DUPLICATES


In [11]:
# Duplicates only in summaries
print("Duplicate summaries:", train_df["summary"].duplicated().sum())

# Which summaries are duplicated
dup_summaries = train_df[train_df["summary"].duplicated(keep=False)]
print(dup_summaries[["summary", "dialogue"]].head(10))
print("Duplicate dialogues:", train_df["dialogue"].duplicated().sum())
print("Duplicate dialogue-summary pairs:", train_df.duplicated(subset=["dialogue","summary"]).sum())
print("Duplicate topics:", train_df["topic"].duplicated().sum())


Duplicate summaries: 24
                                                summary  \
250   #Person1# explains to #Person2# the details ab...   
254   #Person1# thinks Tom is too old to be trick-or...   
1085   #Person1# thanks #Person2# for #Person2#'s help.   
1178  #Person1# is surprised at Taylor whose hobbies...   
1237  #Person1# complains about the strict dressing ...   
1550  #Person2# wants a trim. #Person1# serves #Pers...   
1578  #Person2# offers #Person1# some suggestions on...   
1976  #Person2# wants a trim. #Person1# serves #Pers...   
2026  #Person1# takes several pictures for #Person2#...   
2496  #Person1#'s going shopping for groceries but d...   

                                               dialogue  
254   #Person1#: Tom, aren't you a little too old to...  
1085  #Person1#: I'd like to take this opportunity t...  
1178  #Person1#: Are you going to watch the Indy Fiv...  
1237  #Person1#: I met a girl in elevator this morni...  
1550  #Person1#: Hello there! Come o

4.)Missing values


In [12]:
missing = train_df.isnull().sum()
missing_percent = (missing / len(train_df)) * 100
print("\nMissing Values per Column:")
print(pd.DataFrame({"Missing": missing, "Percentage": missing_percent}))


Missing Values per Column:
          Missing  Percentage
id              0         0.0
dialogue        0         0.0
summary         0         0.0
topic           0         0.0


5.)Descriptive Analysis

In [13]:
print("\nDescriptive Analysis of Dialogue Length:")
train_df["dialogue_length"] = train_df["dialogue"].apply(lambda x: len(x.split()))
print(train_df["dialogue_length"].describe())


Descriptive Analysis of Dialogue Length:
count    12460.000000
mean       130.986998
std         70.976634
min         40.000000
25%         85.000000
50%        116.000000
75%        163.000000
max        985.000000
Name: dialogue_length, dtype: float64


DATA PREPROCESSING

1.   REMOVING UNWANTED COLUMNS






In [14]:
train_df = train_df.drop(columns=["id"],errors="ignore")
test_df = test_df.drop(columns=["id"],errors="ignore")
train_df = train_df.drop(columns=["topic"],errors="ignore")
test_df = test_df.drop(columns=["topic"],errors="ignore")
train_df = train_df.drop(columns=["dialogue_length"],errors="ignore")
test_df = test_df.drop(columns=["dialogue_length"],errors="ignore")
train_df.head()

Unnamed: 0,dialogue,summary
0,"#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. ...","Mr. Smith's getting a check-up, and Doctor Haw..."
1,"#Person1#: Hello Mrs. Parker, how have you bee...",Mrs Parker takes Ricky for his vaccines. Dr. P...
2,"#Person1#: Excuse me, did you see a set of key...",#Person1#'s looking for a set of keys and asks...
3,#Person1#: Why didn't you tell me you had a gi...,#Person1#'s angry because #Person2# didn't tel...
4,"#Person1#: Watsup, ladies! Y'll looking'fine t...",Malik invites Nikki to dance. Nikki agrees if ...


 3. Clean text (HTML, URLs, junk chars, spaces)


In [15]:
import re
def clean_text(text):
    text = re.sub(r"<.*?>", "", text)                       # remove HTML tags
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)     # remove URLs
    text = re.sub(r"[^a-zA-Z0-9.,!?;:'\"()\-\s]", "", text) # remove junk chars
    text = re.sub(r"\s+", " ", text).strip()                # normalize spaces
    return text

train_df["dialogue"] = train_df["dialogue"].apply(clean_text)
train_df["summary"] = train_df["summary"].apply(clean_text)
test_df["dialogue"] = test_df["dialogue"].apply(clean_text)
test_df["summary"] = test_df["summary"].apply(clean_text)

**Tokenization**

1.Convert columns to Python lists




In [16]:
train_dialogues = train_df["dialogue"].tolist()
train_summaries = train_df["summary"].tolist()

test_dialogues = test_df["dialogue"].tolist()
test_summaries = test_df["summary"].tolist()

2) Tokenize source (dialogues) — batch encode, returns dict with input_ids & attention_mask

In [17]:
train_source_encodings = tokenizer(
    train_dialogues,
    truncation=True,
    padding="max_length",
    max_length=512
)
# train_source_encodings is a dict: {"input_ids": [...], "attention_mask": [...]}

test_source_encodings = tokenizer(
    test_dialogues,
    truncation=True,
    padding="max_length",
    max_length=512
)

3) Tokenize targets (summaries) using tokenizer.as_target_tokenizer()


In [18]:
#    This ensures target tokenization follows seq2seq conventions
with tokenizer.as_target_tokenizer():
    train_target_encodings = tokenizer(
        train_summaries,
        truncation=True,
        padding="max_length",
        max_length=128
    )
    test_target_encodings = tokenizer(
        test_summaries,
        truncation=True,
        padding="max_length",
        max_length=128
    )
# train_target_encodings is a dict with "input_ids" for labels




4.Convert pad_token_id in target input_ids to -100 (so loss ignores padding)

In [19]:
#    We'll create a new `labels` list where every pad token becomes -100
pad_id = tokenizer.pad_token_id

train_labels = []
for label_seq in train_target_encodings["input_ids"]:
    # map pad -> -100
    mapped = [(tok if tok != pad_id else -100) for tok in label_seq]
    train_labels.append(mapped)

test_labels = []
for label_seq in test_target_encodings["input_ids"]:
    mapped = [(tok if tok != pad_id else -100) for tok in label_seq]
    test_labels.append(mapped)

5) Assemble final encodings dictionaries

In [20]:
train_encodings = {
    "input_ids": train_source_encodings["input_ids"],
    "attention_mask": train_source_encodings["attention_mask"],
    "labels": train_labels
}

test_encodings = {
    "input_ids": test_source_encodings["input_ids"],
    "attention_mask": test_source_encodings["attention_mask"],
    "labels": test_labels
}

In [21]:
print("Train examples:", len(train_encodings["input_ids"]))
print("Test examples:", len(test_encodings["input_ids"]))

print("Example input_ids length:", len(train_encodings["input_ids"][0]))
print("Example attention_mask length:", len(train_encodings["attention_mask"][0]))
print("Example labels length:", len(train_encodings["labels"][0]))
print("First 10 label tokens (showing -100 for pads):", train_encodings["labels"][8][:100])

Train examples: 12460
Test examples: 1500
Example input_ids length: 512
Example attention_mask length: 512
Example labels length: 128
First 10 label tokens (showing -100 for pads): [0, 41761, 134, 924, 10, 3280, 3034, 3737, 7, 18404, 176, 4, 18404, 176, 4265, 24, 18, 205, 8, 40, 185, 24, 4, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]


In [22]:
import torch
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    print("CUDA runtime version:", torch.version.cuda)
    print("cuDNN version:", torch.backends.cudnn.version())

CUDA available: True
GPU: NVIDIA GeForce RTX 3050 A Laptop GPU
CUDA runtime version: 12.1
cuDNN version: 90100


**FINE TUNING THE MODEL**

In [23]:
import torch

class Seq2SeqDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

train_dataset = Seq2SeqDataset(train_encodings)
test_dataset = Seq2SeqDataset(test_encodings)


In [24]:
training_args = TrainingArguments( output_dir="/content",
per_device_train_batch_size=8,
num_train_epochs=2,
remove_unused_columns=False)


In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.train()


Step,Training Loss
500,1.8463
1000,1.6437
1500,1.566
2000,1.3704
2500,1.3172
3000,1.2966




TrainOutput(global_step=3116, training_loss=1.499437794911846, metrics={'train_runtime': 8681.8061, 'train_samples_per_second': 2.87, 'train_steps_per_second': 0.359, 'total_flos': 7597316269670400.0, 'train_loss': 1.499437794911846, 'epoch': 2.0})

In [26]:
# Evaluate the model
eval_results = trainer.evaluate()

# Print evaluation results
print(eval_results)

{'eval_loss': 1.6155582666397095, 'eval_runtime': 43.0609, 'eval_samples_per_second': 34.834, 'eval_steps_per_second': 4.366, 'epoch': 2.0}


#### SAVING THE MODEL

In [30]:
trainer.save_model("final_model")
tokenizer.save_pretrained("final_model")



('final_model\\tokenizer_config.json',
 'final_model\\special_tokens_map.json',
 'final_model\\vocab.json',
 'final_model\\merges.txt',
 'final_model\\added_tokens.json',
 'final_model\\tokenizer.json')