In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/clan-data/CLAN_data.csv
/kaggle/input/clan-data/CLAN_data_cleaned.csv
/kaggle/input/clandataset/test_data_cleaned.csv
/kaggle/input/clandataset/CLAN_data.csv
/kaggle/input/clandataset/test.csv
/kaggle/input/clandataset/CLAN_data_cleaned.csv


In [13]:
pip install transformers datasets evaluate rouge-score sacrebleu bert-score --upgrade --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m75.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0mm
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.4/481.4 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25hNote: you may need to restart the kernel to use updated packages.


In [22]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration, 
    Trainer, TrainingArguments, 
    DataCollatorForSeq2Seq, TrainerCallback
)
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import score as bertscore

In [23]:
# Report whether PyTorch can see a CUDA device.
# `is_available` has to be *called*: the bare function object is always truthy,
# which would make this check take the first branch even on a CPU-only machine.
gpu_available = torch.cuda.is_available()
if gpu_available:
    print("Mal hain")
else:
    print("nahi hain")

Mal hain


In [24]:
# --- 2. Load and Prepare Data ---
# Keep only the source post and its target claim, then discard rows with
# missing values and exact duplicate (post, claim) pairs.
df = (
    pd.read_csv("/kaggle/input/clandataset/CLAN_data_cleaned.csv")
    [["Social Media Post", "Normalized Claim"]]
    .dropna()
    .drop_duplicates()
)

# Drop rows where either column is empty once surrounding whitespace is removed.
post_is_blank = df["Social Media Post"].str.strip() == ""
claim_is_blank = df["Normalized Claim"].str.strip() == ""
df = df[~(post_is_blank | claim_is_blank)]

# 70 / 15 / 15 split: sample the training rows first, then halve the remainder
# into validation and test. Fixed seeds keep the split reproducible.
train_df = df.sample(frac=0.7, random_state=42)
temp_df = df.drop(index=train_df.index)
val_df = temp_df.sample(frac=0.5, random_state=42)
test_df = temp_df.drop(index=val_df.index)

# Convert each split to a `datasets.Dataset` with a fresh 0..n-1 index.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (train_df, val_df, test_df)
)


# --- 3. Tokenizer and Model ---
# Both the tokenizer and the seq2seq model start from the same pretrained checkpoint.
base_checkpoint = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(base_checkpoint, legacy=False)
model = T5ForConditionalGeneration.from_pretrained(base_checkpoint)


# --- 4. Preprocessing ---
def preprocess(example):
    """Tokenize one row into model inputs plus loss-ready labels.

    The post is prefixed with "normalize: " and padded/truncated to 256 tokens;
    the claim is padded/truncated to 128 tokens. Pad positions in the labels
    are replaced by -100.

    Args:
        example: mapping with "Social Media Post" and "Normalized Claim" strings.

    Returns:
        The tokenizer output for the post, with an added "labels" list.
    """
    source_text = "normalize: " + example["Social Media Post"]
    features = tokenizer(
        source_text,
        truncation=True,
        padding="max_length",
        max_length=256,
    )

    target_ids = tokenizer(
        example["Normalized Claim"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )["input_ids"]

    # Swap padding ids for -100 in the label sequence.
    pad_id = tokenizer.pad_token_id
    features["labels"] = [-100 if tok == pad_id else tok for tok in target_ids]

    return features

# Tokenize every split with `preprocess`, in train / val / test order.
train_ds, val_ds, test_ds = (
    split.map(preprocess) for split in (train_ds, val_ds, test_ds)
)

# Have the datasets hand back torch tensors when indexed.
for split in (train_ds, val_ds, test_ds):
    split.set_format(type="torch")


# --- 5. Evaluation Callback ---
class EvalMetricsCallback(TrainerCallback):
    """Print BLEU-4, ROUGE-L and BERTScore each time the Trainer evaluates.

    Relies on the notebook-level ``model``, ``tokenizer`` and ``val_ds``.
    Only the first 100 validation examples (or fewer, if the split is smaller)
    are decoded and scored.
    """

    def on_evaluate(self, args, state, control, **kwargs):
        """Generate predictions for a validation subset and print averaged metrics."""
        model.eval()
        n_samples = min(100, len(val_ds))
        predictions = []
        references = []

        for idx in range(n_samples):
            row = val_ds[idx]
            # Add a batch dimension and move the tensors to the model's device.
            batch_ids = row["input_ids"].unsqueeze(0).to(model.device)
            batch_mask = row["attention_mask"].unsqueeze(0).to(model.device)

            with torch.no_grad():
                generated = model.generate(
                    input_ids=batch_ids,
                    attention_mask=batch_mask,
                    max_length=128,
                    num_beams=4
                )

            predictions.append(tokenizer.decode(generated[0], skip_special_tokens=True))

            # Label positions set to -100 are dropped before decoding the reference.
            gold_ids = [tok for tok in row["labels"] if tok != -100]
            references.append(tokenizer.decode(gold_ids, skip_special_tokens=True))

        # Compute BLEU-4 on whitespace tokens, one reference per prediction.
        smoother = SmoothingFunction().method1
        bleu_scores = []
        for hyp, gold in zip(predictions, references):
            bleu_scores.append(
                sentence_bleu([gold.split()], hyp.split(), smoothing_function=smoother)
            )
        print(f"\nEpoch {int(state.epoch)} Metrics:")
        print("BLEU-4    :", round(sum(bleu_scores) / len(bleu_scores), 4))

        # ROUGE-L F-measure (reference first, prediction second).
        rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
        rouge_scores = []
        for hyp, gold in zip(predictions, references):
            rouge_scores.append(rouge.score(gold, hyp)['rougeL'].fmeasure)
        print("ROUGE-L   :", round(sum(rouge_scores) / len(rouge_scores), 4))

        # BERTScore: only the mean F1 is reported.
        _precision, _recall, f1 = bertscore(predictions, references, lang="en", verbose=False)
        print("BERTScore :", round(f1.mean().item(), 4))


# --- 6. Training ---
OUTPUT_DIR = "./t5_clan"  # checkpoints, final model and tokenizer are all written here

# Evaluate, log and checkpoint once per epoch; keep at most one checkpoint on
# disk and reload the best one when training ends.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=5,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to="none",
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EvalMetricsCallback()],
)

trainer.train()

# Persist the fine-tuned weights and the tokenizer side by side so the
# directory can be reloaded with `from_pretrained`.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)


# --- 7. Predict One Sample ---
# Run the first test example through the model as a batch of one.
sample = test_ds[0]
input_ids = sample["input_ids"].unsqueeze(0).to(model.device)
attn_mask = sample["attention_mask"].unsqueeze(0).to(model.device)

with torch.no_grad():
    output = model.generate(
        input_ids=input_ids,
        attention_mask=attn_mask,
        max_length=128,
    )

# Label positions set to -100 are removed before decoding the reference.
reference_ids = [tok for tok in sample["labels"] if tok != -100]
decoded_rows = (
    ("Input     :", sample["input_ids"]),
    ("Prediction:", output[0]),
    ("Reference :", reference_ids),
)

print("\nPrediction Example:")
for label, token_ids in decoded_rows:
    print(label, tokenizer.decode(token_ids, skip_special_tokens=True))


Map:   0%|          | 0/1597 [00:00<?, ? examples/s]

Map:   0%|          | 0/342 [00:00<?, ? examples/s]

Map:   0%|          | 0/342 [00:00<?, ? examples/s]

  trainer = Trainer(
  The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).


Epoch,Training Loss,Validation Loss
1,3.2131,2.910469
2,2.9375,2.804991
3,2.8765,2.76017
4,2.8211,2.739943
5,2.8024,2.732935



Epoch 1 Metrics:
BLEU-4    : 0.2163
ROUGE-L   : 0.386


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore : 0.8833





Epoch 2 Metrics:
BLEU-4    : 0.2224
ROUGE-L   : 0.3918


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore : 0.886





Epoch 3 Metrics:
BLEU-4    : 0.2357
ROUGE-L   : 0.4021


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore : 0.8956





Epoch 4 Metrics:
BLEU-4    : 0.2364
ROUGE-L   : 0.4021


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore : 0.8956





Epoch 5 Metrics:
BLEU-4    : 0.2375
ROUGE-L   : 0.4032


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore : 0.8958


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].



Prediction Example:
Input     : normalize: warm water therapy dr. d. mensah asare says that the main benefits of warm water are numerous. a group of japanese doctors confirmed that warm water is 100 effective in resolving some health problems?, including headache, migraine, high blood pressure, low blood pressure, pain of joints, sudden increasing and decreasing of heartbeat, epilepsy, increasing level of cholesterol, cough, bodily discomfort, golu pain, asthma, hooping cough, blockage of veins, diseases related to uterus urine, stomach problems, poor appetite, also all related diseases to the eyes, ear throat. how to use warm water? drink 4 glasses of warm water not above 100 degree centigrade temperature on empty stomach first thing in the morning and eat nothing until an hour later you may not be able to make 4 glasses at the beginning, but slowly you will. the warm water therapy will resolve these health issues within reasonable periods?? diabetes in 30 days? blood pressure in 30 

# Test Pipeline

In [26]:
# --- 1. Imports ---
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import score as bertscore
from tqdm import tqdm


# --- 2. Load Test Data ---
# Same cleaning as for the training data: keep the two text columns, drop
# missing values and duplicate pairs, then remove whitespace-only rows.
# NOTE(review): this path is not among the files listed by the first cell;
# confirm the "clan-complete" dataset is attached to the notebook.
df = (
    pd.read_csv("/kaggle/input/clan-complete/CLAN_data_cleaned_test.csv")
    [["Social Media Post", "Normalized Claim"]]
    .dropna()
    .drop_duplicates()
)
post_is_blank = df["Social Media Post"].str.strip() == ""
claim_is_blank = df["Normalized Claim"].str.strip() == ""
df = df[~(post_is_blank | claim_is_blank)]


# --- 3. Load Trained Model and Tokenizer ---
model_path = "./t5_clan"  # same as used in training

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Move the weights to the chosen device and switch to evaluation mode.
model.to(device)
model.eval()


# --- 4. Inference Function ---
def generate_prediction(text):
    """Generate a normalized claim for a single social media post.

    The post is prefixed with "normalize: ", truncated to 256 tokens and
    decoded with 4-beam search up to 128 tokens, using the notebook-level
    ``tokenizer``, ``model`` and ``device``.

    Args:
        text: raw post text.

    Returns:
        The decoded prediction with special tokens stripped.
    """
    batch = tokenizer(
        "normalize: " + text,
        max_length=256,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )

    with torch.no_grad():
        beams = model.generate(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            max_length=128,
            num_beams=4,
        )

    best = beams[0]
    return tokenizer.decode(best, skip_special_tokens=True)


# --- 5. Predict on Test Data ---
# One generation call per post, in row order, with a progress bar.
predictions = [generate_prediction(text) for text in tqdm(df["Social Media Post"])]

# Store the predictions alongside the inputs and references.
df["Predicted Claim"] = predictions


# --- 6. Evaluation Metrics ---
references = df["Normalized Claim"].tolist()
preds = df["Predicted Claim"].tolist()
scored_pairs = list(zip(references, preds))

# BLEU-4 on whitespace tokens, one reference per prediction, averaged over rows.
smoother = SmoothingFunction().method1
bleu_scores = []
for target, candidate in scored_pairs:
    bleu_scores.append(
        sentence_bleu([target.split()], candidate.split(), smoothing_function=smoother)
    )
bleu4 = round(sum(bleu_scores) / len(bleu_scores), 4)

# ROUGE-L F-measure (reference first, prediction second), averaged over rows.
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rouge_scores = []
for target, candidate in scored_pairs:
    rouge_scores.append(rouge.score(target, candidate)['rougeL'].fmeasure)
rougeL = round(sum(rouge_scores) / len(rouge_scores), 4)

# BERTScore: keep only the F1 tensor and report its mean.
_, _, f1 = bertscore(preds, references, lang="en", verbose=False)
bert_score = round(f1.mean().item(), 4)


# --- 7. Output Results ---
print("\nFinal Evaluation on Test Set:")
summary_rows = (
    ("BLEU-4    :", bleu4),
    ("ROUGE-L   :", rougeL),
    ("BERTScore :", bert_score),
)
for label, value in summary_rows:
    print(label, value)

# Save predictions to CSV (posts, references and predicted claims; no index column).
df.to_csv("T5_Test_Predictions.csv", index=False)


100%|██████████| 471/471 [03:35<00:00,  2.18it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Final Evaluation on Test Set:
BLEU-4    : 0.1573
ROUGE-L   : 0.3312
BERTScore : 0.8791


In [None]:
import shutil

# Zip the saved model directory into /kaggle/working/t5_clan.zip so it can be
# downloaded as a single file.
shutil.make_archive(
    base_name="/kaggle/working/t5_clan",
    format='zip',
    root_dir="/kaggle/working/t5_clan",
)
