In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'hindi-headline:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4843372%2F8180842%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240421%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240421T104642Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D962bea6a5230df94fbfd2a4e386fe6a69d29a762f6243007100fc8175bb562714be80127da9b93fac0cd664519ffa4ded9d5999433976276545d45828df6403ea780adfdad2b2b1e75c09110b37f35abf7bddcceeca16a8ca245c850310e7972ac5b4f8f3d4ffc89fad162b4d3130b388e8cc6c886b2fc5bc0db29cc9eafd70398b46af0d28c2dbd98bec9e454f6f9351af5e41482d2cd3492ddbdcc0c641ba64474066065663dc9f38663675ce779908b20285fdb2432d3c24f63ccd5883e8a70ec364b26b73d63d969db1fe6c2d18800a0b033d3dca9d22b964eb7325ee7406fe36762d80f6035360825eb69a2a0d341d8fa1f4743c4b49d06f05ac7c5b02d'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading hindi-headline, 335003376 bytes compressed
Downloaded and uncompressed: hindi-headline
Data source import complete.


In [None]:
import pandas as pd
import json

### Conversting into dataframe

In [2]:
def load_jsonl_to_df(filepath):
    data = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            json_line = json.loads(line)
            data.append(json_line)
    df = pd.DataFrame(data)
    return df


In [3]:
train_df = load_jsonl_to_df('/kaggle/input/hindi-headline/hi_train.jsonl')
val_df = load_jsonl_to_df('/kaggle/input/hindi-headline/hi_dev.jsonl')
test_df = load_jsonl_to_df('/kaggle/input/hindi-headline/hi_test.jsonl')

In [4]:
train_df

Unnamed: 0,id,Document,Title,URL
0,1,कनाडा अमेरिका और यूरोपीय संघ का अनुसरण करते हु...,कनाडा ईरान पर से प्रतिबंध हटाएगा : विदेश मंत्री,
1,2,विदेशों में मूलधातुओं की कीमतों में कमजोरी के ...,"हाजिर मांग ने बढ़ाये तांबे के दाम, 0.18 प्रतिश...",
2,3,डेविड वॉर्नर पर क्रिकेट ऑस्ट्रेलिया ने 1 साल क...,डेविड वॉर्नर ने किया क्रिकेट के मैदान में वापस...,https://www.indiatv.in/sports/cricket-david-wa...
3,4,"अगर आपके पास फटे-पुराने नोट हैं, जिन्हें दुक...",किसी भी बैंक में बदल सकते हैं कटे-फटे और खरा...,
4,5,नोवेल लवासा ने देर रात बयान जारी कर कहा कि उन्...,आयकर विभाग के नाम से ईमेल भेज कर जानकारियां चु...,https://www.indiatv.in/india/national-election...
...,...,...,...,...
208086,208087,कराचीः पाकिस्तान के मुख्य चयनकर्ता इंज़माम उल ...,कोहली पर टिप्पणी को लेकर इंज़माम ने की एंडरसन ...,https://www.indiatv.in/sports/cricket-inzmam-c...
208087,208088,सेंट्रल बैंक ऑफ इंडिया वर्तमान में 4710 शाखाओं...,सेंट्रल बैंक शहरी क्षेत्रों में तैनात करेगा बै...,
208088,208089,वित्त मंत्रालय ने कॉरपोरेट भविष्य निधि(पीएफ) क...,कर छूट के लिए पीएफ ट्रस्ट को करना पड़ेगा एक सा...,
208089,208090,उत्तर प्रदेश सरकार ने दिग्गज वाहन कंपनियों मर्...,"मर्सिडीज, मार्कोपोलो को इकाई लगाने का न्योता",


In [5]:
print("Training Set Size:", train_df.shape)
print("Validation Set Size:", val_df.shape)
print("Test Set Size:", test_df.shape)

Training Set Size: (208091, 4)
Validation Set Size: (44718, 4)
Test Set Size: (44475, 4)


### Reducing the size of dataset

In [6]:
original_train_size = 208091
original_val_size = 44718
original_test_size = 44475
original_total_size = original_train_size + original_val_size + original_test_size

train_size = 8000

train_ratio = original_train_size / original_total_size
val_ratio = original_val_size / original_total_size
test_ratio = original_test_size / original_total_size

val_size = int(train_size * val_ratio / train_ratio)
test_size = int(train_size * test_ratio / train_ratio)

print("New Training Set Size:", train_size)
print("New Validation Set Size:", val_size)
print("New Test Set Size:", test_size)

New Training Set Size: 8000
New Validation Set Size: 1719
New Test Set Size: 1709


In [7]:
random_seed = 42

train_df = train_df.sample(n=train_size, random_state=random_seed)
val_df = val_df.sample(n=val_size, random_state=random_seed)
test_df = test_df.sample(n=test_size, random_state=random_seed)

# BART-base

In [None]:
import re

from transformers import BartForConditionalGeneration, BartTokenizer

from datasets import Dataset

from transformers import BartTokenizer

from transformers import Trainer, TrainingArguments

In [8]:
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

article_text = "कनाडा अमेरिका और यूरोपीय संघ का अनुसरण करते हुए ईरान पर लगा प्रतिबंध हटाएगा। यह बात देश के विदेश मंत्री स्टेफाने डियोन ने कही। संसद में एक सवाल के जवाब में डियोन ने कहा, 'कनाडा प्रतिबंध हटाएगा।' उन्होंने कहा, 'हम इस नीति में बदलाव लाएंगे। प्रतिबंध किसी के लिए अच्छा नहीं है।' इसके लिए कोई समयसीमा नहीं दी गई है। कनाडा ने ईरान और विश्व के प्रमुख देशों के बीच हुई सहमति लागू होने के कुछ हफ्तों के बाद कल यह घोषणा की। इस सहमति से अमेरिका और यूरोपीय संघ द्वारा ईरान के परमाणु कार्यक्रम के संबंध में लगाए गए प्रतिबंध को हटाने का रास्ता साफ हुआ।"
input_ids = tokenizer.encode(article_text, return_tensors='pt')

summary_ids = model.generate(input_ids, max_length=50, num_beams=4, early_stopping=True)
headline = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print(f"Generated Headline: {headline}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Generated Headline: कनाडा अमेरिका और यूरोपीय संघ क


In [9]:
train_df_downsampled = train_df.sample(n=8000, random_state=42)
val_df_downsampled = val_df.sample(n=1719, random_state=42)
test_df_downsampled = test_df.sample(n=1709, random_state=42)

In [10]:
def preprocess_text(text):
    text = re.sub(r"\s+", " ", text).strip()
    return text

train_df_downsampled['Document'] = train_df_downsampled['Document'].apply(preprocess_text)
train_df_downsampled['Title'] = train_df_downsampled['Title'].apply(preprocess_text)


In [11]:
!pip install datasets



In [12]:
train_dataset = Dataset.from_pandas(train_df_downsampled[['Document', 'Title']])
val_dataset = Dataset.from_pandas(val_df_downsampled[['Document', 'Title']])
test_dataset = Dataset.from_pandas(test_df_downsampled[['Document', 'Title']])

In [13]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

def tokenize_function(examples):
    
    model_inputs = tokenizer(examples['Document'], max_length=1024, truncation=True, padding="max_length")
   
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['Title'], max_length=128, truncation=True, padding="max_length")

    labels["input_ids"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in labels_example]
        for labels_example in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=['Document', 'Title'])
val_dataset = val_dataset.map(tokenize_function, batched=True, remove_columns=['Document', 'Title'])
test_dataset = test_dataset.map(tokenize_function, batched=True, remove_columns=['Document', 'Title'])

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]



Map:   0%|          | 0/1719 [00:00<?, ? examples/s]

Map:   0%|          | 0/1709 [00:00<?, ? examples/s]

In [15]:
!pip install accelerate -U
!pip install transformers[torch] -U




In [16]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.8849,0.868556
2,0.8135,0.826455
3,0.7804,0.802646


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams

TrainOutput(global_step=6000, training_loss=0.8674424489339193, metrics={'train_runtime': 4632.5373, 'train_samples_per_second': 5.181, 'train_steps_per_second': 1.295, 'total_flos': 1.463367499776e+16, 'train_loss': 0.8674424489339193, 'epoch': 3.0})

In [17]:
model.save_pretrained('./results/checkpoint-last')
tokenizer.save_pretrained('./results/checkpoint-last')

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('./results/checkpoint-last/tokenizer_config.json',
 './results/checkpoint-last/special_tokens_map.json',
 './results/checkpoint-last/vocab.json',
 './results/checkpoint-last/merges.txt',
 './results/checkpoint-last/added_tokens.json')

In [18]:
from tqdm.auto import tqdm

# def generate_headline(document_text, max_length=50):
    
#     input_ids = tokenizer(document_text, return_tensors='pt', max_length=1024, truncation=True).input_ids
#     input_ids = input_ids.to(model.device) 

#     summary_ids = model.generate(input_ids, max_length=max_length, num_beams=4, early_stopping=True)

#     headline = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
#     return headline

# generated_headlines = []

# for _, row in tqdm(test_df_downsampled.iterrows(), total=5, desc="Generating Headlines"):
#     document_text = row['Document']
#     generated_headline = generate_headline(document_text)
#     generated_headlines.append(generated_headline)

# test_df_downsampled['Generated_Headline'] = generated_headlines

# test_df_downsampled[['Document', 'Title', 'Generated_Headline']]

model_path = "./results/checkpoint-last"

tokenizer = BartTokenizer.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path)

def generate_headline(text, model, tokenizer):
    
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)

    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=50,
        num_beams=4,
        early_stopping=True
    )

    headline = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return headline

test_df['Generated_Headline'] = test_df['Document'].apply(lambda x: generate_headline(x, model, tokenizer))

test_df[['Document', 'Title','Generated_Headline']]

Generating Headlines:   0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,Document,Title,Generated_Headline
16068,जलवायु परिवर्तन समझौते के पेरिस मसौदे ने भारत ...,पेरिस मसौदे से भारत को निराशा,जलवायु परिवर्तन समझौते में व
261,अगर वादी और प्रतिवादी का रुतबा एक जैसा है तो क...,श्रम कानूनों में संशोधन पर विचार करे नई सरकार,संसाधनों के खिलाफ मुकदमा लड़ र
7538,वैज्ञानिकों ने एचआईवी संक्रमण का पता लगाने का ...,अमेरिका में वैज्ञानिकों ने संक्रमण का पता लगान...,एचआईवी संक्रमण का पता लगाने के
40598,"इंडियन इंस्टीट्यूट ऑफ टेक्नोलॉजी, दिल्ली, आज य...","2020 गेट 2020 एडमिट कार्ड आज होंगे जारी, यहां ...",2020 एडमिट कार्ड में जाएंगे पर करन
21229,भारत ने कैंडी में श्रीलंका के खिलाफ खेले जा रह...,धवन-राहुल की शानदार बल्लेबाजी के बाद श्रीलंका ...,श्रीलंका के खिलाफ खेले 329 रन बना �
...,...,...,...
6807,सीवीसी मौजूदा और अन्य लेखा परीक्षकों की रिपोर्...,"बैंकों, बीमा फर्मों की ऑडिट रिपोर्ट खंगाल रहा ...",धोखाधड़ी का पता लगाने और बीमा कं
27034,सिमरिया थाना क्षेत्र के लोबगा गांव से टीपीस...,"टीपीसी एरिया कमांडर समेत तीन गिरफ्तार, ऑटोमे...",निशांत के बोराशरीफ टेला बरवाड�
44203,नागरिकता संशोधन कानून (सीएए) के समर्थन में भ...,"सीएए के समर्थन में निकला जुलूस, 1 घंटे 6 मिन...",सीएए के समर्थन में मंच का 1 किम�
429,बंबई उच्च न्यायालय ने भारतीय कंपनियों के साथ स...,... विदेशी कंपनियों ने दी अदालत में चुनौती,भारतीय कंपनियों के साथ सौद मे�


## Evaluation Metrics

In [31]:
from pycocoevalcap.cider.cider import Cider

def calculate_cider(pred_dict, true_dict):
    cider_scorer = Cider()
    score, _ = cider_scorer.compute_score(true_dict, pred_dict)
    return score

true_dict = {i: [line] for i, line in enumerate(true_headlines)}
pred_dict = {i: [line] for i, line in enumerate(pred_headlines)}

cider_score = calculate_cider(pred_dict, true_dict)
print("CIDEr Score:", cider_score*100)

CIDEr Score: 29.921713688621253


In [24]:
pip install pynlpl

Collecting pynlpl
  Downloading PyNLPl-1.2.9.tar.gz (277 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m277.9/277.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rdflib (from pynlpl)
  Downloading rdflib-7.0.0-py3-none-any.whl (531 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m531.9/531.9 kB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
Collecting isodate<0.7.0,>=0.6.0 (from rdflib->pynlpl)
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: pynlpl
  Building wheel for pynlpl (setup.py) ... [?25l[?25hdone
  Created wheel for pynlpl: filename=PyNLPl-1.2.9-py3-none-any.whl size=328342 sha256=0b3f7852209e92945e2c32f9716949ba712de39598deeffe1ef418fd756e6955
  Stored in directory: /root/.cache/pip/wheels/f2/2f/ab/d2bb

In [25]:
from sacrebleu.metrics import CHRF

def calculate_chrf(predictions, references):
    chrf = CHRF(word_order=2)  
    scores = chrf.corpus_score(predictions, [references])
    return scores.score

chrf_score = calculate_chrf(pred_headlines, true_headlines)
print("chrF Score:", chrf_score)

chrF Score: 13.10995182108861


In [26]:
pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bert-score
Successfully installed bert-score-0.3.13


In [27]:
from bert_score import score

def calculate_bertscore(predictions, references):
    P, R, F1 = score(predictions, references, lang="en", verbose=True)
    return F1.mean().item()

bertscore = calculate_bertscore(pred_headlines, true_headlines)
print("BERTScore F1:", bertscore)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/54 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/27 [00:00<?, ?it/s]

done in 50.93 seconds, 33.55 sentences/sec
BERTScore F1: 0.9021744132041931
