# Multilingual Text Summarization Service

In [None]:
# Install the Hugging Face packages this notebook imports below:
# `datasets` (for load_dataset) and `transformers` (for the T5 classes).
%pip install datasets
%pip install transformers

In [None]:
import os
import re

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer
from transformers import T5ForConditionalGeneration, T5Config

In [None]:
# Fetch version 2.0.0 of the CNN/DailyMail corpus from the Hugging Face Hub
dataset = load_dataset("abisee/cnn_dailymail", "2.0.0")

# Print the first three training records (article plus its reference highlights)
train_split = dataset['train']
for sample_no in range(1, 4):
    record = train_split[sample_no - 1]
    print(f"Sample {sample_no}")
    print("Article: ", record['article'])
    print("Highlights: ", record['highlights'])
    print("\n")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Sample 1
Article:  LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office ch

## Data Pre-processing

The contraction-to-expansion table below is taken from this Stack Overflow thread: https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python

In [None]:
# Lookup table mapping lower-case English contractions to their expanded form.
# Keys use the straight ASCII apostrophe. The entries "who've", "would've",
# "wouldn't've" and "you've" complete families that were only partly covered
# (e.g. "you'd"/"you'll"/"you're" were present but "you've" was not).
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
}

In [None]:
# Punctuation that may cling to a token (quotes, brackets, sentence marks).
# The apostrophe is deliberately excluded so keys such as "'cause" stay intact.
_EDGE_PUNCTUATION = '"“”.,;:!?()[]{}'


def _expand_contraction(word):
    """Expand `word` if it is a known contraction, keeping any surrounding punctuation."""
    if word in contractions:
        return contractions[word]

    core = word.strip(_EDGE_PUNCTUATION)
    if core in contractions:
        # Re-attach whatever punctuation was stripped from either side.
        lead = len(word) - len(word.lstrip(_EDGE_PUNCTUATION))
        return word[:lead] + contractions[core] + word[lead + len(core):]
    return word


def preprocess_text(text):
    """Lower-case the text, drop URLs, expand contractions and strip apostrophes.

    Args:
        text: Raw article or summary text (str).

    Returns:
        The cleaned text as a single string whose tokens are separated by
        single spaces (all whitespace runs, including newlines, are collapsed).
        Remaining apostrophes are replaced by a space, so possessives such as
        "radcliffe's" become "radcliffe s".
    """
    text = text.lower()

    # Normalise typographic apostrophes so curly-quoted contractions ("i’m")
    # hit the straight-apostrophe keys of the lookup table.
    text = text.replace('’', "'")

    # Remove URLs. Only the URL token itself is dropped; a greedy `.*` here
    # would swallow every word that follows the first link.
    text = re.sub(r'https?://\S+', '', text)

    # Replace contractions with their full form
    text = " ".join(_expand_contraction(word) for word in text.split())

    # Any apostrophe still present (possessives, unknown contractions) becomes a space
    text = text.replace("'", ' ')

    return text

In [None]:
train_split = dataset['train']

# Run every training article through preprocess_text
clean_originals = [preprocess_text(article) for article in train_split['article']]
print("Articles are cleaned successfully.")

# Same cleaning for the reference summaries
clean_highlights = [preprocess_text(summary) for summary in train_split['highlights']]

data = pd.DataFrame({'article': clean_originals, 'highlights': clean_highlights})

# preprocess_text always returns a string, so fillna() alone never fires on
# these columns. Rows that are empty or whitespace-only after cleaning would be
# written to CSV as empty fields (which pandas.read_csv parses as NaN by
# default), so they are replaced with the placeholder here as well.
for column, placeholder in (('article', "no article available"),
                            ('highlights', "no highlights available")):
    is_blank = data[column].str.strip().eq('')
    data[column] = data[column].mask(is_blank, placeholder).fillna(placeholder)

# Keep the plain lists in sync with the placeholder-filled frame
clean_originals, clean_highlights = data['article'].tolist(), data['highlights'].tolist()

print("Highlights are cleaned successfully.")

Articles are cleaned successfully.
Highlights are cleaned successfully.


In [None]:
# Show the first three cleaned article/summary pairs as a sanity check
for position in range(3):
    article_text = clean_originals[position]
    summary_text = clean_highlights[position]
    print("Clean Review Number", position + 1)
    print(article_text)
    print(summary_text)
    print()

Clean Review Number 1
london, england (reuters) -- harry potter star daniel radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on monday, but he insists the money will not cast a spell on him. daniel radcliffe as harry potter in "harry potter and the order of the phoenix" to the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "i do not plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an australian interviewer earlier this month. "i do not think i will be particularly extravagant. "the things i like buying are things that cost about 10 pounds -- books and cds and dvds." at 18, radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "hostel: part ii," currently six places below his number one movie on the uk box

In [None]:
# Colab-only step: mount the user's Google Drive under /content/drive so that
# files written there persist after the Colab session ends.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
file_path = '/content/drive/My Drive/Multilingual Text Summarization Service/cleaned_dataset.csv'

# Create the project folder on first run; to_csv does not create missing
# parent directories and would otherwise raise an OSError.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Persist the cleaned article/highlights pairs without the DataFrame index column
data.to_csv(file_path, index=False)

print(f"Cleaned dataset saved to {file_path}.")

Cleaned dataset saved to /content/drive/My Drive/Multilingual Text Summarization Service/cleaned_dataset.csv.
