<a href="https://colab.research.google.com/github/crockrocks/text-summarization/blob/main/GetToThePoint.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Kaggle CLI; it is used further down to download the dataset from Kaggle.
! pip install kaggle



In [None]:
# Mount Google Drive into the Colab VM at /content/drive.
# NOTE(review): no later cell visible here reads from /content/drive (kaggle.json is
# copied from the working directory) — confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! mkdir ~/.kaggle
! cp ./kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the CNN/DailyMail summarization dataset archive into the current directory.
! kaggle datasets download -d gowrishankarp/newspaper-text-summarization-cnn-dailymail

Downloading newspaper-text-summarization-cnn-dailymail.zip to /content
 99% 497M/503M [00:05<00:00, 148MB/s]
100% 503M/503M [00:05<00:00, 103MB/s]


In [None]:
! unzip newspaper-text-summarization-cnn-dailymail

Archive:  newspaper-text-summarization-cnn-dailymail.zip
  inflating: cnn_dailymail/test.csv  
  inflating: cnn_dailymail/train.csv  
  inflating: cnn_dailymail/validation.csv  


# Using TF-IDF

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import textwrap
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def load_dataset(dataset_path):
    """Read a CSV split and return it without the ``id`` column or incomplete rows.

    Parameters
    ----------
    dataset_path : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of the file. Rows containing any null value are
        removed; surviving rows keep their original index labels.

    The number of remaining rows is printed as a side effect.
    """
    frame = pd.read_csv(dataset_path).drop(columns=['id']).dropna()
    print("Number of records:", len(frame))
    return frame

In [None]:
# Load the training split and preview its first rows.
df_train = load_dataset(r'./cnn_dailymail/train.csv')
df_train.head()

Number of records: 287113


Unnamed: 0,article,highlights
0,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [None]:
def wrap(x):
    """Return ``x`` re-flowed to textwrap's default line width.

    Newlines already present in the input are not replaced with spaces, and
    sentence endings are normalised so they are followed by two spaces.
    """
    wrapper = textwrap.TextWrapper(replace_whitespace=False, fix_sentence_endings=True)
    return wrapper.fill(x)

# Preview one training article (index label 1) with line wrapping applied.
print(wrap(df_train['article'][1]))

(CNN) -- Ralph Mata was an internal affairs lieutenant for the Miami-
Dade Police Department, working in the division that investigates
allegations of wrongdoing by cops.  Outside the office, authorities
allege that the 45-year-old longtime officer worked with a drug
trafficking organization to help plan a murder plot and get guns.  A
criminal complaint unsealed in U.S. District Court in New Jersey
Tuesday accuses Mata, also known as "The Milk Man," of using his role
as a police officer to help the drug trafficking organization in
exchange for money and gifts, including a Rolex watch.  In one
instance, the complaint alleges, Mata arranged to pay two assassins to
kill rival drug dealers.  The killers would pose as cops, pulling over
their targets before shooting them, according to the complaint.
"Ultimately, the (organization) decided not to move forward with the
murder plot, but Mata still received a payment for setting up the
meetings," federal prosecutors said in a statement.  The co

In [None]:
# English stop words from NLTK are excluded from the TF-IDF vocabulary.
stop_words = stopwords.words("english")
# NOTE(review): norm='l1' makes every non-empty row sum to 1, so the mean of a row's
# non-zero entries (see get_sentence_score) equals 1 / (number of distinct terms in
# the sentence). The ranking therefore favours short sentences — confirm this is intended.
tfidf = TfidfVectorizer(stop_words=stop_words,norm='l1')

def get_sentence_score(tfidf_row):
    """Score one sentence as the mean of the non-zero entries of its TF-IDF row."""
    return tfidf_row[tfidf_row != 0].mean()

def summarize(text, top_n=5):
    """Print the ``top_n`` highest-scoring sentences of ``text`` as an extractive summary.

    The text is split into sentences, the module-level ``tfidf`` vectorizer is
    re-fitted on those sentences, and each sentence is scored with
    ``get_sentence_score``. Sentences are printed in descending score order
    (not document order), each prefixed with its score.

    Parameters
    ----------
    text : str
        Document to summarise.
    top_n : int, default 5
        Maximum number of sentences to print.

    Returns
    -------
    None
    """
    # Extract sentences
    sents = sent_tokenize(text)
    if not sents:
        # Nothing to rank; fitting the vectorizer on an empty list would fail.
        print("Summary:")
        return

    X = tfidf.fit_transform(sents)

    # One score per sentence (one row of the TF-IDF matrix each)
    scores = np.array(
        [get_sentence_score(X[i, :]) for i in range(len(sents))], dtype=float
    )

    # Sentence indices ordered from highest to lowest score
    sort_idx = np.argsort(-scores)

    # print summary
    print("Summary:")
    for i in sort_idx[:top_n]:
        # "%.2f" prints two decimals; the earlier "%2f" only set a minimum width
        print(wrap("%.2f: %s" % (scores[i], sents[i])))

In [None]:
# Run the TF-IDF summarizer on the training article with index label 1.
summarize(df_train['article'][1])

Summary:
0.200000: CNN's Suzanne Presto contributed to this report.
0.166667: He is scheduled to appear in federal court in Florida on
Wednesday.
0.166667: If convicted, Mata could face life in prison.
0.142857: Since March 2010, he had been working in the internal
affairs division.
0.142857: Authorities arrested Mata on Tuesday in Miami Gardens,
Florida.


# Using Hugging Face Transformers

In [None]:
# Using transformers from hugging face
! pip install transformers
! pip install datasets
! pip install rouge_score

Collecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m67.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m70.6 MB/s[0m eta [36m0:00:0

In [None]:
from transformers import pipeline
from datasets import load_metric
from rouge_score import rouge_scorer

In [None]:
# Build a Hugging Face summarization pipeline backed by the facebook/bart-large-cnn checkpoint.
summarizer = pipeline("summarization" , model='facebook/bart-large-cnn')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Model details: https://huggingface.co/facebook/bart-large-cnn

In [None]:
def summary_trf(num):
    """Summarise one training article with the ``summarizer`` pipeline and report ROUGE.

    Prints the wrapped article, the generated summary, and the ROUGE-1/2/L
    F-measures of the summary against the article's reference highlights.

    Parameters
    ----------
    num : index label of the row in ``df_train`` to summarise.

    Returns
    -------
    str
        The generated summary text.
    """
    original_article = df_train['article'][num]
    print('Original Article\n')
    print(wrap(original_article))

    # truncation=True clips articles that exceed the model's maximum input length;
    # without it, over-long articles make the model fail instead of being summarised.
    generated_summary = summarizer(original_article, truncation=True)[0]['summary_text']

    print('\nGenerated Summary\n')
    print(wrap(generated_summary))

    # Calculate ROUGE scores using rouge_score (reference first, prediction second)
    reference = df_train['highlights'][num]
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(reference, generated_summary)

    print('\nROUGE Scores')
    print(f"ROUGE-1: {rouge_scores['rouge1'].fmeasure:.4f}")
    print(f"ROUGE-2: {rouge_scores['rouge2'].fmeasure:.4f}")
    print(f"ROUGE-L: {rouge_scores['rougeL'].fmeasure:.4f}")

    return generated_summary

# Summarise the article at index label 1; the returned summary is the cell's output value.
summary_trf(1)

Original Article

(CNN) -- Ralph Mata was an internal affairs lieutenant for the Miami-
Dade Police Department, working in the division that investigates
allegations of wrongdoing by cops.  Outside the office, authorities
allege that the 45-year-old longtime officer worked with a drug
trafficking organization to help plan a murder plot and get guns.  A
criminal complaint unsealed in U.S. District Court in New Jersey
Tuesday accuses Mata, also known as "The Milk Man," of using his role
as a police officer to help the drug trafficking organization in
exchange for money and gifts, including a Rolex watch.  In one
instance, the complaint alleges, Mata arranged to pay two assassins to
kill rival drug dealers.  The killers would pose as cops, pulling over
their targets before shooting them, according to the complaint.
"Ultimately, the (organization) decided not to move forward with the
murder plot, but Mata still received a payment for setting up the
meetings," federal prosecutors said in a 

'Ralph Mata, 45, was an internal affairs lieutenant for the Miami-Dade Police Department. Authorities allege he worked with a drug trafficking organization to help plan a murder plot. The complaint also alleges that Mata used his police badge to purchase weapons for drug traffickers. Mata faces charges of aiding and abetting a conspiracy to distribute cocaine.'

In [None]:
# Load the test split directly; no column dropping or null filtering is applied here,
# unlike load_dataset.
df_test = pd.read_csv('./cnn_dailymail/test.csv')

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration, pipeline
from datasets import load_metric
from rouge_score import rouge_scorer

# Load BART model and tokenizer
# NOTE(review): this is the same checkpoint the `summarizer` pipeline was created
# from earlier, so the weights are presumably held in memory twice — confirm this is intended.
model_name = "facebook/bart-large-cnn"
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

#custom summarization pipeline
def custom_summarization(text):
    """Generate a summary of ``text`` with the module-level BART ``model`` and ``tokenizer``.

    The input is tokenised and truncated to 1024 tokens, then decoded with
    4-beam search into a summary of between 40 and 150 tokens.

    Parameters
    ----------
    text : str
        Article to summarise.

    Returns
    -------
    str
        The decoded summary with special tokens removed.
    """
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    # Keep the tensors on the same device as the model (a no-op while both are on CPU).
    inputs = inputs.to(model.device)
    summary_ids = model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly instead of leaving generate() to infer it.
        attention_mask=inputs["attention_mask"],
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Load ROUGE scorer
# Built once at module level and reused by evaluate_summary; computes ROUGE-1,
# ROUGE-2 and ROUGE-L with stemming enabled.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Function to generate and evaluate summary
def evaluate_summary(num):
    """Summarise one test article with ``custom_summarization`` and print its ROUGE scores.

    Prints the wrapped article, the generated summary, and the ROUGE-1/2/L
    F-measures computed by the module-level ``scorer`` against the article's
    reference highlights.

    Parameters
    ----------
    num : index label of the row in ``df_test`` to evaluate.

    Returns
    -------
    str
        The generated summary text.
    """
    article_text = df_test['article'][num]
    print('Original Article\n')
    print(wrap(article_text))

    summary_text = custom_summarization(article_text)

    print('\nGenerated Summary\n')
    print(wrap(summary_text))

    # Reference first, prediction second
    reference_text = df_test['highlights'][num]
    rouge_scores = scorer.score(reference_text, summary_text)

    print('\nROUGE Scores')
    for label, key in (("ROUGE-1", "rouge1"), ("ROUGE-2", "rouge2"), ("ROUGE-L", "rougeL")):
        print(f"{label}: {rouge_scores[key].fmeasure:.4f}")

    return summary_text

# Loop through the first 10 elements in df_test
for i in range(min(10, len(df_test))):
    print(f"\n\nEvaluating article {i+1}")
    try:
        generated_summary = evaluate_summary(i)
    except (KeyError, IndexError) as exc:
        # A missing row label surfaces from pandas as KeyError, which the earlier
        # `except IndexError` never caught. Report the real exception either way
        # so a failure inside the model is not mislabelled.
        print(f"Error: Index out of range for article {i+1} ({exc!r})")



Evaluating article 1
Original Article

Ever noticed how plane seats appear to be getting smaller and smaller?
With increasing numbers of people taking to the skies, some experts
are questioning if having such packed out planes is putting passengers
at risk.  They say that the shrinking space on aeroplanes is not only
uncomfortable - it's putting our health and safety in danger.  More
than squabbling over the arm rest, shrinking space on planes putting
our health and safety in danger?  This week, a U.S consumer advisory
group set up by the Department of Transportation said at a public
hearing that while the government is happy to set standards for
animals flying on planes, it doesn't stipulate a minimum amount of
space for humans.  'In a world where animals have more rights to space
and food than humans,' said Charlie Leocha, consumer representative on
the committee. 'It is time that the DOT and FAA take a stand for
humane treatment of passengers.'  But could crowding on planes lead t

The model above is deployed as a Hugging Face Space: https://huggingface.co/spaces/crockrocks/text-summarization