# Global Article Summarization and Link Analysis
### Loading the required modules

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline

### Preparing the dataset

In [3]:
data = pd.read_csv("./data.csv")
data.head(3)

Unnamed: 0,article_id,source_id,source_name,author,title,description,url,url_to_image,published_at,content,category,full_content
0,89541,,International Business Times,Paavan MATHEMA,UN Chief Urges World To 'Stop The Madness' Of ...,UN Secretary-General Antonio Guterres urged th...,https://www.ibtimes.com/un-chief-urges-world-s...,https://d.ibtimes.com/en/full/4496078/nepals-g...,2023-10-30 10:12:35.000000,UN Secretary-General Antonio Guterres urged th...,Nepal,UN Secretary-General Antonio Guterres urged th...
1,89542,,Prtimes.jp,,RANDEBOOよりワンランク上の大人っぽさが漂うニットとベストが新登場。,[株式会社Ainer]\nRANDEBOO（ランデブー）では2023年7月18日(火)より公...,https://prtimes.jp/main/html/rd/p/000000147.00...,https://prtimes.jp/i/32220/147/ogp/d32220-147-...,2023-10-06 04:40:02.000000,"RANDEBOO2023718()WEB2023 Autumn Winter \n""Nepa...",Nepal,
2,89543,,VOA News,webdesk@voanews.com (Agence France-Presse),UN Chief Urges World to 'Stop the Madness' of ...,UN Secretary-General Antonio Guterres urged th...,https://www.voanews.com/a/un-chief-urges-world...,https://gdb.voanews.com/01000000-0a00-0242-60f...,2023-10-30 10:53:30.000000,"Kathmandu, Nepal UN Secretary-General Antonio...",Nepal,


In [4]:
data.tail(3)

Unnamed: 0,article_id,source_id,source_name,author,title,description,url,url_to_image,published_at,content,category,full_content
105372,781235,,NPR,Brigid McCarthy,Three hospitals ignored her gravely ill fiancé...,"Forty years ago, Sarah Lubarsky came home from...",https://www.npr.org/2023/11/29/1215016001/heal...,https://media.npr.org/assets/img/2023/11/23/sa...,2023-11-29 10:01:12,The photo from David and Sarah Lubarsky's wedd...,Home,The photo from David and Sarah Lubarsky's wedd...
105373,781240,,Forbes,"Gary Stern, Contributor, \n Gary Stern, Contri...",Kerber’s Farm: Bringing Farm To Table To Manha...,"A farmstand in Long Island, Kerber’s Farms has...",https://www.forbes.com/sites/garystern/2023/11...,https://imageio.forbes.com/specials-images/ima...,2023-11-29 13:44:33,Kerbers Farm: Bringing Farm To Table To Manhat...,Home,Kerber’s Farm: Bringing Farm To Table To Manha...
105374,781308,,Forbes,"Johan Hajji, Forbes Councils Member, \n Johan ...",Tips For Investing In Short-Term Rentals In Dubai,By exploring your options and keeping a few be...,https://www.forbes.com/sites/forbesbusinesscou...,https://imageio.forbes.com/specials-images/ima...,2023-11-29 14:00:00,Cofounder at UpperKey. Passionate about proper...,Home,Cofounder at UpperKey. Passionate about proper...


In [11]:
data.describe()

Unnamed: 0,article_id
count,105375.0
mean,195044.072987
std,199819.981007
min,363.0
25%,50405.5
50%,108862.0
75%,284507.5
max,781308.0


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105375 entries, 0 to 105374
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   article_id    105375 non-null  int64 
 1   source_id     24495 non-null   object
 2   source_name   105375 non-null  object
 3   author        97156 non-null   object
 4   title         105375 non-null  object
 5   description   104992 non-null  object
 6   url           105375 non-null  object
 7   url_to_image  99751 non-null   object
 8   published_at  105375 non-null  object
 9   content       105375 non-null  object
 10  category      105333 non-null  object
 11  full_content  58432 non-null   object
dtypes: int64(1), object(11)
memory usage: 9.6+ MB


In [4]:
rating_data = pd.read_csv("./rating.csv")
rating_data.head(3)

Unnamed: 0,article_id,source_id,source_name,author,title,description,url,url_to_image,published_at,content,category,article,title_sentiment
0,81664,,Forbes,"Elizabeth Brownfield, Contributor, \n Elizabet...",superstar chef yannick alléno brings refined f...,Now open in Mayfair at Four Seasons Hotel Lond...,https://www.forbes.com/sites/elizabethbrownfie...,https://imageio.forbes.com/specials-images/ima...,2023-11-01 03:27:21.000000,"Pavyllon London, at Four Seasons Hotel London ...",Monaco,"pavyllon london, at four seasons hotel london ...",Neutral
1,81667,,CNA,,nice claim top spot in ligue 1 with late win a...,Nice moved into provisional first place in the...,https://www.channelnewsasia.com/sport/nice-cla...,https://onecms-res.cloudinary.com/image/upload...,2023-10-27 21:28:48.000000,Nice moved into provisional first place in the...,Monaco,nice moved into provisional first place in the...,Positive
2,81694,time,Time,Christina Larson / AP,amphibians are the world’s most vulnerable spe...,"The world’s frogs, salamanders, newts, and oth...",https://time.com/6320467/amphibians-most-vulne...,https://api.time.com/wp-content/uploads/2023/1...,2023-10-04 17:36:18.000000,"The worlds frogs, salamanders, newts and other...",Madagascar,"the world’s frogs, salamanders, newts and othe...",Negative


### Handling Null Values

In [5]:
null_counts = data.isnull().sum()
null_counts

article_id          0
source_id       80880
source_name         0
author           8219
title              40
description       383
url                 0
url_to_image     5624
published_at        0
content             0
category           42
full_content    46943
dtype: int64

In [6]:
data[data['title'].isnull()].head(2)

Unnamed: 0,article_id,source_id,source_name,author,title,description,url,url_to_image,published_at,content,category,full_content
1139,91420,,kottke.org,Jason Kottke,,Is food in America better or worse than in oth...,https://kottke.org/23/10/0043175-is-food-in-am...,,2023-10-06 22:16:33.000000,Is food in America better or worse than in oth...,Peru,
16575,8362,,Thegospelcoalition.org,Scotty Smith,,“He (the Messiah—Jesus) shall judge between th...,https://www.thegospelcoalition.org/blogs/scott...,https://media.thegospelcoalition.org/wp-conten...,2023-10-08 11:28:57.000000,He (the MessiahJesus) shall judge between the ...,Somalia,


In [7]:
data['title'] = data['title'].fillna(data['content'])
data['title'].isnull().sum()

0

In [8]:
data[data['full_content'].isnull()].head(2)

Unnamed: 0,article_id,source_id,source_name,author,title,description,url,url_to_image,published_at,content,category,full_content
1,89542,,Prtimes.jp,,RANDEBOOよりワンランク上の大人っぽさが漂うニットとベストが新登場。,[株式会社Ainer]\nRANDEBOO（ランデブー）では2023年7月18日(火)より公...,https://prtimes.jp/main/html/rd/p/000000147.00...,https://prtimes.jp/i/32220/147/ogp/d32220-147-...,2023-10-06 04:40:02.000000,"RANDEBOO2023718()WEB2023 Autumn Winter \n""Nepa...",Nepal,
2,89543,,VOA News,webdesk@voanews.com (Agence France-Presse),UN Chief Urges World to 'Stop the Madness' of ...,UN Secretary-General Antonio Guterres urged th...,https://www.voanews.com/a/un-chief-urges-world...,https://gdb.voanews.com/01000000-0a00-0242-60f...,2023-10-30 10:53:30.000000,"Kathmandu, Nepal UN Secretary-General Antonio...",Nepal,


In [9]:
data['full_content'] = data['full_content'].fillna(data['content'])
data['full_content'].isnull().sum()

0

## Information Retrieval
Return the most relevant documents from the natural query using cosine similarity

In [10]:
%%time
def search_articles(query, data, topn=5):
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(data['full_content'])
    query_vector = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    article_indices = cosine_similarities.argsort()[::-1]
    relevant_articles = data.iloc[article_indices][:topn]
    return relevant_articles

query = "Israel hamas war"
results = search_articles(query, data, 10)

Wall time: 29.1 s


In [12]:
results.head(3)

Unnamed: 0,article_id,source_id,source_name,author,title,description,url,url_to_image,published_at,content,category,full_content
4922,965,bbc-news,BBC News,https://www.facebook.com/bbcnews,Israel-Gaza: We answer your questions,Our correspondents answer your questions on th...,https://www.bbc.co.uk/news/world-middle-east-6...,https://ichef.bbci.co.uk/news/1024/branded_new...,2023-10-16 20:24:36.000000,The world is still reeling from the unpreceden...,Qatar,The world is still reeling from the unpreceden...
81005,335867,abc-news,ABC News,FATIMA HUSSEIN Associated Press,A third round of US sanctions against Hamas fo...,The United States says it has imposed a third ...,https://abcnews.go.com/Business/wireStory/roun...,https://i.abcnewsfe.com/a/c4e6de43-1581-476f-b...,2023-11-14 13:50:35,WASHINGTON -- The United States on Tuesday sai...,Finance,WASHINGTON --The United States on Tuesday said...
46694,72866,cnn,CNN,"Abbas Al Lawati, Nadeen Ebrahim",Israel is at war with Hamas. Here's what to know,Israel has declared war on the Palestinian mil...,https://www.cnn.com/2023/10/09/middleeast/isra...,https://media.cnn.com/api/v1/images/stellar/pr...,2023-10-09 16:46:21.000000,Editors Note: A version of this story appears ...,"Iran, Islamic Republic of",Editor’s Note:A version of this story appears ...


In [30]:
title = results.iloc[0].title
content = results.iloc[0].full_content
url = results.iloc[0].url
print(title, content, url, sep="\n")

Israel-Gaza: We answer your questions
https://www.bbc.co.uk/news/world-middle-east-67128533


**TODO**: Sort relevant articles by date, optimize performance

### Keyword Extraction

In [32]:
%%time
def extract_keywords(document, topn=5):
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform([document])
    feature_names = vectorizer.get_feature_names_out()
    word_scores = [(feature_names[i], tfidf_matrix[0, i]) for i in range(tfidf_matrix.shape[1])]
    word_scores_sorted = sorted(word_scores, key=lambda x: x[1], reverse=True)
    top_keywords = [word for word, score in word_scores_sorted[:topn]]
    return top_keywords

keywords = extract_keywords(content, 5)
keywords

Wall time: 18.5 ms


['israel', 'hamas', 'gaza', 'military', 'war']

## Text Summarization
**TODO**: Summarize blocks of 512 words

In [33]:
%%time
summarizer = pipeline("summarization", model="./text_summarization")
res = summarizer(content, max_length=1000, min_length=30, do_sample=False)
summary = res[0]['summary_text']
summary

Token indices sequence length is longer than the specified maximum sequence length for this model (2033 > 512). Running this sequence through the model will result in indexing errors


Wall time: 12.8 s


'We have received hundreds of your questions about the conflict, its impact and where it may end . Our correspondents, many currently in the region, have answered some of the most frequently asked questions . If Iran gets directly involved, would that prompt the US to join the war? And could this lead to a third world war?'

In [35]:
len(content), len(summary)

(8993, 323)

In [36]:
sentences = content.split(". ")
len(sentences)

65

In [39]:
%%time
out = [summarizer(". ".join(sentences[i*30:i*30+30]), max_length=500, min_length=30, do_sample=False)[0]['summary_text'] for i in range(2)]
"".join(out)

Wall time: 14.5 s


"We have received hundreds of your questions about the conflict, its impact and where it may end . Our correspondents, many currently in the region, have answered some of the most frequently asked questions below . Jeremy Bowen in Skelmersdale, UK asks: Would that prompt the US and its allies to join the war? The Americans just deployed two aircraft carrier battle groups to the eastern Mediterranean .Israel has been the junior partner in earlier versions of Israel's siege of Gaza since 2007 . The Brotherhood wants to reshape states and society in line with Islamic teaching and belief . Egypt's current regime has relations with Hamas, but it does not want an influx of Palestinian refugees . Palestinians say that is because Israeli strikes are excessive and indiscriminate ."