In [1]:
import pandas as pd
from newscatcherapi import NewsCatcherApiClient
from credentials import api_key


In [2]:
# API client used for every search below; the key is imported from the
# local `credentials` module rather than being written into the notebook.
newsapi = NewsCatcherApiClient(x_api_key=api_key)

In [5]:
# Domains passed to the API as the `sources` filter.
sources = [
    'apnews.com',
    'reuters.com',
    'bbc.com',
    'nytimes.com',
    'theguardian.com',
    'cnn.com',
    'washingtonpost.com',
    'wsj.com',
    'foxnews.com',
    'breitbart.com',
    'newsmax.com',
    'rt.com',
    'sputniknews.com',
    'aljazeera.com',
    'indiatimes.com',
    'chinadaily.com.cn',
]

# Boolean search expression, sent to the API unchanged.
query = " (climate && change) || (climate && crisis) "

# All search settings in one place so they are easy to review and tweak.
# NOTE(review): `seconds_pause` presumably throttles successive requests and
# `by` presumably controls how the date range is split into windows --
# confirm against the newscatcherapi client documentation.
search_params = dict(
    q=query,
    sort_by='date',
    lang='en',
    by='month',
    sources=sources,
    from_='2020/05/28',
    to_='2023/05/27',
    page_size=100,
    max_page=100,
    seconds_pause=1,
)

news = newsapi.get_search_all_articles(**search_params)

05/28/2020 00:00:00 --> 05/27/2023 00:00:00
1 page is going to be extracted
Total number of found articles => 891.
Total number of pages 9.
2/9 page is going to be extracted
3/9 page is going to be extracted
4/9 page is going to be extracted
5/9 page is going to be extracted
6/9 page is going to be extracted
7/9 page is going to be extracted
8/9 page is going to be extracted
9/9 page is going to be extracted
06/25/2020 00:00:00 --> 05/27/2023 00:00:00
1 page is going to be extracted
Total number of found articles => 1056.
Total number of pages 11.
2/11 page is going to be extracted
3/11 page is going to be extracted
4/11 page is going to be extracted
5/11 page is going to be extracted
6/11 page is going to be extracted
7/11 page is going to be extracted
8/11 page is going to be extracted
9/11 page is going to be extracted
10/11 page is going to be extracted
11/11 page is going to be extracted
07/23/2020 00:00:00 --> 05/27/2023 00:00:00
1 page is going to be extracted
Total number of fo

In [137]:
# Number of articles returned by the search.
len(news["articles"])

121084

In [138]:
# Load the list stored under news['articles'] into a DataFrame and show its
# dimensions as (rows, columns).
df_news = pd.DataFrame(data=news["articles"])
df_news.shape

(121084, 19)

In [139]:
# Parse `published_date` into datetimes, then order the rows chronologically.
df_news = (
    df_news
    .assign(published_date=pd.to_datetime(df_news["published_date"]))
    .sort_values(by="published_date")
)

In [140]:
# Data cleaning
# Removing duplicates, irrelevant articles
# and rows without summaries.
#
# Written as a single method chain (no `inplace=True`) so the cell does not
# mutate the frame produced by earlier cells.

# Articles whose `_score` is not strictly above this value are treated as
# irrelevant and dropped.
MIN_RELEVANCE_SCORE = 8

df_news = (
    df_news
    # Keep only the first row for each distinct summary text.
    .drop_duplicates(subset=['summary'])
    # `@` lets the query expression reference the Python constant above.
    .query("_score > @MIN_RELEVANCE_SCORE")
    # Discard rows that have no summary at all.
    .dropna(subset=['summary'])
)

df_news.shape

(63397, 19)

In [141]:
# Persist the cleaned articles as comma-separated UTF-8 text without the index.
# No `compression` argument is given, so the on-disk format is whatever pandas
# derives from the file name.
# NOTE(review): confirm the `.tar.bz2` extension yields the archive layout that
# downstream readers expect for the installed pandas version.
df_news.to_csv(
    '../data/climate_news_3y.tar.bz2',
    sep=',',
    index=False,
    encoding='utf-8',
)

In [151]:
# Spot-check: ten random titles with their scores, lowest score first.
# A fixed `random_state` makes the preview reproducible across re-runs.
(
    df_news[['title', '_score']]
    .sample(10, random_state=42)
    .sort_values('_score')
)

Unnamed: 0,title,_score
58979,"Trees Can Help Cool Down Cities By Up To 12 Degree Celsius, Study Claims",8.218219
45201,India asks international community to unite against terrorism as seriously as it does on issues like climate change and pandemics,8.999563
73789,India's heatwave exposes divide in access to cooling equipment,9.73135
36058,Arctic's ‘Last Ice Area' May Be Less Resistant to Global Warming,10.237351
64465,Budget 2022: Experts hail proposal to issue green bonds for renewable energy,10.675702
12744,Norwegian Billionaire Predicts 'Historic Wave' of Bankruptcies in 2021,13.012243
33362,Call for G7 Cornwall summit to forge global plastic pollution treaty,13.143391
13584,Transcript: The Future Reset: Powering Equitable Opportunity,16.412008
116233,New India-born World Bank Chief: Real Change or Rebranding?,22.222723
24748,Ignore the rhetoric: the UK government still fails to grasp the climate crisis,22.941818


In [147]:
# Distinct source domains that remain after cleaning.
df_news['clean_url'].unique()

array(['indiatimes.com', 'theguardian.com', 'nytimes.com', 'cnn.com',
       'bbc.com', 'wsj.com', 'apnews.com', 'reuters.com',
       'washingtonpost.com', 'breitbart.com', 'chinadaily.com.cn',
       'aljazeera.com', 'sputniknews.com', 'foxnews.com', 'rt.com',
       'newsmax.com'], dtype=object)

In [152]:
# Print the frame summary: index range, columns, non-null counts and dtypes.
df_news.info()

<class 'pandas.core.frame.DataFrame'>
Index: 63397 entries, 888 to 120855
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   title                     63397 non-null  object        
 1   author                    53033 non-null  object        
 2   published_date            63397 non-null  datetime64[ns]
 3   published_date_precision  63353 non-null  object        
 4   link                      63397 non-null  object        
 5   clean_url                 63397 non-null  object        
 6   excerpt                   62309 non-null  object        
 7   summary                   63397 non-null  object        
 8   rights                    62650 non-null  object        
 9   rank                      63397 non-null  int64         
 10  topic                     63397 non-null  object        
 11  country                   63397 non-null  object        
 12  language            