## Import the libraries

In [1]:
import pandas as pd
import requests
from datetime import datetime
import time
import json

## Setup the API key and the parameters

In [10]:
api_key = '1e224815-7d97-4aeb-b4c9-bf2c8bca1c73'

In [11]:
api_url = 'https://content.guardianapis.com/search'

In [12]:
search_query = '"emmanuel macron"'
order = 'oldest'
page_size = 100
from_date = '2023-01-01'

In [13]:
# define the parameters
params = {
    'q': search_query,
    'from-date': from_date,
    'order-by': order,
    'page-size': page_size,
    'api-key': api_key
}

## Send a request

In [14]:
response = requests.get(api_url, params=params)

In [15]:
response

<Response [200]>

In [16]:
if response.status_code == 200:
  print("Request successful! Status code: 200")
  data = response.json()
else:
  print("Error:", response.status_code)

Request successful! Status code: 200


In [17]:
data

{'response': {'status': 'ok',
  'userTier': 'developer',
  'total': 1025,
  'startIndex': 1,
  'pageSize': 100,
  'currentPage': 1,
  'pages': 11,
  'orderBy': 'oldest',
  'results': [{'id': 'lifeandstyle/2023/jan/01/sport-tv-tech-and-fashion-what-does-2023-have-in-store-for-us',
    'type': 'article',
    'sectionId': 'lifeandstyle',
    'sectionName': 'Life and style',
    'webPublicationDate': '2023-01-01T12:00:46Z',
    'webTitle': 'Sport, TV, tech and fashion: what does 2023 have in store for us?',
    'webUrl': 'https://www.theguardian.com/lifeandstyle/2023/jan/01/sport-tv-tech-and-fashion-what-does-2023-have-in-store-for-us',
    'apiUrl': 'https://content.guardianapis.com/lifeandstyle/2023/jan/01/sport-tv-tech-and-fashion-what-does-2023-have-in-store-for-us',
    'isHosted': False,
    'pillarId': 'pillar/lifestyle',
    'pillarName': 'Lifestyle'},
   {'id': 'world/2023/jan/01/1-million-people-gather-on-champs-elysees-in-paris-to-see-in-new-year',
    'type': 'article',
    'se

In [18]:
# first result = first item
data['response']['results'][0]

{'id': 'lifeandstyle/2023/jan/01/sport-tv-tech-and-fashion-what-does-2023-have-in-store-for-us',
 'type': 'article',
 'sectionId': 'lifeandstyle',
 'sectionName': 'Life and style',
 'webPublicationDate': '2023-01-01T12:00:46Z',
 'webTitle': 'Sport, TV, tech and fashion: what does 2023 have in store for us?',
 'webUrl': 'https://www.theguardian.com/lifeandstyle/2023/jan/01/sport-tv-tech-and-fashion-what-does-2023-have-in-store-for-us',
 'apiUrl': 'https://content.guardianapis.com/lifeandstyle/2023/jan/01/sport-tv-tech-and-fashion-what-does-2023-have-in-store-for-us',
 'isHosted': False,
 'pillarId': 'pillar/lifestyle',
 'pillarName': 'Lifestyle'}

## Collect all data between certain dates

In [19]:
delay = 2 # seconds
dataset = []
number_of_results = page_size

In [20]:
while number_of_results == page_size:
  time.sleep(delay ** 2) # to prevent overwhelming the server

  # update the parameters
  params = {
    'q': search_query,
    'from-date': from_date,
    'order-by': order,
    'page-size': page_size,
    'api-key': api_key
  }

  # send a request and get a response
  response = requests.get(api_url, params=params)

  # add each response's data to existing data
  if response.status_code == 200:
    data = response.json()
    data_ = data['response']['results'] # type -> list
    dataset += data_
    print("Request successful! Status code: 200")
  else:
    print("Error: ", response.status_code)

  # extract date and update the from date for the next request
  from_date= datetime.fromisoformat(data['response']['results'][-1]['webPublicationDate'][:-1])
  from_date = from_date.strftime("%Y-%m-%d")

  # update the number of results to break the loop
  number_of_results = len(data['response']['results'])

Request successful! Status code: 200
Request successful! Status code: 200
Request successful! Status code: 200
Request successful! Status code: 200
Request successful! Status code: 200
Request successful! Status code: 200
Request successful! Status code: 200
Request successful! Status code: 200
Request successful! Status code: 200
Request successful! Status code: 200
Request successful! Status code: 200


In [21]:
print("Total number of results in the dataset:", len(dataset))

Total number of results in the dataset: 1042


## Create a dataframe based on the collected data

In [22]:
df = pd.DataFrame(dataset, columns=["type", "sectionName", "webPublicationDate", "webTitle", "webUrl"])

In [23]:
df.head()

Unnamed: 0,type,sectionName,webPublicationDate,webTitle,webUrl
0,article,Life and style,2023-01-01T12:00:46Z,"Sport, TV, tech and fashion: what does 2023 ha...",https://www.theguardian.com/lifeandstyle/2023/...
1,article,World news,2023-01-01T14:10:49Z,1m people gather on Champs-Élysées in Paris to...,https://www.theguardian.com/world/2023/jan/01/...
2,article,World news,2023-01-02T14:00:17Z,‘Not all traditions are good’: lethal accident...,https://www.theguardian.com/world/2023/jan/02/...
3,liveblog,World news,2023-01-03T19:00:10Z,Zelenskiy warns of Russian drone campaign as U...,https://www.theguardian.com/world/live/2023/ja...
4,article,World news,2023-01-04T16:18:01Z,Moscow blames its troops’ use of mobile phones...,https://www.theguardian.com/world/2023/jan/04/...


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1042 entries, 0 to 1041
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   type                1042 non-null   object
 1   sectionName         1042 non-null   object
 2   webPublicationDate  1042 non-null   object
 3   webTitle            1042 non-null   object
 4   webUrl              1042 non-null   object
dtypes: object(5)
memory usage: 40.8+ KB


#### Type distribution

In [25]:
df.type.value_counts()

type
article     781
liveblog    261
Name: count, dtype: int64

#### Section distribution

In [26]:
df.sectionName.value_counts()

sectionName
World news            654
Opinion               108
Politics               53
Australia news         40
Environment            33
UK news                31
US news                28
Business               19
Sport                  16
Technology              9
Books                   7
News                    6
Film                    6
Culture                 4
Football                4
Fashion                 3
Travel                  3
Society                 3
Television & radio      3
Global development      2
Media                   2
Music                   2
Games                   1
Life and style          1
Weather                 1
Food                    1
Money                   1
Stage                   1
Name: count, dtype: int64

### Convert the date into YYYY MM DD

In [27]:
def convert_date(value):
  date = datetime.fromisoformat(value[:-1])
  return date.strftime("%Y-%m-%d")

In [28]:
df['date'] = df.webPublicationDate.apply(convert_date)

In [29]:
df.head(100)

Unnamed: 0,type,sectionName,webPublicationDate,webTitle,webUrl,date
0,article,Life and style,2023-01-01T12:00:46Z,"Sport, TV, tech and fashion: what does 2023 ha...",https://www.theguardian.com/lifeandstyle/2023/...,2023-01-01
1,article,World news,2023-01-01T14:10:49Z,1m people gather on Champs-Élysées in Paris to...,https://www.theguardian.com/world/2023/jan/01/...,2023-01-01
2,article,World news,2023-01-02T14:00:17Z,‘Not all traditions are good’: lethal accident...,https://www.theguardian.com/world/2023/jan/02/...,2023-01-02
3,liveblog,World news,2023-01-03T19:00:10Z,Zelenskiy warns of Russian drone campaign as U...,https://www.theguardian.com/world/live/2023/ja...,2023-01-03
4,article,World news,2023-01-04T16:18:01Z,Moscow blames its troops’ use of mobile phones...,https://www.theguardian.com/world/2023/jan/04/...,2023-01-04
...,...,...,...,...,...,...
95,article,World news,2023-02-20T15:14:40Z,Russia-Ukraine war at a glance: what we know o...,https://www.theguardian.com/world/2023/feb/20/...,2023-02-20
96,liveblog,World news,2023-02-20T21:15:43Z,Joe Biden’s surprise visit to Kyiv ‘unpreceden...,https://www.theguardian.com/world/live/2023/fe...,2023-02-20
97,article,US news,2023-02-20T22:34:24Z,US informed Russia of Joe Biden’s Kyiv visit h...,https://www.theguardian.com/us-news/2023/feb/2...,2023-02-20
98,article,Opinion,2023-02-22T18:01:16Z,‘Just give Ukraine the planes’ is the battle c...,https://www.theguardian.com/commentisfree/2023...,2023-02-22


In [30]:
df.iloc[0]['webTitle']

'Sport, TV, tech and fashion: what does 2023 have in store for us?'