# Creating a dataframe of news articles

Step 1. Collecting URLs

*Importing libraries*

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import random
import time

*Function to collect article URLs from search results*

In [2]:
# Scrape URLs by any search term
def scrape_urls_word(keywords, num_pages, timeout=5):
    """Collect article URLs from the Times of Malta search results.

    For every keyword, result pages 1..num_pages are requested and the
    JSON-LD listing embedded in each page (script tag with id 'listing-ld')
    is parsed for NewsArticle entries.

    Parameters
    ----------
    keywords : iterable of str, or a single str
        Search terms. A bare string is treated as one keyword rather than
        being iterated character by character.
    num_pages : int
        Number of result pages to request per keyword, starting at page 1.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    list of tuple
        (url, keyword) pairs in the order they were found. Pages that cannot
        be downloaded or parsed are reported with print() and skipped, so
        URLs gathered from other pages are still returned.
    """
    base_url = 'https://timesofmalta.com'
    urls = []

    if isinstance(keywords, str):
        keywords = [keywords]

    for keyword in keywords:
        for page in range(1, num_pages + 1):
            # Let requests build the query string so that keywords containing
            # spaces, '&', '#', etc. are percent-encoded correctly.
            query = {
                'keywords': keyword,
                'author': 0,
                'tags': 0,
                'sort': 'date',
                'order': 'desc',
                'fields[0]': 'title',
                'fields[1]': 'body',
                'page': page,
            }

            try:
                response = requests.get(f'{base_url}/search', params=query, timeout=timeout)
                response.raise_for_status()
            except requests.exceptions.RequestException as err:
                print(f"An error occurred: {err}")
                continue

            soup = BeautifulSoup(response.content, 'html.parser')

            # Find the script tag with id 'listing-ld'
            script_tag = soup.find('script', {'id': 'listing-ld'})

            # .string is None when the tag is empty or has several children;
            # json.loads(None) would raise TypeError.
            if script_tag is None or not script_tag.string:
                continue

            try:
                data = json.loads(script_tag.string)
            except json.JSONDecodeError as err:
                print(f"An error occurred: {err}")
                continue

            urls.extend((url, keyword) for url in _article_urls_from_listing(data))

    return urls


def _article_urls_from_listing(data):
    """Return the 'url' of each NewsArticle item in a parsed listing payload."""
    if not isinstance(data, dict):
        return []

    graph = data.get('@graph')
    if not isinstance(graph, list):
        return []

    return [
        item['url']
        for item in graph
        if isinstance(item, dict) and item.get('@type') == 'NewsArticle' and 'url' in item
    ]

*Applying the function to two keywords*

In [5]:
# Search terms to query, and how many result pages to fetch for each one.
keywords = ['murder', 'national']
num_pages = 1

# Gather (url, keyword) pairs, then tabulate them with one row per article link.
urls = scrape_urls_word(keywords=keywords, num_pages=num_pages)
df = pd.DataFrame(data=urls, columns=['URL', 'Keyword'])

In [6]:
# Preview the first rows of the URL/keyword table built above.
df.head()

Unnamed: 0,URL,Keyword
0,https://timesofmalta.com/articles/view/libel-j...,murder
1,https://timesofmalta.com/articles/view/daughte...,murder
2,https://timesofmalta.com/articles/view/executi...,murder
3,https://timesofmalta.com/articles/view/two-acc...,murder
4,https://timesofmalta.com/articles/view/robert-...,murder


In [8]:
# (rows, columns) of the URL/keyword table.
df.shape

(40, 2)

Step 2. Scraping each URL for article details such as title, date, and body text

*Function to scrape the content of a URL*

In [9]:
def scrape_webpage(url):
    """Download one article page and return its JSON-LD metadata as a DataFrame.

    The page is expected to embed a script tag with id 'article-ld' whose
    JSON content has an '@graph' list; each entry of that list becomes one row.

    Parameters
    ----------
    url : str
        Address of the article page.

    Returns
    -------
    pandas.DataFrame or None
        One row per '@graph' entry with the columns Title, Description, URL,
        Body, Tags, Author, Date Published, Date Modified, Publisher and
        Images. None is returned when the request fails, the script tag is
        missing or empty, the JSON is invalid, or an entry lacks an expected
        field or has it in an unexpected shape. Request, JSON and field
        errors are reported with print().
    """
    try:
        r = requests.get(url, timeout=5)
        r.raise_for_status()
    except requests.exceptions.RequestException as err:
        # HTTPError is a subclass of RequestException, so one clause covers both.
        print(f"An error occurred: {err}")
        return None
    finally:
        # Sleep to avoid rate limits. Done after every request attempt,
        # including failed ones, so a run of errors does not turn into a
        # burst of back-to-back requests.
        time.sleep(random.uniform(1, 3))

    html_content = r.content
    soup = BeautifulSoup(html_content, 'lxml')
    script = soup.find('script', {'id': 'article-ld'})

    # .string is None for an empty tag; json.loads(None) would raise TypeError.
    if script is None or script.string is None:
        return None

    try:
        data = json.loads(script.string)
    except json.JSONDecodeError as err:
        print(f"An error occurred: {err}")
        return None

    # Extract data from JSON content, walking the graph once.
    try:
        records = [_article_record(article) for article in data['@graph']]
    except (KeyError, IndexError, TypeError, AttributeError) as err:
        # Missing keys, an empty author list, or fields of an unexpected type
        # all mean the page does not match the expected layout.
        print(f"An error occurred: {err}")
        return None

    return pd.DataFrame(records, columns=[
        'Title', 'Description', 'URL', 'Body', 'Tags', 'Author',
        'Date Published', 'Date Modified', 'Publisher', 'Images',
    ])


def _article_record(article):
    """Flatten one '@graph' entry into a row dict; raises if a field is missing or mis-shaped."""
    return {
        'Title': article['name'],
        'Description': article['description'],
        'URL': article['url'],
        'Body': article['articleBody'],
        'Tags': article['keywords'].split(','),
        'Author': article['author'][0]['name'],
        'Date Published': article['datePublished'],
        'Date Modified': article['dateModified'],
        'Publisher': article['publisher']['@id'],
        'Images': [img['url'] for img in article['image']],
    }


In [10]:
# Scrape every collected URL, keeping only the pages that parsed successfully.
article_columns = ['Title', 'Description', 'URL', 'Body', 'Tags', 'Author', 'Date Published', 'Date Modified', 'Publisher', 'Images']

scraped_frames = []
for url in df['URL']:
    scraped_data = scrape_webpage(url)
    if scraped_data is not None:
        scraped_frames.append(scraped_data)

# Concatenate once at the end: growing a DataFrame with pd.concat inside the
# loop re-copies all earlier rows on every iteration.
if scraped_frames:
    new_df = pd.concat(scraped_frames, ignore_index=True)
else:
    # Nothing could be scraped; still expose an empty frame with the expected columns.
    new_df = pd.DataFrame(columns=article_columns)


In [11]:
# Preview the first rows of the scraped article table.
new_df.head()

Unnamed: 0,Title,Description,URL,Body,Tags,Author,Date Published,Date Modified,Publisher,Images
0,Libel judgments against Caruana Galizia heirs ...,Murdered journalist’s husband and sons could n...,https://timesofmalta.com/articles/view/libel-j...,Libel judgments that have ruled against Daphne...,"[National, Daphne Caruana Galizia, Court]",Times of Malta,2023-07-14T14:30:00+02:00,2023-07-14T15:33:16+02:00,https://timesofmalta.com#organization,[https://cdn-attachments.timesofmalta.com/48aa...
1,Daughter of murdered baron claims delays in ca...,Baron Francis Sant Cassia was murdered 35 year...,https://timesofmalta.com/articles/view/daughte...,The daughter of the late Baron Francis Sant Ca...,"[National, Court, Murder, Justice]",Edwina Brincat,2023-07-14T11:35:00+02:00,2023-07-14T11:48:48+02:00,https://timesofmalta.com#organization,[https://cdn-attachments.timesofmalta.com/af78...
2,Execution of Ohio man delayed until 2027,LaMar was convicted of killing five fellow inm...,https://timesofmalta.com/articles/view/executi...,A US state governor on Thursday postponed the ...,"[World, USA, Justice, Music]",AFP,2023-07-14T08:01:00+02:00,2023-07-14T12:36:02+02:00,https://timesofmalta.com#organization,[https://cdn-attachments.timesofmalta.com/3e28...
3,Two accused of attempted murder after Marsasca...,"Two Italian men were injured, one seriously, i...",https://timesofmalta.com/articles/view/two-acc...,Two men were remanded in custody on Thursday a...,"[National, Court, Crime, Marsascala]",Edwina Brincat,2023-07-13T18:17:00+02:00,2023-07-13T18:37:51+02:00,https://timesofmalta.com#organization,[https://cdn-attachments.timesofmalta.com/24d0...
4,Robert Abela vows to publish magisterial inqui...,Victim's mother was among those reacting to a ...,https://timesofmalta.com/articles/view/robert-...,Robert Abela on Thursday pledged to publish th...,"[National, Construction, Robert Abela, Acciden...",Sarah Carabott,2023-07-13T15:07:00+02:00,2023-07-13T16:07:07+02:00,https://timesofmalta.com#organization,[https://cdn-attachments.timesofmalta.com/e036...
