In [1]:
import pandas as pd
import numpy as np
import requests
import re
import time
import json
from bs4 import BeautifulSoup as bs

import warnings
warnings.filterwarnings("ignore") 

### Helper Functions for Data Scraping

For this data analysis of Canadian recalled items (2011-2023), we extract data from the [Recalls and safety alerts - Government of Canada](https://recalls-rappels.canada.ca/en/search/site) website using web scraping. The following functions collect the data:


- `retrieve_data_from_version1(page)`
- `retrieve_data_from_version2(page)`
- `retrieve_data(page)`
- `get_webpage_content(link)`
- `get_all_links(category)`
- `collect_data(start, end, all_links)`

The recall pages come in two different layouts, so the structure of their content is inconsistent. To scrape all the data, we need two functions: `retrieve_data_from_version1(page)` for the old layout and `retrieve_data_from_version2(page)` for the recent layout.

In [2]:
def retrieve_data_from_version1(page):
    """
    Extract the recall information from a webpage that uses the old layout (version 1).

    The date is read from the last <time> element, the item name from the <h1> heading
    and the recall type (lower-cased) from the div with class 'h3'. Every <dt>/<dd> pair
    with class 'paddingNone' is then added as an extra "label -> value" entry.

    :param [page] : content of the webpage for data extraction
    :type [page] : BeautifulSoup

    :return : dictionary with extracted data
    :rtype : dictionary

    """
    # fixed fields first, so they keep their position at the front of the dictionary
    record = {
        'Date': page.find_all('time')[-1].text,
        'Item': page.find('h1').text,
        'Recall type': page.find('div', {'class': 'h3'}).text.lower(),
    }

    # labels (<dt>) and their contents (<dd>) are matched by position on the page
    labels = page.find_all('dt', {'class': 'paddingNone'})
    contents = page.find_all('dd', {'class': 'paddingNone'})
    record.update((label.text, content.text) for label, content in zip(labels, contents))

    return record

In [3]:
def _find_details_fields(page):
    """Return (labels, items) of the 'Details' section, or two empty lists when the page has none."""
    container = page.find('div', {'class': 'ar-additional-info'})
    if container is None:
        return [], []

    for section in container.find_all('details'):
        title = section.find('summary')
        if title is not None and title.text == 'Details':
            labels = section.find_all('div', {'class': 'field--label'})
            items = section.find_all('div', {'class': 'field--item'})
            return labels, items

    return [], []


def _count_affected_products(page):
    """Return the number of affected items/models listed on the page (1 when nothing is listed)."""
    tables = page.find_all('table')
    if tables:
        # every <tr> of the first table is counted
        return len(tables[0].select('tr'))

    container = page.find('div', {'class': 'ar-affected-products'})
    if container is None or container.text == '\n':
        return 1

    # Look the item up by class with find(): select() takes a CSS selector, so an
    # attrs dictionary passed to it does not filter on the class.
    item = container.find('div', {'class': 'field--item'})
    if item is None:
        item = container
    return len(item.get_text().split(','))


def retrieve_data_from_version2(page):
    """
    Extract the recall information from a webpage that uses the recent layout (version 2).

    The returned dictionary always contains 'Date', 'Item', 'Brand', 'Product', 'Issue',
    'Nb_affected_models' and 'Recall type'. When the page has a 'Details' section inside
    its 'Additional Information' block, the labelled fields of that section are added too.
    Pages without a 'Details' section, or without an affected-products block, no longer
    raise: the detail fields are skipped and the number of affected models defaults to 1.

    :param [page] : content of the webpage for data extraction
    :type [page] : BeautifulSoup

    :return : dictionary with extracted data
    :rtype : dictionary

    """
    data = {}

    # 'Summary' section: the first item is stored as the product, the second as the issue
    summary_section = page.find('div', {'class': 'alert-info'}).find_all('div', 'field--item')

    # 'Details' section of the 'Additional Information' block (empty lists when missing)
    features, informations = _find_details_fields(page)

    # Retrieve brand from the website
    brand_section = page.find('details', {'class': 'ar-brand-details'})
    if brand_section is not None:
        brand = brand_section.find('div', 'field--item').text
    else:
        brand = np.nan

    # create dictionary
    data['Date'] = page.find_all('time')[-1].text
    data['Item'] = page.find('h1').text
    data['Brand'] = brand
    data['Product'] = summary_section[0].text
    data['Issue'] = summary_section[1].text
    data['Nb_affected_models'] = _count_affected_products(page)

    # all labelled fields except the last two, paired by position and stripped of whitespace
    for f, info in zip(features[:-2], informations[:-2]):
        data[f.text] = info.text.strip()

    category = page.find('div', {'class': 'h3'}).text.lower()
    data['Recall type'] = category

    # The trailing field is read at a different offset for food/vehicle recalls.
    # NOTE(review): presumably those layouts carry extra items in 'Details' - confirm.
    # The length checks keep short or missing sections from raising.
    if 'food' in category or 'vehicle' in category:
        if len(features) >= 3 and len(informations) >= 4:
            data[features[-3].text] = informations[-4].text
    elif len(features) >= 2 and len(informations) >= 2:
        data[features[-2].text] = informations[-2].text

    return data

In [4]:
def retrieve_data(page):
    """
    Extract the recall information from a webpage, whichever layout it uses.

    A page containing at least one div with class 'recall-alert-body' is handled as the
    old layout (version 1); any other page is handled as the recent layout (version 2).

    :param [page] : content of the webpage for data extraction
    :type [page] : BeautifulSoup

    :return : one-row dataframe with all the extracted data from the given webpage
    :rtype : pd.DataFrame

    """
    uses_old_layout = bool(page.find_all('div', {'class': 'recall-alert-body'}))
    extract = retrieve_data_from_version1 if uses_old_layout else retrieve_data_from_version2

    # wrap the single record in a list so it becomes one dataframe row
    return pd.DataFrame([extract(page)])

In [5]:
def get_webpage_content(link, timeout=30):
    """
    This function fetches the webpage of the given link.

    An HTTP error status (4xx/5xx) does not stop the function: the body of the error
    response is printed and that page is still parsed and returned. Other request
    failures (connection errors, timeouts, ...) are not caught here.

    :param [link] : url address of the webpage to fetch
    :type [link] : string

    :param [timeout] : maximum number of seconds to wait for the server before giving up
    :type [timeout] : float

    :return : content of the webpage
    :rtype : BeautifulSoup

    """

    try:
        # Without a timeout a single stalled connection would block a whole scraping batch.
        # NOTE(review): verify=False disables TLS certificate verification - confirm this
        # is really required for this site before reusing the function elsewhere.
        response = requests.get(link, verify=False, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        print(e.response.text)

    return bs(response.content, 'lxml')

### Get all the links and Save to JSON file

In [6]:
def get_all_links(category):
    """
    This function gets all the links of the https://recalls-rappels.canada.ca website for the given recall category

    The first results page is fetched to read the pager; the last pager item links to the
    last results page, whose index is zero-based (the first page is requested as page 0).
    Every results page from 0 up to and including that index is then scanned for links
    starting with '/en/alert-recall'. When the first page shows no pager, only that page
    is scanned.

    :param [category] : recall category ('vehicles', 'consumer', 'medical' or 'food')
    :type [category] : string

    :return : links
    :rtype : list of strings

    :raises KeyError : if the category is not one of the known categories
    :raises ValueError : if the page number cannot be read from the last pager link

    """

    id_category = {'vehicles': 443, 'consumer': 101, 'medical': 180, 'food': 144}
    category_id = id_category[category]
    url_base = 'https://recalls-rappels.canada.ca'
    all_sublinks = []

    def search_url(page):
        # address of one results page of the search for this category
        return f'https://recalls-rappels.canada.ca/en/search/site?f%5B0%5D=category%3A{category_id}&search_api_fulltext=&archived=0&page=0%2C{page}'

    # fetch the first results page
    first_page = get_webpage_content(search_url(0))

    # get the index of the last page from the pagination bar at the website bottom
    pagination = first_page.find_all('li', {'class': 'pager__item'})
    last_page_index = 0
    if pagination:
        last_page_anchor = pagination[-1].find('a')
        last_page_link = '' if last_page_anchor is None else str(last_page_anchor.get('href'))
        match = re.search(r'page=\d+%2C(\d+)', last_page_link, flags=re.IGNORECASE)
        if match is None:
            raise ValueError(f'Cannot read the last page number from pager link: {last_page_link!r}')
        last_page_index = int(match.group(1))

    # get all the links; "+ 1" because the last page index itself must be visited too
    for page in range(last_page_index + 1):
        # the first page is already downloaded, no need to request it a second time
        page_content = first_page if page == 0 else get_webpage_content(search_url(page))
        for link in page_content.find_all('a'):
            sublink = str(link.get('href'))
            if sublink.startswith('/en/alert-recall'):
                all_sublinks.append(url_base + sublink)

    print('Links all collected')
    return all_sublinks

```python

# get all the links
medical_urls = get_all_links('medical')
vehicles_urls = get_all_links('vehicles')
food_urls = get_all_links('food')
consumer_urls = get_all_links('consumer')


# save the links to one JSON file per category, named links_<category>.json
# (the food file was previously written as 'links_food.json.json', which does not
# match the 'links_food.json' name that is read back later in this notebook)
# NOTE(review): the files are written to the working directory while the loading step
# reads them from data/raw_data/ - confirm where they are expected to live.
links_by_category = {
    'medical': medical_urls,
    'vehicles': vehicles_urls,
    'food': food_urls,
    'consumer': consumer_urls,
}

for category, urls in links_by_category.items():
    with open(f'links_{category}.json', 'w') as file:
        json.dump(urls, file)
```

### Retrieve data 

```python
# read the saved links back, one JSON file per recall category
link_files = {
    'medical': 'data/raw_data/links_medical.json',
    'vehicles': 'data/raw_data/links_vehicles.json',
    'food': 'data/raw_data/links_food.json',
    'consumer': 'data/raw_data/links_consumer.json',
}

loaded_links = {}
for category, path in link_files.items():
    with open(path, 'r') as file:
        loaded_links[category] = json.load(file)

# expose each list under the name used by the rest of the notebook
medical_urls = loaded_links['medical']
vehicles_urls = loaded_links['vehicles']
food_urls = loaded_links['food']
consumer_urls = loaded_links['consumer']
```

In [9]:
def collect_data(start, end, all_links, delay=2):

    """
    This function collects data from the given list of links (read from JSON file) by batch.
    Due to the large size of data to collect, it would be helpful to collect data
    by "smaller" batch by setting the start/end of the urls list.

    Only the links in all_links[start:end] are fetched, one after the other, with a pause
    after each request. The one-row dataframes of the pages are gathered in a list and
    concatenated once at the end, instead of re-copying a growing dataframe on every page.

    :param [start] : index of the first link of the batch
    :type [start] : int

    :param [end] : index just after the last link of the batch (excluded, as in a slice)
    :type [end] : int

    :param [all_links] : list of links to fetch
    :type [all_links] : list

    :param [delay] : number of seconds to wait after each request
    :type [delay] : float

    :return : dataset with all data extracted from the given list of urls
              (an empty dataframe when the batch contains no link)
    :rtype : pd.DataFrame

    """

    page_frames = []

    for link in all_links[start: end]:

        content = get_webpage_content(link)
        page_frames.append(retrieve_data(content))
        time.sleep(delay)

    print('Pages successfully fetched')

    # pd.concat refuses an empty list, so an empty batch gets an empty dataframe as before
    if not page_frames:
        return pd.DataFrame()
    return pd.concat(page_frames, axis = 0)

**NOTE:**
- There is a total of 14,218 recalled items to collect from the website from 2011 to March 2023.
- For this data analysis project, we scraped the data in batches of 1,000 URLs at a time for each category.