# Digital Shift: The Evolution of Products and Platforms in Portuguese E-commerce



### Data Processor:



In [4]:
# Domains of the e-commerce sites covered by this notebook.
sites = ["www.fnac.pt", "www.worten.pt", "www.elcorteingles.pt", "www.radiopopular.pt", "www.staples.pt", "www.pcdiga.com"]

In [1]:
import requests
import urllib.parse
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import numpy as np
import re

## Extracting product categories from the websites (2007 | 2010 | 2015 | 2020 | 2023)

We use the requests library to download the HTML of each archived page and the BeautifulSoup library to parse it and extract the product categories.
Because the page structure differs from site to site and changes over the years, each site/year combination needs its own extraction script.

In [3]:
# Load the table of archived links: one row per year, one column per site.
df = pd.read_csv("data/sites_links.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,www.fnac.pt,www.worten.pt,www.elcorteingles.pt,www.radiopopular.pt,www.staples.pt,www.pcdiga.com
0,2005,,,https://arquivo.pt/wayback/20050725031922/http...,https://arquivo.pt/wayback/20050722223614/http...,https://arquivo.pt/wayback/20050722174753/http...,https://arquivo.pt/wayback/20050719124815/http...
1,2006,https://arquivo.pt/wayback/20061118120805/http...,,https://arquivo.pt/wayback/20060216170739/http...,,,
2,2007,https://arquivo.pt/wayback/20070928223117/http...,https://arquivo.pt/wayback/20070611190104/http...,https://arquivo.pt/wayback/20070929080902/http...,https://arquivo.pt/wayback/20070929122436/http...,https://arquivo.pt/wayback/20070610185333/http...,
3,2008,https://arquivo.pt/wayback/20081027081756/http...,https://arquivo.pt/wayback/20081022044251/http...,https://arquivo.pt/wayback/20081021184312/http...,https://arquivo.pt/wayback/20081022013802/http...,https://arquivo.pt/wayback/20081022031557/http...,https://arquivo.pt/wayback/20081022130111/http...
4,2009,https://arquivo.pt/wayback/20091218064527/http...,https://arquivo.pt/wayback/20091218174523/http...,https://arquivo.pt/wayback/20091218054927/http...,https://arquivo.pt/wayback/20091218134419/http...,https://arquivo.pt/wayback/20091218154357/http...,https://arquivo.pt/wayback/20091219171727/http...


In [4]:
# Column dtypes and non-null counts (how many yearly links each site has).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            19 non-null     int64 
 1   www.fnac.pt           16 non-null     object
 2   www.worten.pt         14 non-null     object
 3   www.elcorteingles.pt  16 non-null     object
 4   www.radiopopular.pt   17 non-null     object
 5   www.staples.pt        18 non-null     object
 6   www.pcdiga.com        16 non-null     object
dtypes: int64(1), object(6)
memory usage: 1.2+ KB


In [5]:
# Number of missing links per site, followed by the full missing-value mask.
print(df.isna().sum())
df.isna()

Unnamed: 0              0
www.fnac.pt             3
www.worten.pt           5
www.elcorteingles.pt    3
www.radiopopular.pt     2
www.staples.pt          1
www.pcdiga.com          3
dtype: int64


Unnamed: 0.1,Unnamed: 0,www.fnac.pt,www.worten.pt,www.elcorteingles.pt,www.radiopopular.pt,www.staples.pt,www.pcdiga.com
0,False,True,True,False,False,False,False
1,False,False,True,False,True,True,True
2,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False
7,False,False,False,False,False,False,True
8,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False


In [6]:
# Column names: the first column holds the year, the others are the site domains.
df.columns

Index(['Unnamed: 0', 'www.fnac.pt', 'www.worten.pt', 'www.elcorteingles.pt',
       'www.radiopopular.pt', 'www.staples.pt', 'www.pcdiga.com'],
      dtype='object')

In [None]:
# Column layout of the results file filled in by the extraction cells.
header = ['ano', 'link', 'site', 'numero_categorias', 'lista_categorias', 'dicionario_subcategorias']
# Write a header-only CSV (the default write mode replaces any existing file);
# the extraction cells then append their rows to it with header=False.
header_df = pd.DataFrame(columns=header)
header_df.to_csv("data/ecommerce_category_analysis_all.csv", index=False)


Worten Extracted Categories:

In [None]:
def extractor_2007_worten(url, timeout=60):
    """Extract the top-level categories from the 2007 Worten page layout.

    Args:
        url: Address of the archived page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(count, categories)`` tuple. ``categories`` maps every category
        name to an empty list, because this layout has no subcategories.
        ``(0, {})`` is returned when the page cannot be fetched or parsed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        categories_container = soup.find('ul', class_='menu')

        categories_dict = {}
        for category in categories_container.find_all('li'):
            anchor = category.find('a')
            if anchor is None:
                # An entry without a link carries no category name; skip it
                # instead of aborting the whole extraction.
                continue
            # This layout has no subcategories.
            categories_dict[anchor.get_text(strip=True)] = []

        return len(categories_dict), categories_dict
    except Exception as e:
        # Best effort: report the failure and signal it with an empty result.
        print(f"Error while extracting categories: {e}")
        return 0, {}
    
# Collect the 2007 Worten categories and append them to the results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

site_column = 'www.worten.pt'
# Links already present in the results file (stored in their rewritten form).
saved_links = set(category_analysis['link'].dropna())

if site_column in data.columns[1:]:
    for index, row in data.iterrows():
        year = row['Unnamed: 0']
        link = row[site_column]
        if year != 2007 or pd.isna(link):
            continue

        # Insert the 'mp_' modifier right after the archive timestamp and point
        # at the page used by the old site (applied to captures up to 2012).
        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        link = link + 'Splash.aspx'

        # The CSV stores the rewritten link, so the duplicate check has to
        # compare against it; the raw link would never match.
        if link in saved_links:
            continue

        num_categories, category_dict = extractor_2007_worten(link)
        # Like the other Worten cells, do not store failed extractions.
        if num_categories > 0:
            category_data.append({
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            })

# Append the collected rows to the results CSV.
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [None]:
def get_categories_2010_worten(url, timeout=60):
    """Extract categories and subcategories from the 2010 Worten page layout.

    Args:
        url: Address of the archived page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(count, categories)`` tuple where ``categories`` maps each main
        category name to the list of its subcategory names. ``(0, {})`` is
        returned when the page cannot be fetched or parsed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        categories_container = soup.find('ul', class_='menu menu-category')

        categories_dict = {}
        for category in categories_container.find_all('li', class_='sub'):
            main_category_name = category.find('a', class_='label').get_text(strip=True)
            subcategories = []

            subcategories_container = category.find('ul')
            if subcategories_container:
                for subcategory in subcategories_container.find_all('li'):
                    label = subcategory.find('a', class_='label')
                    # Items without a labelled link are not subcategories;
                    # skip them rather than failing the whole page.
                    if label is not None:
                        subcategories.append(label.get_text(strip=True))

            categories_dict[main_category_name] = subcategories

        return len(categories_dict), categories_dict
    except Exception as e:
        # Best effort: report the failure and signal it with an empty result.
        print(f"Error while extracting categories: {e}")
        return 0, {}
        
# Collect the 2010 Worten categories and append them to the results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

site_column = 'www.worten.pt'
# Links already present in the results file (stored in their rewritten form).
saved_links = set(category_analysis['link'].dropna())

if site_column in data.columns[1:]:
    for index, row in data.iterrows():
        year = row['Unnamed: 0']
        link = row[site_column]
        if year != 2010 or pd.isna(link):
            continue

        # Insert the 'mp_' modifier right after the archive timestamp and
        # request the site's default page.
        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        link = link + 'default.aspx'

        # The CSV stores the rewritten link, so the duplicate check has to
        # compare against it; the raw link would never match.
        if link in saved_links:
            continue

        num_categories, category_dict = get_categories_2010_worten(link)
        if num_categories > 0:
            category_data.append({
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            })

# Append the collected rows to the results CSV.
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [None]:
def get_categories_2015_worten(url, timeout=60):
    """Extract categories and subcategories from the 2015 Worten page layout.

    Args:
        url: Address of the archived page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(count, categories)`` tuple where ``categories`` maps each main
        category name to its subcategory names. Entries without any
        subcategory are dropped. ``(0, {})`` is returned when the page cannot
        be fetched or parsed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        categories_container = soup.find('ul', id='nav')

        categories_dict = {}
        for category in categories_container.find_all('li', class_='level1'):
            main_category_name = category.find('a').find('span').get_text(strip=True)
            subcategories = []

            subcategories_container = category.find('ul')
            if subcategories_container:
                for subcategory in subcategories_container.find_all('li', class_='level2'):
                    subcategories.append(subcategory.find('span').get_text(strip=True))
            categories_dict[main_category_name] = subcategories

        # Top-level entries with no subcategories are discarded.
        categories_dict = {name: subs for name, subs in categories_dict.items() if subs}

        return len(categories_dict), categories_dict
    except Exception as e:
        # Best effort: report the failure and signal it with an empty result.
        print(f"Error while extracting categories: {e}")
        return 0, {}
        
# Collect the 2015 Worten categories and append them to the results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

site_column = 'www.worten.pt'
# Years already stored for this site. The check must look at the (site, year)
# pair: testing year and site independently skips Worten 2015 as soon as any
# Worten row and any 2015 row exist, even if they belong to different entries.
saved_years = category_analysis[category_analysis['site'] == site_column]['ano'].values

if site_column in data.columns[1:]:
    for index, row in data.iterrows():
        year = row['Unnamed: 0']
        link = row[site_column]
        if year != 2015 or pd.isna(link) or year in saved_years:
            continue

        # Insert the 'mp_' modifier right after the archive timestamp.
        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')

        num_categories, category_dict = get_categories_2015_worten(link)
        if num_categories > 0:
            category_data.append({
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            })

# Append the collected rows to the results CSV.
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)




In [None]:
def get_categories_worten_2020_23(url, timeout=60):
    """Extract categories and subcategories from the 2020/2023 Worten layout.

    Args:
        url: Address of the archived page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(count, categories)`` tuple where ``categories`` maps each main
        category name to its lower-cased subcategory names. ``(0, {})`` is
        returned when the page cannot be fetched or parsed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        categories_container = soup.find('ul', class_='nav-sub js-nav-sub')

        categories_dict = {}
        for category in categories_container.find_all('li', class_='nav-item nav-item-sub'):
            main_category_name = category.find('span', class_='nav-a').get_text(strip=True)
            subcategories = []

            subcategories_container = category.find('ul', class_='nav nav-sub nav-sub-child js-nav-sub')
            # A category without a child menu simply has no subcategories;
            # it must not abort the extraction of the remaining ones.
            if subcategories_container is not None:
                for sub in subcategories_container.find_all('li', class_='nav-item-sub'):
                    label = sub.find('label', class_='nav-trigger js-nav-trigger')
                    if label:
                        subcategory_name = label.find('a', class_='nav-a')
                        if subcategory_name:
                            subcategories.append(subcategory_name.get_text(strip=True).lower())

            categories_dict[main_category_name] = subcategories

        return len(categories_dict), categories_dict
    except Exception as e:
        # Best effort: report the failure and signal it with an empty result.
        print(f"Error while extracting categories: {e}")
        return 0, {}

        
# Collect the 2020 and 2023 Worten categories and append them to the results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

site_column = 'www.worten.pt'
# Years already stored for this site. Restricting the lookup to this site
# matters: a 2020/2023 row saved for another site must not block Worten.
saved_years = category_analysis[category_analysis['site'] == site_column]['ano'].values

if site_column in data.columns[1:]:
    for index, row in data.iterrows():
        year = row['Unnamed: 0']
        link = row[site_column]
        if year not in (2020, 2023) or pd.isna(link) or year in saved_years:
            continue

        # Insert the 'mp_' modifier right after the archive timestamp.
        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')

        num_categories, category_dict = get_categories_worten_2020_23(link)
        if num_categories > 0:
            category_data.append({
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            })

# Append the collected rows to the results CSV.
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)




Staples Extracted Categories:

In [None]:
def get_categories_staples_2007_10(url):
    """Extract categories and subcategories from the 2007/2010 Staples layout.

    The category block is a flat list of rows: a row whose link has the
    'tit_centro_blue_bold' class starts a new main category, and every other
    row is a subcategory of the most recent main category.

    Returns:
        A ``(count, categories)`` tuple mapping main category names to their
        subcategory names, or ``(0, {})`` (after printing the error) when the
        page cannot be fetched or parsed.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'html.parser')

        def has_categorias_id(value):
            # Matches any id attribute containing the word "categorias".
            return value and 'categorias' in value.lower()

        container = page.find('span', id=has_categorias_id)

        result = {}
        active_main = None
        for block in container.find_all('div', class_='tracos_centro'):
            anchor = block.find('a')
            if anchor:
                label = anchor.get_text(strip=True)
                is_main = 'tit_centro_blue_bold' in anchor.get('class', [])
                if is_main:
                    active_main = label
                    result[active_main] = []
                elif active_main:
                    # Rows seen before the first main category are ignored.
                    result[active_main].append(label)

        return len(result), result
    except Exception as error:
        print(f"Error while extracting categories: {error}")
        return 0, {}


# Gather the 2007 and 2010 Staples categories and append them to the results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

staples_site = 'www.staples.pt'
# Years already stored for this site; those are not extracted again.
stored_years = category_analysis[category_analysis['site'] == staples_site]['ano'].values

if staples_site in data.columns[1:]:
    for _, record in data.iterrows():
        year = record['Unnamed: 0']
        archived_link = record[staples_site]
        if year not in (2007, 2010) or pd.isna(archived_link) or year in stored_years:
            continue

        # Insert 'mp_' right after the archive timestamp and request the default page.
        link = archived_link.replace(f'/http://{staples_site}/', f'mp_/http://{staples_site}/')
        link += 'default.aspx'

        num_categories, category_dict = get_categories_staples_2007_10(link)
        category_data.append({
            'ano': year,
            'link': link,
            'site': staples_site,
            'numero_categorias': num_categories,
            'lista_categorias': list(category_dict.keys()),
            'dicionario_subcategorias': category_dict
        })

# Append the collected rows to the results CSV.
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [None]:
def get_categories_staples_15(url):
    """Extract the main categories from the 2015 Staples layout.

    Returns:
        A ``(count, categories)`` tuple where every category name maps to an
        empty list (no subcategories are read from this layout), or
        ``(0, {})`` (after printing the error) when the page cannot be
        fetched or parsed.
    """
    try:
        page = requests.get(url)
        page.raise_for_status()

        # The main navigation bar holds one 'navItem' span per category.
        nav = BeautifulSoup(page.text, 'html.parser').find('div', class_='primaryNav')
        names = [
            item.find('a', class_='navLink').get_text(strip=True)
            for item in nav.find_all('span', class_='navItem')
        ]
        categories = {name: [] for name in names}

        return len(categories), categories
    except Exception as err:
        print(f"Error while extracting categories: {err}")
        return 0, {}

# Gather the 2015 Staples categories and append them to the results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

staples_site = 'www.staples.pt'
# Years already stored for this site; those are not extracted again.
stored_years = category_analysis[category_analysis['site'] == staples_site]['ano'].values

if staples_site in data.columns[1:]:
    for _, record in data.iterrows():
        year = record['Unnamed: 0']
        archived_link = record[staples_site]
        if year != 2015 or pd.isna(archived_link) or year in stored_years:
            continue

        # Insert 'mp_' right after the archive timestamp.
        link = archived_link.replace(f'/http://{staples_site}/', f'mp_/http://{staples_site}/')

        num_categories, category_dict = get_categories_staples_15(link)
        category_data.append({
            'ano': year,
            'link': link,
            'site': staples_site,
            'numero_categorias': num_categories,
            'lista_categorias': list(category_dict.keys()),
            'dicionario_subcategorias': category_dict
        })

# Append the collected rows to the results CSV.
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [None]:
def get_categories_staples_20(url):
    """Extract the main categories from the 2020 Staples layout.

    Returns:
        A ``(count, categories)`` tuple where every category name maps to an
        empty list (no subcategories are read from this layout), or
        ``(0, {})`` (after printing the error) when the page cannot be
        fetched or parsed.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        document = BeautifulSoup(response.text, 'html.parser')

        # Each 'navItem' span of the main navigation bar is one category.
        navigation = document.find('div', class_='primaryNav')
        found = {}
        for nav_item in navigation.find_all('span', class_='navItem'):
            nav_link = nav_item.find('a', class_='navLink')
            found[nav_link.get_text(strip=True)] = []

        return len(found), found
    except Exception as problem:
        print(f"Error while extracting categories: {problem}")
        return 0, {}

# Gather the 2020 Staples categories and append them to the results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

staples_site = 'www.staples.pt'
# Years already stored for this site; those are not extracted again.
stored_years = category_analysis[category_analysis['site'] == staples_site]['ano'].values

if staples_site in data.columns[1:]:
    for _, record in data.iterrows():
        year = record['Unnamed: 0']
        archived_link = record[staples_site]
        if year != 2020 or pd.isna(archived_link) or year in stored_years:
            continue

        # Insert 'mp_' right after the archive timestamp.
        link = archived_link.replace(f'/http://{staples_site}/', f'mp_/http://{staples_site}/')

        num_categories, category_dict = get_categories_staples_20(link)
        category_data.append({
            'ano': year,
            'link': link,
            'site': staples_site,
            'numero_categorias': num_categories,
            'lista_categorias': list(category_dict.keys()),
            'dicionario_subcategorias': category_dict
        })

# Append the collected rows to the results CSV.
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [None]:
def get_categories_staples_23(url, timeout=60):
    """Extract categories and subcategories from the 2023 Staples layout.

    The menu is a flat list of 'children-cont' blocks: a block containing a
    'pr-name' span starts a new main category, and any other block with a
    'menu-link_a' link is a subcategory of the most recent main category.

    Args:
        url: Address of the archived page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(count, categories)`` tuple mapping main category names to their
        subcategory names, or ``(0, {})`` (after printing the error) when the
        page cannot be fetched or parsed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        # Container that holds the whole category menu.
        categories_container = soup.find('div', class_='container-menu-children')
        categories_dict = {}

        current_main_category = None
        for div in categories_container.find_all('div', class_='children-cont'):
            main_category_tag = div.find('span', class_='pr-name')
            if main_category_tag:
                current_main_category = main_category_tag.get_text(strip=True)
                categories_dict[current_main_category] = []
            elif current_main_category:
                subcategory_link = div.find('a', class_='menu-link_a')
                if subcategory_link:
                    subcategory_name = subcategory_link.get_text(strip=True)
                    categories_dict[current_main_category].append(subcategory_name)

        return len(categories_dict), categories_dict
    except Exception as e:
        print(f"Error while extracting categories: {e}")
        return 0, {}

# Gather the 2023 Staples categories and append them to the results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

staples_site = 'www.staples.pt'
# Years already stored for this site; those are not extracted again.
stored_years = category_analysis[category_analysis['site'] == staples_site]['ano'].values

if staples_site in data.columns[1:]:
    for _, record in data.iterrows():
        year = record['Unnamed: 0']
        archived_link = record[staples_site]
        if year != 2023 or pd.isna(archived_link) or year in stored_years:
            continue

        # Insert 'mp_' right after the archive timestamp.
        link = archived_link.replace(f'/http://{staples_site}/', f'mp_/http://{staples_site}/')

        num_categories, category_dict = get_categories_staples_23(link)
        category_data.append({
            'ano': year,
            'link': link,
            'site': staples_site,
            'numero_categorias': num_categories,
            'lista_categorias': list(category_dict.keys()),
            'dicionario_subcategorias': category_dict
        })

# Append the collected rows to the results CSV.
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

PcDiga Extracted Categories:
- Categories cannot be extracted from the 2007 capture, so we extract them from the 2008 capture and record them under 2007, assuming the differences between the two years are minimal.
- Categories cannot be extracted from the 2023 capture either, so we assume they are the same as (or very close to) the 2020 ones.


In [None]:
def get_categories_pcdiga_08(url, timeout=60):
    """Extract the main categories from the 2008 PcDiga layout.

    The categories are the clickable areas of the image map named 'Map'; each
    name is taken from the text after the last '=' of the area's href
    (``...?Familia=name``).

    Args:
        url: Address of the archived page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(count, categories)`` tuple where every category name maps to an
        empty list (this layout has no subcategories), or ``(0, {})`` (after
        printing the error) when the page cannot be fetched or parsed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        categories_container = soup.find('map', attrs={'name': 'Map'})

        categories_dict = {}
        for category in categories_container.find_all('area'):
            href = category.get('href')
            if not href:
                # An area without a target carries no category name; skip it
                # instead of aborting the whole extraction.
                continue
            main_category_name = href.split('=')[-1]
            # Main categories have no subcategories in this layout.
            categories_dict[main_category_name] = []

        return len(categories_dict), categories_dict
    except Exception as e:
        print(f"Error while extracting categories: {e}")
        return 0, {}

# Collect the PcDiga categories from the 2008 and 2010 captures and append
# them to the results CSV. The 2008 capture is recorded under 2007.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

site_column = 'www.pcdiga.com'
# Years already stored for this site; those are not extracted again.
saved_years = category_analysis[category_analysis['site'] == site_column]['ano'].values

if site_column in data.columns[1:]:
    for index, row in data.iterrows():
        year = row['Unnamed: 0']
        link = row[site_column]
        if year not in (2008, 2010) or pd.isna(link):
            continue

        # The 2008 capture stands in for 2007, so the duplicate check must use
        # the year that is written to the CSV; checking 2008 never matches.
        n_year = year - 1 if year == 2008 else year
        if n_year in saved_years:
            continue

        # Insert the 'mp_' modifier right after the archive timestamp and
        # point at the shop's sub-path.
        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        link += 'pcdiga/'

        num_categories, category_dict = get_categories_pcdiga_08(link)
        category_data.append({
            'ano': n_year,
            'link': link,
            'site': site_column,
            'numero_categorias': num_categories,
            'lista_categorias': list(category_dict.keys()),
            'dicionario_subcategorias': category_dict
        })

# Append the collected rows to the results CSV.
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)



In [None]:
def get_categories_pcdiga_15(url):
    """Extract the main categories from the 2015 PcDiga layout.

    Returns:
        A ``(count, categories)`` tuple where every category name maps to an
        empty list (no subcategories are read from this layout), or
        ``(0, {})`` (after printing the error) when the page cannot be
        fetched or parsed.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'html.parser')

        # Every 'menu1' table inside the 'masterdiv' block is one category.
        menu_tables = page.find('div', id='masterdiv').find_all('table', class_='menu1')
        titles = [table.find('a', class_='menu').get_text(strip=True) for table in menu_tables]
        categories = {title: [] for title in titles}

        return len(categories), categories
    except Exception as err:
        print(f"Error while extracting categories: {err}")
        return 0, {}

# Gather the 2015 PcDiga categories and append them to the results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

pcdiga_site = 'www.pcdiga.com'
# Years already stored for this site; those are not extracted again.
stored_years = category_analysis[category_analysis['site'] == pcdiga_site]['ano'].values

if pcdiga_site in data.columns[1:]:
    for _, record in data.iterrows():
        year = record['Unnamed: 0']
        archived_link = record[pcdiga_site]
        if year != 2015 or pd.isna(archived_link) or year in stored_years:
            continue

        # Insert 'mp_' right after the archive timestamp.
        link = archived_link.replace(f'/http://{pcdiga_site}/', f'mp_/http://{pcdiga_site}/')

        num_categories, category_dict = get_categories_pcdiga_15(link)
        category_data.append({
            'ano': year,
            'link': link,
            'site': pcdiga_site,
            'numero_categorias': num_categories,
            'lista_categorias': list(category_dict.keys()),
            'dicionario_subcategorias': category_dict
        })

# Append the collected rows to the results CSV.
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)



In [None]:
def get_categories_pcdiga_20(url):
    """Extract categories and subcategories from the 2020 PcDiga mega menu.

    Only menu items that have both an 'i-link' anchor and an arrow icon are
    treated as main categories; their subcategories come from the links of
    the item's 'submenu' block, when there is one.

    Returns:
        A ``(count, categories)`` tuple mapping main category names to their
        subcategory names, or ``(0, {})`` (after printing the error) when the
        page cannot be fetched or parsed.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'html.parser')

        menu = page.find('div', class_='megamenu-wrapper').find('ul', class_='megamenu')

        found = {}
        for item in menu.find_all('li'):
            top_link = item.find('a', class_='i-link')
            arrow = item.find('span', class_='svg-i svg-arrow')
            if top_link and arrow:
                title = top_link.find('span', class_='lnk-text').get_text(strip=True)

                submenu = item.find('div', class_='submenu')
                entries = submenu.find_all('li') if submenu else []
                entry_links = [entry.find('a', class_='i-link') for entry in entries]
                found[title] = [
                    entry_link.find('span').get_text(strip=True)
                    for entry_link in entry_links
                    if entry_link
                ]

        return len(found), found
    except Exception as err:
        print(f"Error while extracting categories: {err}")
        return 0, {}

# Gather the 2020 PcDiga categories and append them to the results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

pcdiga_site = 'www.pcdiga.com'
# Years already stored for this site; those are not extracted again.
stored_years = category_analysis[category_analysis['site'] == pcdiga_site]['ano'].values

if pcdiga_site in data.columns[1:]:
    for _, record in data.iterrows():
        year = record['Unnamed: 0']
        archived_link = record[pcdiga_site]
        if year != 2020 or pd.isna(archived_link) or year in stored_years:
            continue

        # Insert 'mp_' right after the archive timestamp.
        link = archived_link.replace(f'/http://{pcdiga_site}/', f'mp_/http://{pcdiga_site}/')

        num_categories, category_dict = get_categories_pcdiga_20(link)
        category_data.append({
            'ano': year,
            'link': link,
            'site': pcdiga_site,
            'numero_categorias': num_categories,
            'lista_categorias': list(category_dict.keys()),
            'dicionario_subcategorias': category_dict
        })

# Append the collected rows to the results CSV.
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)



RadioPopular Extracted Categories:


In [None]:
def get_categories_2010_rp(url, timeout=60):
    """Extract the main categories from the 2007/2010 RadioPopular layout.

    Category names are the ``alt`` texts of the linked images found in
    left-aligned table cells. The 'Recrutamento' entry is removed from the
    result.

    Args:
        url: Address of the archived page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(count, categories)`` tuple where every category name maps to an
        empty list, or ``(0, {})`` when the page cannot be fetched or parsed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        categories_dict = {}

        for cell in soup.find_all('td', align='left'):
            a = cell.find('a')
            img = a.find('img') if a is not None else None
            alt = img.get('alt') if img is not None else None
            # Links without an image (or without alt text) carry no category
            # name; skip them instead of aborting the whole extraction.
            if alt:
                categories_dict[alt] = []

        categories_dict.pop('Recrutamento', None)
        return len(categories_dict), categories_dict
    except Exception as e:
        # Best effort: report the failure and signal it with an empty result.
        print(f"Error while extracting categories: {e}")
        return 0, {}
        
# Collect the 2007 and 2010 RadioPopular categories and append them to the results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

site_column = 'www.radiopopular.pt'
# Links already present in the results file (stored in their rewritten form).
saved_links = set(category_analysis['link'].dropna())

if site_column in data.columns[1:]:
    for index, row in data.iterrows():
        year = row['Unnamed: 0']
        link = row[site_column]
        if year not in (2007, 2010) or pd.isna(link):
            continue

        # Insert the 'mp_' modifier right after the archive timestamp.
        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')

        # The CSV stores the rewritten link, so the duplicate check has to
        # compare against it; the raw link would never match.
        if link in saved_links:
            continue

        num_categories, category_dict = get_categories_2010_rp(link)
        if num_categories > 0:
            category_data.append({
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            })

# Append the collected rows to the results CSV.
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [None]:
def get_categories_2015_rp(url):
    """Extract categories and subcategories from an archived Rádio Popular page.

    Reads every ``li.dir`` entry inside ``div#nav``; the text of its first
    link is the category name and the ``li.dir`` entries of its first nested
    ``ul`` are the subcategories. Categories without subcategories are dropped.

    Returns ``(number_of_categories, {category: [subcategories]})`` or
    ``(0, {})`` when the menu is missing or anything fails.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'html.parser')

        # Locate the container holding the main categories.
        nav = page.find('div', id='nav')
        if nav is None:
            return 0, {}

        collected = {}
        for entry in nav.find_all('li', class_='dir'):
            label = entry.find('a').get_text(strip=True)
            nested = entry.find('ul')
            children = nested.find_all('li', class_='dir') if nested else []
            collected[label] = [child.find('a').get_text(strip=True) for child in children]

        # Keep only the categories that actually have subcategories.
        categories_dict = {label: subs for label, subs in collected.items() if len(subs) != 0}
        return len(categories_dict), categories_dict
    except Exception:
        return 0, {}
        
# Scrape the Rádio Popular category menu for the 2015 snapshot and append the
# result to the shared category-analysis CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')
# Links already stored in the analysis CSV, used to avoid appending duplicates.
known_links = set(category_analysis['link'].dropna())

for index, row in data.iterrows():
    year = row['Unnamed: 0']
    if year != 2015:
        continue

    for site_column in data.columns[1:]:
        if site_column != 'www.radiopopular.pt':
            continue
        archived_link = row[site_column]
        if pd.isna(archived_link):
            continue

        # NOTE(review): the "mp_" modifier presumably selects the frameless
        # replay of the archived page — confirm against the arquivo.pt docs.
        link = archived_link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        # The CSV stores the rewritten link, so the duplicate check must look
        # at it as well; testing only the raw link re-appends rows on every run.
        if link in known_links or archived_link in known_links:
            continue

        num_categories, category_dict = get_categories_2015_rp(link)
        if num_categories > 0:
            category_data.append({
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            })
            known_links.add(link)

# Append the collected rows to the CSV (skipped when nothing new was scraped).
category_data_df = pd.DataFrame(category_data)
if not category_data_df.empty:
    category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [None]:
def get_categories_2020_rp(url):
    """Extract categories and subcategories from an archived Rádio Popular page.

    Each ``li.category.link.cb`` entry of ``ul.categories`` is a category named
    after its first link; the ``li.subcategory.family.link`` entries inside its
    ``div.subcategories`` (when present) are its subcategories.

    Returns ``(number_of_categories, {category: [subcategories]})`` or
    ``(0, {})`` when anything fails.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'html.parser')

        # Locate the list holding the main categories.
        menu = page.find('ul', class_='categories')

        categories_dict = {}
        for entry in menu.find_all('li', class_='category link cb'):
            label = entry.find('a').get_text(strip=True)
            panel = entry.find('div', class_='subcategories')
            leaves = panel.find_all('li', class_="subcategory family link") if panel else []
            categories_dict[label] = [leaf.get_text(strip=True) for leaf in leaves]

        return len(categories_dict), categories_dict
    except Exception:
        return 0, {}
        
# Scrape the Rádio Popular category menus for the 2020 and 2023 snapshots and
# append the results to the shared category-analysis CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')
# Links already stored in the analysis CSV, used to avoid appending duplicates.
known_links = set(category_analysis['link'].dropna())

for index, row in data.iterrows():
    year = row['Unnamed: 0']
    if year not in (2020, 2023):
        continue

    for site_column in data.columns[1:]:
        if site_column != 'www.radiopopular.pt':
            continue
        archived_link = row[site_column]
        if pd.isna(archived_link):
            continue

        # NOTE(review): the "mp_" modifier presumably selects the frameless
        # replay of the archived page — confirm against the arquivo.pt docs.
        link = archived_link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        # The CSV stores the rewritten link, so the duplicate check must look
        # at it as well; testing only the raw link re-appends rows on every run.
        if link in known_links or archived_link in known_links:
            continue

        num_categories, category_dict = get_categories_2020_rp(link)
        if num_categories > 0:
            category_data.append({
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            })
            known_links.add(link)

# Append the collected rows to the CSV (skipped when nothing new was scraped).
category_data_df = pd.DataFrame(category_data)
if not category_data_df.empty:
    category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

As El Corte Inglés (www.elcorteingles.pt) does not expose product categories, we will not use this site in this comparison.

Fnac Extracted Categories:


As it is not possible to extract categories from the 2006–2008 Fnac snapshots, we will not use the year 2007 for Fnac in the comparison.

Error message presented: ```O seu browser não aceita cookies, pelo que não é possível o acesso ao nosso site.```


In [None]:
def get_categories_2010_fnac(url):
    """Extract categories and subcategories from an archived Fnac page.

    The menu is the ``ul`` styled ``margin-left: 40px`` inside ``div#menu``.
    Every ``li`` that has both a link and a nested ``ul`` is treated as a
    category; the links of the nested ``li`` entries are its subcategories.

    Parameters
    ----------
    url : str
        Address of the archived page to download.

    Returns
    -------
    tuple[int, dict]
        Number of categories and a dict mapping each category name to its list
        of subcategory names. Categories left without subcategories are
        omitted. ``(0, {})`` is returned when the menu is missing or the page
        cannot be fetched or parsed.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        menu = soup.find('div', id='menu')
        listing = menu.find('ul', style="margin-left: 40px") if menu is not None else None
        if listing is None:
            return 0, {}

        categories_dict = {}
        for category in listing.find_all('li'):
            anchor = category.find('a')
            nested_list = category.find('ul')
            # Only entries with a name and a nested list are categories.
            if anchor is None or nested_list is None:
                continue
            main_category_name = anchor.get_text(strip=True)

            subcategories = []
            for subcategory in nested_list.find_all('li'):
                sub_anchor = subcategory.find('a')
                if sub_anchor is None:
                    continue
                subcategory_name = sub_anchor.get_text(strip=True)
                subcategory_name = subcategory_name.replace('»', '').replace('\r\n', '').strip()
                subcategory_name = ' '.join(subcategory_name.split())
                # 'Ver todos os produtos' is a "see all" link, not a
                # subcategory; filter it before deciding whether the category
                # is empty so categories holding only that link are dropped.
                if subcategory_name and subcategory_name != 'Ver todos os produtos':
                    subcategories.append(subcategory_name)

            if subcategories:
                categories_dict[main_category_name] = subcategories

        return len(categories_dict), categories_dict
    except Exception as e:
        # Best effort: report the failure instead of hiding it, then signal
        # "nothing found" so the caller skips this snapshot.
        print(f"Could not extract categories from {url}: {e}")
        return 0, {}
        
# Scrape the Fnac category menu for the 2010 snapshot and append the result to
# the shared category-analysis CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')
# Links already stored in the analysis CSV, used to avoid appending duplicates.
known_links = set(category_analysis['link'].dropna())

for index, row in data.iterrows():
    year = row['Unnamed: 0']
    if year != 2010:
        continue

    for site_column in data.columns[1:]:
        if site_column != 'www.fnac.pt':
            continue
        archived_link = row[site_column]
        if pd.isna(archived_link):
            continue

        # NOTE(review): the "mp_" modifier presumably selects the frameless
        # replay of the archived page — confirm against the arquivo.pt docs.
        link = archived_link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        # The CSV stores the rewritten link, so the duplicate check must look
        # at it as well; testing only the raw link re-appends rows on every run.
        if link in known_links or archived_link in known_links:
            continue

        num_categories, category_dict = get_categories_2010_fnac(link)
        if num_categories > 0:
            category_data.append({
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            })
            known_links.add(link)

# Append the collected rows to the CSV (skipped when nothing new was scraped).
category_data_df = pd.DataFrame(category_data)
if not category_data_df.empty:
    category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [None]:
def get_categories_2015_fnac(url):
    """Extract categories and subcategories from an archived Fnac page.

    Walks every ``li`` of ``ul#onglets`` inside ``div#MENU``. The text of the
    entry's first link is the category name; the non-empty ``dt`` texts of its
    ``div.megaMenu`` are the subcategories. Entries without a mega menu, and
    categories without subcategories, are left out.

    Returns ``(number_of_categories, {category: [subcategories]})`` or
    ``(0, {})`` when anything fails.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'html.parser')

        tabs = page.find('div', id='MENU').find('ul', id="onglets")

        gathered = {}
        current_name = None
        for item in tabs.find_all('li'):
            anchor = item.find('a')
            if anchor:
                current_name = anchor.get_text(strip=True)

            mega_menu = item.find('div', class_='megaMenu')
            if not mega_menu:
                continue

            # A <dt> may hold the subcategory text directly or wrap it in a link.
            titles = [heading.get_text(strip=True) for heading in mega_menu.find_all('dt')]
            gathered[current_name] = [title for title in titles if title]

        categories_dict = {name: subs for name, subs in gathered.items() if subs}
        return len(categories_dict), categories_dict
    except Exception:
        return 0, {}
        
# Scrape the Fnac category menu for the 2015 snapshot and append the result to
# the shared category-analysis CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')
# Links already stored in the analysis CSV, used to avoid appending duplicates.
known_links = set(category_analysis['link'].dropna())

for index, row in data.iterrows():
    year = row['Unnamed: 0']
    if year != 2015:
        continue

    for site_column in data.columns[1:]:
        if site_column != 'www.fnac.pt':
            continue
        archived_link = row[site_column]
        if pd.isna(archived_link):
            continue

        # NOTE(review): the "mp_" modifier presumably selects the frameless
        # replay of the archived page — confirm against the arquivo.pt docs.
        link = archived_link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        # The CSV stores the rewritten link, so the duplicate check must look
        # at it as well; testing only the raw link re-appends rows on every run.
        if link in known_links or archived_link in known_links:
            continue

        num_categories, category_dict = get_categories_2015_fnac(link)
        if num_categories > 0:
            category_data.append({
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            })
            known_links.add(link)

# Append the collected rows to the CSV (skipped when nothing new was scraped).
category_data_df = pd.DataFrame(category_data)
if not category_data_df.empty:
    category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [None]:
def get_categories_2020_fnac(url):
    """Extract the main categories from an archived Fnac page.

    Category names are the first-link texts of the ``CategoryNav-item``
    entries of the ``CategoryNav`` list inside the ``Sidebar-nano`` container.
    No subcategories are collected, so every category maps to an empty list.

    Returns ``(number_of_categories, {category: []})`` or ``(0, {})`` when
    anything fails.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'html.parser')

        sidebar = page.find('div', class_='Sidebar-nano nano')
        navigation = sidebar.find('ul', class_="CategoryNav js-CategoryNav")
        entries = navigation.find_all('li', class_='CategoryNav-item js-CategoryNav-item')

        categories_dict = {entry.find('a').get_text(strip=True): [] for entry in entries}
        return len(categories_dict), categories_dict
    except Exception:
        return 0, {}
        
# Scrape the Fnac category menu for the 2020 snapshot and append the result to
# the shared category-analysis CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')
# Links already stored in the analysis CSV, used to avoid appending duplicates.
known_links = set(category_analysis['link'].dropna())

for index, row in data.iterrows():
    year = row['Unnamed: 0']
    if year != 2020:
        continue

    for site_column in data.columns[1:]:
        if site_column != 'www.fnac.pt':
            continue
        archived_link = row[site_column]
        if pd.isna(archived_link):
            continue

        # NOTE(review): the "mp_" modifier presumably selects the frameless
        # replay of the archived page — confirm against the arquivo.pt docs.
        link = archived_link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        # The CSV stores the rewritten link, so the duplicate check must look
        # at it as well; testing only the raw link re-appends rows on every run.
        if link in known_links or archived_link in known_links:
            continue

        num_categories, category_dict = get_categories_2020_fnac(link)
        if num_categories > 0:
            category_data.append({
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            })
            known_links.add(link)

# Append the collected rows to the CSV (skipped when nothing new was scraped).
category_data_df = pd.DataFrame(category_data)
if not category_data_df.empty:
    category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [None]:
def get_categories_2023_fnac(url):
    """Extract the main categories from an archived Fnac page.

    Category names are the first-link texts of the ``SideNavPanel-listItem``
    entries of the ``SideNavPanel-list`` inside ``SideNavPanel-listWrapper``.
    No subcategories are collected, so every category maps to an empty list.

    Returns ``(number_of_categories, {category: []})`` or ``(0, {})`` when
    anything fails.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'html.parser')

        wrapper = page.find('div', class_='SideNavPanel-listWrapper')
        panel_list = wrapper.find('ul', class_="SideNavPanel-list")
        entries = panel_list.find_all('li', class_='SideNavPanel-listItem js-SideNavPanel-listItem')

        categories_dict = {entry.find('a').get_text(strip=True): [] for entry in entries}
        return len(categories_dict), categories_dict
    except Exception:
        return 0, {}
        
# Scrape the Fnac category menu for the 2023 snapshot and append the result to
# the shared category-analysis CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')
# Links already stored in the analysis CSV, used to avoid appending duplicates.
known_links = set(category_analysis['link'].dropna())

for index, row in data.iterrows():
    year = row['Unnamed: 0']
    if year != 2023:
        continue

    for site_column in data.columns[1:]:
        if site_column != 'www.fnac.pt':
            continue
        archived_link = row[site_column]
        if pd.isna(archived_link):
            continue

        # NOTE(review): the "mp_" modifier presumably selects the frameless
        # replay of the archived page — confirm against the arquivo.pt docs.
        link = archived_link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        # The CSV stores the rewritten link, so the duplicate check must look
        # at it as well; testing only the raw link re-appends rows on every run.
        if link in known_links or archived_link in known_links:
            continue

        num_categories, category_dict = get_categories_2023_fnac(link)
        if num_categories > 0:
            category_data.append({
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            })
            known_links.add(link)

# Append the collected rows to the CSV (skipped when nothing new was scraped).
category_data_df = pd.DataFrame(category_data)
if not category_data_df.empty:
    category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

 ## Extracting product prices

We will use the BeautifulSoup library to extract the product prices from the websites, and we will use the requests library to get the HTML content of the websites.


In [6]:
# Load the archived smartphone page records used for the price extraction.
df = pd.read_csv('data/smartphones_arquivo.csv')

In [7]:
# Preview the first rows of the loaded records.
df.head()

Unnamed: 0,site,model,periodo,title,originalURL,linkToArchive,tstamp,contentLength,digest,mimeType,...,date,linkToScreenshot,linkToNoFrame,linkToExtractedText,linkToMetadata,linkToOriginalFile,snippet,fileName,collection,offset
0,www.fnac.pt,huawei p8,2016-2020,Huawei P8 Lite Huawei - Tecnologia - Fnac.pt,https://www.fnac.pt/Smartphones-e-Telemoveis/S...,https://arquivo.pt/wayback/20170819112525/http...,20170819112525,188977,d129cd257066d54e046fd147e20e50a0,text/html,...,1503141925,https://arquivo.pt/screenshot?url=https%3A%2F%...,https://arquivo.pt/noFrame/replay/201708191125...,https://arquivo.pt/textextracted?m=https%3A%2F...,https://arquivo.pt/textsearch?metadata=https%3...,https://arquivo.pt/noFrame/replay/201708191125...,<em>Huawei</em> <em>P8</em> Lite <em>Huawei</e...,IAH-20170819112306-68006-p81.arquivo.pt,AWP24,3773684
1,www.fnac.pt,huawei p8,2016-2020,"HUAWEI TELEMOVEL HUAWEI P8 DS 64GB, SmartPhone...",http://www.fnac.pt/HUAWEI-TELEMOVEL-HUAWEI-P8-...,https://arquivo.pt/wayback/20160211013245/http...,20160211013245,168566,e7cba63ab0ad2ff865ee01fc24aaeb21,text/html,...,1455154365,https://arquivo.pt/screenshot?url=https%3A%2F%...,https://arquivo.pt/noFrame/replay/201602110132...,https://arquivo.pt/textextracted?m=http%3A%2F%...,https://arquivo.pt/textsearch?metadata=http%3A...,https://arquivo.pt/noFrame/replay/201602110132...,<em>HUAWEI</em> TELEMOVEL <em>HUAWEI</em> <em>...,IAH-20160211012526-43988-p81.arquivo.pt,AWP20,44697354
2,www.fnac.pt,huawei p20,2016-2020,Smartphone Huawei P20 - 128GB - Black - SmartP...,https://www.fnac.pt/Smartphone-Huawei-P20-128G...,https://arquivo.pt/wayback/20190422222720/http...,20190422222720,311043,85b8ba9669ccbedeae1fcd8fe1d10abd,text/html,...,1555972040,https://arquivo.pt/screenshot?url=https%3A%2F%...,https://arquivo.pt/noFrame/replay/201904222227...,https://arquivo.pt/textextracted?m=https%3A%2F...,https://arquivo.pt/textsearch?metadata=https%3...,https://arquivo.pt/noFrame/replay/201904222227...,Smartphone <em>Huawei</em> <em>P20</em> - 128G...,WEB-20190422222653112-p81.arquivo.pt,AWP29,63201824
3,www.fnac.pt,huawei p20,2016-2020,Smartphone Huawei P20 Pro - 128GB - Black - Sm...,https://www.fnac.pt/Smartphone-Huawei-P20-Pro-...,https://arquivo.pt/wayback/20190422222711/http...,20190422222711,293053,2530c422432b25e633797fa5469eb6b5,text/html,...,1555972031,https://arquivo.pt/screenshot?url=https%3A%2F%...,https://arquivo.pt/noFrame/replay/201904222227...,https://arquivo.pt/textextracted?m=https%3A%2F...,https://arquivo.pt/textsearch?metadata=https%3...,https://arquivo.pt/noFrame/replay/201904222227...,Smartphone <em>Huawei</em> <em>P20</em> Pro - ...,WEB-20190422222653112-p81.arquivo.pt,AWP29,19579119
4,www.fnac.pt,huawei p30,2016-2020,Novos Huawei P30 - Sabe mais em Fnac.pt,https://www.fnac.pt/novos-huawei,https://arquivo.pt/wayback/20190323202914/http...,20190323202914,57729,38aaee64d7aa7b777d18759f0de09b53,text/html,...,1553372954,https://arquivo.pt/screenshot?url=https%3A%2F%...,https://arquivo.pt/noFrame/replay/201903232029...,https://arquivo.pt/textextracted?m=https%3A%2F...,https://arquivo.pt/textsearch?metadata=https%3...,https://arquivo.pt/noFrame/replay/201903232029...,Novos <em>Huawei</em> <em>P30</em> - Sabe mais...,WEB-20190323202843497-p81.arquivo.pt,AWP29,87173605


In [12]:
# Summarise the columns: names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176 entries, 0 to 175
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   site                 176 non-null    object
 1   model                176 non-null    object
 2   periodo              176 non-null    object
 3   title                176 non-null    object
 4   originalURL          176 non-null    object
 5   linkToArchive        176 non-null    object
 6   tstamp               176 non-null    int64 
 7   contentLength        176 non-null    int64 
 8   digest               176 non-null    object
 9   mimeType             176 non-null    object
 10  encoding             176 non-null    object
 11  date                 176 non-null    int64 
 12  linkToScreenshot     176 non-null    object
 13  linkToNoFrame        176 non-null    object
 14  linkToExtractedText  176 non-null    object
 15  linkToMetadata       176 non-null    object
 16  linkToOr

<br>

Initially, we attempted to extract product prices from all pages using links to the extracted text, but due to a lack of consistent patterns, this approach was unsuccessful. In some cases, there were multiple products on a single page, while others contained only one. To maximize extraction, we created a web scraping script tailored to each page type. The script efficiently handles various page layouts, extracting prices for all products on multi-product pages and retrieving the price of the sole product on single-product pages.

<br>

# Fnac Product Prices



In [59]:
def extract_prices(text, site):
    """Extract product prices from the HTML of an archived Fnac page.

    The known page layouts are tried in order and the first one that yields a
    price wins. Each match is also printed, prefixed with the number of the
    rule that produced it.

    Parameters
    ----------
    text : str
        HTML of the page.
    site : str
        Host name of the page. Not used by the extraction; kept so existing
        callers keep working.

    Returns
    -------
    dict, list of dict, or None
        * single-product layouts: ``{'current_price': ...}``, optionally with
          ``'old_price'``;
        * listing layouts: a list of dicts with ``'product_name'``,
          ``'current_price'`` and, when present, ``'old_price'``;
        * ``None`` when no price could be extracted.
    """
    soup = BeautifulSoup(text, 'html.parser')

    # Single product: marketplace buy box.
    price_container = soup.find("div", id="ArticleHeaderPT_FullPricerControl_ctl00_BuyBoxMarketPlace_OfferPrice1")
    if price_container:
        price_single = price_container.find("span", class_="price")
        if price_single:
            price = price_single.get_text(strip=True)
            print('1.', price)
            return {'current_price': price}

    # Single product: "ProductPriceBox" layout.
    price_container_box = soup.find("div", class_="ProductPriceBox")
    if price_container_box:
        price_single = price_container_box.find("strong", class_="product-price")
        if price_single:
            price = price_single.get_text(strip=True)
            print('1.', price)
            return {'current_price': price}

    # Single product: "f-priceBox" layout, optionally with a struck-out price.
    price_box = soup.find("div", class_="f-priceBox")
    if price_box:
        old_price_tag = price_box.find("span", class_="f-priceBox-price f-priceBox-price--old")
        current_price_tag = price_box.find("span", class_="f-priceBox-price f-priceBox-price--reco checked")
        # Without a current price this layout gives nothing usable; fall
        # through to the remaining layouts instead of dereferencing None.
        if current_price_tag:
            current_price = current_price_tag.get_text(strip=True)
            if old_price_tag:
                old_price = old_price_tag.get_text(strip=True)
                print('2.', old_price, current_price)
                return {'old_price': old_price, 'current_price': current_price}
            print('3.', current_price)
            return {'current_price': current_price}

    # Product listing: "Article-item" / "Article-itemGroup" layout.
    product_list = soup.find_all("li", class_="clearfix Article-item") or soup.find_all("div", class_="Article-itemGroup")
    if product_list:
        products = []
        for product in product_list:
            name_tag = product.find("a", class_="js-minifa-title") or product.find("p", class_="Article-desc")
            product_name = name_tag.get_text(strip=True) if name_tag else None

            old_price_tag = product.find("span", class_="oldPrice")
            current_price_tag = product.find("a", class_="userPrice") or product.find("strong", class_="userPrice")
            if not current_price_tag:
                # Entries without a current price are skipped.
                continue

            current_price = current_price_tag.get_text(strip=True)
            if old_price_tag:
                old_price = old_price_tag.get_text(strip=True)
                print('4.', product_name, old_price, current_price)
                products.append({
                    'product_name': product_name,
                    'old_price': old_price,
                    'current_price': current_price
                })
            else:
                print('5.', product_name, current_price)
                products.append({
                    'product_name': product_name,
                    'current_price': current_price
                })

        return products if products else None

    # Product listing: "blk_inside" / "produit" layout (price may be None).
    blk_inside = soup.find("div", class_="blk_inside")
    if blk_inside:
        products = []
        for produit in blk_inside.find_all("div", class_="produit"):
            name_tag = produit.find("dt")
            product_name = name_tag.get_text(strip=True) if name_tag else None

            price_tag = produit.find("span", class_="smallPrice")
            price = price_tag.get_text(strip=True) if price_tag else None
            print('6.', product_name, price)
            products.append({
                'product_name': product_name,
                'current_price': price
            })

        return products if products else None

    # Product listing: "thumbnail" layout.
    product_list = soup.find_all("div", class_="thumbnail")
    if product_list:
        products = []
        for product in product_list:
            name_tag = product.find("a", class_="thumbnail-titleLink")
            product_name = name_tag.get_text(strip=True) if name_tag else None

            price_tag = product.find("span", class_="thumbnail-price")
            old_price_tag = product.find("del", class_="thumbnail-priceOld")

            current_price = price_tag.get_text(strip=True) if price_tag else None
            old_price = old_price_tag.get_text(strip=True) if old_price_tag else None

            if not current_price:
                continue
            if old_price:
                print('7.', product_name, old_price, current_price)
                products.append({
                    'product_name': product_name,
                    'old_price': old_price,
                    'current_price': current_price
                })
            else:
                print('8.', product_name, current_price)
                products.append({
                    'product_name': product_name,
                    'current_price': current_price
                })
        return products if products else None

    return None


In [None]:
def process_row_FNAC(df):
    """Download every Fnac page listed in ``df`` and collect its prices.

    Rows whose ``site`` is not ``'www.fnac.pt'`` are ignored. For each Fnac
    row the archived page is fetched and passed to ``extract_prices``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``site``, ``linkToArchive``, ``tstamp``,
        ``title``, ``snippet`` and ``linkToExtractedText``.

    Returns
    -------
    list of dict
        One record per extracted product (or one per page when the page holds
        a single product, could not be fetched, or yielded no price). Keys:
        ``site``, ``date`` (first 8 characters of ``tstamp``), ``title``,
        ``extractedData`` (``[current]``, ``[current, old]`` or ``None``),
        ``linkToArchive``, ``linkToExtractedText`` and ``snippet``.
    """
    extracted_data_list = []

    def build_record(row, site, date, link, record_title, prices):
        # Key order matters: it defines the column order of the result frame.
        return {
            'site': site,
            'date': date,
            'title': record_title,
            'extractedData': prices,
            'linkToArchive': link,
            'linkToExtractedText': row['linkToExtractedText'],
            'snippet': row['snippet']
        }

    def as_price_list(entry):
        # [current, old] when both prices are known, otherwise [current].
        old = entry.get('old_price')
        current = entry.get('current_price')
        return [current, old] if old and current else [current]

    for index, row in df.iterrows():
        site = row['site']
        if site != 'www.fnac.pt':
            continue

        # NOTE(review): only "https://" archive links are rewritten here;
        # "http://" links are fetched unchanged — confirm this is intended.
        link = row['linkToArchive'].replace(f'/https://{site}/', f'mp_/https://{site}/')
        date = str(row['tstamp'])[:8]
        title = row['title']

        try:
            response = requests.get(link)
            response.raise_for_status()
            text = response.text
        except requests.RequestException:
            print(f"Error while fetching {link}")
            extracted_data_list.append(build_record(row, site, date, link, title, None))
            # Nothing was downloaded for this row, so there is nothing to
            # parse; without this the code would use an unbound or stale page.
            continue

        extracted_data = extract_prices(text, site)

        if not extracted_data:
            extracted_data_list.append(build_record(row, site, date, link, title, None))
        elif isinstance(extracted_data, list):
            # Multi-product page: one record per product, titled by product.
            for product in extracted_data:
                extracted_data_list.append(
                    build_record(row, site, date, link, product.get('product_name'), as_price_list(product))
                )
        else:
            extracted_data_list.append(
                build_record(row, site, date, link, title, as_price_list(extracted_data))
            )

    return extracted_data_list

In [None]:
# Extract the Fnac prices for every archived smartphone page and save them.
df = pd.read_csv('data/smartphones_arquivo.csv')
result_list = process_row_FNAC(df)
# Passing the column names to the constructor keeps the expected header even
# when nothing was extracted; assigning `.columns` on an empty frame raises.
df_result = pd.DataFrame(
    result_list,
    columns=['site', 'date', 'title', 'extractedData', 'linkToArchive', 'linkToExtractedText', 'snippet'],
)

df_result.to_csv('data/products_extracted_prices_FNAC.csv', index=False)


4. Huawei P8 Lite 2017 - Black 259,99 € 219,99 €
4. Huawei P8 Lite (Black) 229,99 € 179,99 €
4. Huawei P8 Lite (Gold) 229,99 € 179,99 €
4. Huawei P8 Lite 2017 - Gold 259,99 € 219,99 €
4. Huawei P8 Lite (White) 229,99 € 179,99 €
4. Huawei P8 Lite 2017 - White 259,99 € 219,99 €
5. Huawei P8 Lite 16GB 4G Preto 169,90 €
5. Smartphone Huawei P8 Lite Dual Sim 2GB 16GB Dourado 179,30 €
5. Huawei P8 Lite 16GB 4G Branco 169,93 €
5. Huawei P8 Lite 2017 Dual SIM 4G 16GB Azul 286,68 €
5. smartphone Huawei P8 Lite 772997 4G 16GB Preto 283 €
5. Huawei P8 Lite 2017 4G 16GB Dourado 278 €
5. Huawei P8 Lite 2017 4G 16GB 278 €
5. Huawei P8 Lite 2017 4G 16GB 278 €
5. Smartphone HUAWEI P8 Lite 4G 16GB (Preto) 172,90 €
5. Huawei P8 Lite 4G 16GB Branco 169,90 €
5. Huawei P8 Lite 16GB 4G Preto 169,90 €
3. 599,99 €
3. 799,99 €
2. 379,99 € 349,99 €
3. 1 049,99 €
3. 1 049,99 €
4. Samsung Galaxy S8 - G950FZ - Preto Meia-Noite 719,99 € 549,99 €
4. Samsung Galaxy S8+ - G955FZ - Prateado 819,99 € 649,99 €
4. Samsung

In [None]:
# Extract the Fnac prices for the archived Black Friday pages and save them.
df = pd.read_csv('data/resultados_black_friday_arquivo.csv')

result_list = process_row_FNAC(df)
# Passing the column names to the constructor keeps the expected header even
# when nothing was extracted; assigning `.columns` on an empty frame raises.
df_result = pd.DataFrame(
    result_list,
    columns=['site', 'date', 'title', 'extractedData', 'linkToArchive', 'linkToExtractedText', 'snippet'],
)

df_result.to_csv('data/products_extracted_prices_FNAC_Black_Friday.csv', index=False)

2. 349,99 € 299 €
2. 349,99 € 299 €
7. Apple iPhone 8 - 64GB - Cinzento Sideral 549,99 € 499,99 €
7. Apple iPhone XS - 64GB - Cinzento Sideral 1 069,99 € 829,99 €
7. Apple iPhone 7 - 32GB (Preto Mate) 449,99 € 349,99 €
7. Apple iPhone 8 - 64GB - Prateado 549,99 € 499,99 €
7. Apple iPhone 7 - 32GB (Rosa Dourado) 449,99 € 349,99 €
7. Apple iPhone 7 - 32GB (Dourado) 449,99 € 349,99 €
7. Apple iPhone 7 - 32GB (Prateado) 449,99 € 349,99 €
7. Apple iPhone 8 - 64GB - Dourado 549,99 € 499,99 €
7. Apple iPhone XS - 64GB - Dourado 1 069,99 € 829,99 €
7. Apple iPhone XS - 64GB - Prateado 1 069,99 € 829,99 €
7. Apple MacBook Air 13'' i5-1,8GHz | 8GB | 128GB 1 129 € 839 €
7. Apple MacBook Pro 13'' Retina i5-1,4GHz | 16GB | 256GB | Intel Iris Plus Graphics 645 com Touch Bar e Touch ID - Cinzento Sideral 2 039 € 1 729 €
7. Apple MacBook Air 13'' Retina | i5-1,6GHz | 8GB | 128GB - Cinzento Sideral 1 279 € 999 €
7. Apple MacBook Pro 15'' Retina i7-2,6GHz | 16GB | 512GB | Radeon Pro 555X com Touch Bar e

# Worten Product Prices

In [None]:
def extract_prices_worten(text, site):
    """Extract price information from the HTML of a Worten page.

    Several page layouts are tried in order; the first one that yields a
    price wins and a numbered debug line is printed for it:

    1. ``div.w-product-general_info`` with ``w-product__price__old`` /
       ``w-product__price__current`` spans.
    2. ``div.w-product__price`` with the same two spans.
    3. ``div.price-box`` with ``p.old-price`` and ``p.special-price``.
    4. ``div.price-box`` with ``p.regular-price`` only.
    5. ``div.w-product__content`` cards (one entry per card).

    Args:
        text: HTML markup of the page.
        site: Host name of the shop. Not used by the parser.

    Returns:
        * layouts 1-3: ``{'old_price': ..., 'current_price': ...}``
          (``old_price`` may be None for layouts 1-2);
        * layout 4: ``{'current_price': ...}``;
        * layout 5: a list of ``{'product_name': ..., 'current_price': ...}``;
        * ``None`` when no layout produced a price.
    """
    soup = BeautifulSoup(text, 'html.parser')

    def text_of(tag):
        # ``find`` returns None when nothing matches, so guard before reading.
        return tag.get_text(strip=True) if tag else None

    # Layouts 1 and 2 differ only in the wrapping <div>.
    for label, container_class in (('1.', "w-product-general_info"),
                                   ('2.', "w-product__price")):
        container = soup.find("div", class_=container_class)
        if not container:
            continue
        old_price = text_of(container.find("span", class_="w-product__price__old"))
        current_price = text_of(container.find("span", class_="w-product__price__current"))
        if current_price:
            print(label, old_price, current_price)
            return {'old_price': old_price, 'current_price': current_price}

    # Layouts 3 and 4: "price-box" markup.
    price_box = soup.find("div", class_="price-box")
    if price_box:
        old_price_tag = price_box.find("p", class_="old-price")
        special_price_tag = price_box.find("p", class_="special-price")
        regular_price_tag = price_box.find("p", class_="regular-price")

        if old_price_tag and special_price_tag:
            old_span = old_price_tag.find("span", class_="price")
            special_span = special_price_tag.find("span", class_="price")
            # Without both inner spans there is nothing to read here; fall
            # through to the remaining layout instead of raising.
            if old_span and special_span:
                old_price = old_span.get_text(strip=True)
                current_price = special_span.get_text(strip=True)
                print('3.', old_price, current_price)
                return {'old_price': old_price, 'current_price': current_price}

        elif regular_price_tag:
            regular_span = regular_price_tag.find("span", class_="price")
            if regular_span:
                current_price = regular_span.get_text(strip=True)
                print('4.', current_price)
                return {'current_price': current_price}

    # Layout 5: listing page, keep every card that has both a name and a price.
    products = []
    for product in soup.find_all("div", class_="w-product__content"):
        product_name = text_of(product.find("div", class_="w-product__title__wrapper"))
        current_price = text_of(product.find("span", class_="w-currentPrice"))
        if product_name and current_price:
            print('5.', product_name, current_price)
            products.append({
                'product_name': product_name,
                'current_price': current_price
            })

    return products if products else None


def process_row_worten(df):
    """Download every Worten page listed in ``df`` and collect its prices.

    Rows whose ``site`` is not ``www.worten.pt`` are ignored. For the others
    the archive link is rewritten so that ``mp_`` precedes the
    ``/https://<site>/`` part, the page is fetched and its HTML is passed to
    ``extract_prices_worten``.

    Args:
        df: DataFrame with the columns ``linkToArchive``, ``site``,
            ``tstamp``, ``title``, ``snippet`` and ``linkToExtractedText``.

    Returns:
        A list of dicts with the keys ``site``, ``date`` (first 8 characters
        of ``tstamp``), ``title``, ``extractedData``, ``linkToArchive``,
        ``linkToExtractedText`` and ``snippet``. ``extractedData`` is
        ``[current]``, ``[current, old]`` or ``None`` (download failed or no
        price found). Listing pages contribute one dict per product, titled
        with the product name.
    """
    records = []

    def as_price_list(entry):
        # [current, old] when both are present, otherwise just [current].
        previous = entry.get('old_price')
        present = entry.get('current_price')
        if previous and present:
            return [present, previous]
        return [present]

    for _, row in df.iterrows():
        archive_link = row['linkToArchive']
        site = row['site']
        archive_link = archive_link.replace(f'/https://{site}/', f'mp_/https://{site}/')
        date = str(row['tstamp'])[:8]
        title = row['title']
        snippet = row['snippet']

        if site != 'www.worten.pt':
            continue

        def make_record(record_title, prices):
            return {
                'site': site,
                'date': date,
                'title': record_title,
                'extractedData': prices,
                'linkToArchive': archive_link,
                'linkToExtractedText': row['linkToExtractedText'],
                'snippet': snippet
            }

        try:
            response = requests.get(archive_link)
            response.raise_for_status()
            html = response.text
        except requests.RequestException:
            print(f"Error while fetching {archive_link}")
            records.append(make_record(title, None))
            continue

        found = extract_prices_worten(html, site)

        if not found:
            records.append(make_record(title, None))
        elif isinstance(found, list):
            records.extend(
                make_record(item.get('product_name'), as_price_list(item))
                for item in found
            )
        else:
            records.append(make_record(title, as_price_list(found)))

    return records


In [None]:
# Worten / smartphones: extract the prices for every archived page and save them.
df = pd.read_csv('data/smartphones_arquivo.csv')
result_list = process_row_worten(df)
# Name the columns in the constructor: assigning seven labels afterwards raises
# ValueError when result_list is empty (no Worten rows in the input), whereas
# this yields a header-only table in that case.
df_result = pd.DataFrame(
    result_list,
    columns=['site', 'date', 'title', 'extractedData', 'linkToArchive', 'linkToExtractedText', 'snippet'],
)
df_result.to_csv('data/products_extracted_prices_WORTEN.csv', index=False)


Error while fetching https://arquivo.pt/wayback/20160221001735mp_/https://www.worten.pt/smartphone-huawei-p8-lite-dourado.html
2. €229,99 €199,99
2. €699,99 €579,99
2. €699,99 €579,99
2. €999,99 €849,99
2. €1049,00 €899,99
2. €1049,00 €899,99
2. €719,99 €585,99
2. €719,99 €599,99
2. None €969,99
2. None €869,99
2. €909,00 €599,99
2. €909,00 €599,99
Error while fetching https://arquivo.pt/wayback/20151209031224mp_/https://www.worten.pt/iphone-6-apple-64gb-space-grey.html
2. None €829,99
2. None €829,99
2. None €1179,00
2. None €1359,00
2. None €879,99
2. None €1349,00
2. None €1349,00
2. None €1179,00
2. None €879,99
2. None €1049,00
2. None €379,99
5. Consola PS4 Pro 1 TB + Jogo Red Dead Redemption 2 €439,99
5. Consola PS4 Slim 500 GB + Jogo Fortnite €299,99
5. Consola PS4 Pro 1 TB + Voucher Jogo Fortnite €399,99
5. Consola PS4 Slim 1TB Marvel's Spider-Man (Edição Limitada) €379,99
5. Consola PS4 Red Dead Redemption 2 €349,99
5. Consola PS4 Slim 1TB + Jogo Marvel's Spider-Man €349,99
5

In [None]:
# Worten / Black Friday: extract the prices for every archived page and save them.
df = pd.read_csv('data/resultados_black_friday_arquivo.csv')
result_list = process_row_worten(df)
# Name the columns in the constructor: assigning seven labels afterwards raises
# ValueError when result_list is empty (no Worten rows in the input), whereas
# this yields a header-only table in that case.
df_result = pd.DataFrame(
    result_list,
    columns=['site', 'date', 'title', 'extractedData', 'linkToArchive', 'linkToExtractedText', 'snippet'],
)
df_result.to_csv('data/products_extracted_prices_WORTEN_Black_Friday.csv', index=False)

5. Recondicionado - Smartphone iPhone 6 16GB Cinzento sideral €179,99
5. Recondicionado - Smartphone iPhone 8 64GB Dourado €579,99
5. Recondicionado - Smartphone iPhone 6 64GB Prateado €259,99
5. Recondicionado - Smartphone iPhone 6 64GB Cinzento sideral €259,99
5. Recondicionado - Smartphone iPhone 7 128GB Prateado €449,99
5. Recondicionado - Smartphone iPhone SE 16GB Rosa Dourado €199,99
5. Recondicionado - Smartphone iPhone 6 64GB Dourado €259,99
5. Recondicionado - Smartphone iPhone 7 32GB Preto €379,99
5. Recondicionado - Smartphone iPhone 6s 32GB Cinzento sideral €289,99
5. Recondicionado - Smartphone iPhone 6s 16GB Rosa Dourado €249,99
5. Recondicionado - Smartphone iPhone SE 16GB Prateado €199,99
5. Recondicionado - Smartphone Samsung Galaxy S7 Edge 32GB Rosa €379,99
5. Recondicionado - Smartphone iPhone 6s 16GB Cinzento Sideral €249,99
5. Recondicionado - Smartphone Samsung Galaxy S7 Edge 32GB Dourado €379,99
5. Recondicionado - Smartphone iPhone SE 16GB Dourado €199,99
5. Rec

# El Corte Inglês Product Prices

In [None]:

def extract_prices_elcorteingles(text, site):
    """Read the current (and, if shown, former) price from an El Corte Inglés page.

    Looks inside ``div.product-price`` for ``span.current`` and
    ``span`` with class ``former stroked``. Prints a debug line prefixed
    with ``1.`` when a price is found.

    Args:
        text: HTML markup of the page.
        site: Host name of the shop. Not used by the parser.

    Returns:
        ``{'old_price': ..., 'current_price': ...}`` (``old_price`` may be
        None) or ``None`` when no current price is present.
    """
    soup = BeautifulSoup(text, 'html.parser')

    price_block = soup.find("div", class_="product-price")
    if not price_block:
        return None

    current_span = price_block.find("span", class_="current")
    former_span = price_block.find("span", class_="former stroked")

    current_price = None
    if current_span:
        current_price = current_span.get_text(strip=True)
    old_price = None
    if former_span:
        old_price = former_span.get_text(strip=True)

    if not current_price:
        return None

    print('1.', old_price, current_price)
    return {'old_price': old_price, 'current_price': current_price}


def process_row_elcorteingles(df):
    """Download every El Corte Inglés page listed in ``df`` and collect its prices.

    Rows whose ``site`` is not ``www.elcorteingles.pt`` are ignored. For the
    others the archive link is rewritten so that ``mp_`` precedes the
    ``/https://<site>/`` part (or the ``/http://<site>/`` part when the
    https form is absent), the page is fetched and its HTML is passed to
    ``extract_prices_elcorteingles``.

    Args:
        df: DataFrame with the columns ``linkToArchive``, ``site``,
            ``tstamp``, ``title``, ``snippet`` and ``linkToExtractedText``.

    Returns:
        A list with exactly one dict per matching row, with the keys
        ``site``, ``date`` (first 8 characters of ``tstamp``), ``title``,
        ``extractedData``, ``linkToArchive``, ``linkToExtractedText`` and
        ``snippet``. ``extractedData`` is ``[current]``, ``[current, old]``
        or ``None`` (download failed or no price found).
    """
    extracted_data_list = []
    for index, row in df.iterrows():
        original_link = row['linkToArchive']
        site = row['site']
        link = original_link.replace(f'/https://{site}/', f'mp_/https://{site}/')
        if link == original_link:
            link = link.replace(f'/http://{site}/', f'mp_/http://{site}/')
        date = str(row['tstamp'])[:8]
        title = row['title']
        snippet = row['snippet']

        if site != 'www.elcorteingles.pt':
            continue

        def make_record(prices):
            return {
                'site': site,
                'date': date,
                'title': title,
                'extractedData': prices,
                'linkToArchive': link,
                'linkToExtractedText': row['linkToExtractedText'],
                'snippet': snippet
            }

        try:
            response = requests.get(link)
            response.raise_for_status()
            text = response.text
        except requests.RequestException:
            print(f"Error while fetching {link}")
            extracted_data_list.append(make_record(None))
            # Nothing to parse for this row: without this, ``text`` would be
            # undefined (first row) or still hold the previous row's page.
            continue

        extracted_data = extract_prices_elcorteingles(text, site)

        if extracted_data:
            old = extracted_data.get('old_price')
            current = extracted_data.get('current_price')
            price = [current, old] if old and current else [current]
            extracted_data_list.append(make_record(price))
        else:
            extracted_data_list.append(make_record(None))

    return extracted_data_list


In [None]:
# El Corte Inglés / smartphones: extract the prices for every archived page and save them.
df = pd.read_csv('data/smartphones_arquivo.csv')
result_list = process_row_elcorteingles(df)
# Name the columns in the constructor: assigning seven labels afterwards raises
# ValueError when result_list is empty (no El Corte Inglés rows in the input),
# whereas this yields a header-only table in that case.
df_result = pd.DataFrame(
    result_list,
    columns=['site', 'date', 'title', 'extractedData', 'linkToArchive', 'linkToExtractedText', 'snippet'],
)
df_result.to_csv('data/products_extracted_prices_ELCORTEINGLES.csv', index=False)

1. None 259,90€
1. None 259,90€
1. None 699,90€
1. 799,90€ 559,90€
1. 799,90€ 619,99€
1. 999,90€ 789,99€
1. None 799,90€
1. None 919,90€
1. None 869,90€
1. None 869,90€
1. None 1.029,90€
1. None 929,90€
1. 449€ 399€
1. 449€ 399€
1. None 1.029€
1. None 829€
1. None 829€
1. None 879€
1. None 1.179€
1. None 1.179€
1. None 1.359€
1. None 1.179€
1. None 1.179€
1. None 1.179€
1. None 1.179€
1. None 1.279€
Error while fetching https://arquivo.pt/wayback/20190422182330mp_/https://www.elcorteingles.pt/tecnologia/A27722117-smartphone-apple-iphone-xr-64gb---black/?utm_source=kuantokusta&utm_medium=cpc&utm_campaign=catalogo
1. None 879€
1. None 939€
1. None 349,90€
1. None 349,90€
1. 69,99€ 49,99€
1. 69,99€ 44,99€
1. None 69,99€
1. 44,99€ 29,99€


In [None]:
# El Corte Inglés / Black Friday: extract the prices for every archived page and save them.
df = pd.read_csv('data/resultados_black_friday_arquivo.csv')
result_list = process_row_elcorteingles(df)
# Name the columns in the constructor: assigning seven labels afterwards raises
# ValueError when result_list is empty (no El Corte Inglés rows in the input),
# whereas this yields a header-only table in that case.
df_result = pd.DataFrame(
    result_list,
    columns=['site', 'date', 'title', 'extractedData', 'linkToArchive', 'linkToExtractedText', 'snippet'],
)
df_result.to_csv('data/products_extracted_prices_ELCORTEINGLES_Black_Friday.csv', index=False)

1. 22,99€ 17,24€
1. 22,99€ 17,24€
1. 69,99€ 49,99€
1. 69,99€ 49,99€
1. 299,99€ 199€
1. None 10€
1. 108,80€ 81,60€
1. 108,80€ 81,60€
1. None 69,99€
1. 44,99€ 29,99€
1. 39,99€ 9,99€
1. 39,99€ 9,99€
1. 108,80€ 81,60€
1. 108,80€ 81,60€


# Staples Product Prices

In [102]:

def extract_prices_staples(text, site):
    """Read the current price from a Staples page.

    The price is taken from ``span.priceupdate`` inside the element
    ``div#PriceLabel``. Prints a debug line prefixed with ``1.`` when a
    price is found.

    Args:
        text: HTML markup of the page.
        site: Host name of the shop. Not used by the parser.

    Returns:
        ``{'current_price': ...}`` or ``None`` when no price is present.
    """
    soup = BeautifulSoup(text, 'html.parser')

    label = soup.find("div", id="PriceLabel")
    if not label:
        return None

    price_span = label.find("span", class_="priceupdate")
    if not price_span:
        return None

    current_price = price_span.get_text(strip=True)
    if not current_price:
        return None

    print('1.', current_price)
    return {'current_price': current_price}


def process_row_staples(df):
    """Download every Staples page listed in ``df`` and collect its prices.

    Rows whose ``site`` is not ``www.staples.pt`` are ignored. For the others
    the archive link is rewritten so that ``mp_`` precedes the
    ``/https://<site>/`` part (or the ``/http://<site>/`` part when the
    https form is absent), the page is fetched and its HTML is passed to
    ``extract_prices_staples``.

    Args:
        df: DataFrame with the columns ``linkToArchive``, ``site``,
            ``tstamp``, ``title``, ``snippet`` and ``linkToExtractedText``.

    Returns:
        A list with exactly one dict per matching row, with the keys
        ``site``, ``date`` (first 8 characters of ``tstamp``), ``title``,
        ``extractedData``, ``linkToArchive``, ``linkToExtractedText`` and
        ``snippet``. ``extractedData`` is ``[current]``, ``[current, old]``
        or ``None`` (download failed or no price found).
    """
    extracted_data_list = []
    for index, row in df.iterrows():
        original_link = row['linkToArchive']
        site = row['site']
        link = original_link.replace(f'/https://{site}/', f'mp_/https://{site}/')
        if link == original_link:
            link = link.replace(f'/http://{site}/', f'mp_/http://{site}/')
        date = str(row['tstamp'])[:8]
        title = row['title']
        snippet = row['snippet']

        if site != 'www.staples.pt':
            continue

        def make_record(prices):
            return {
                'site': site,
                'date': date,
                'title': title,
                'extractedData': prices,
                'linkToArchive': link,
                'linkToExtractedText': row['linkToExtractedText'],
                'snippet': snippet
            }

        try:
            response = requests.get(link)
            response.raise_for_status()
            text = response.text
        except requests.RequestException:
            print(f"Error while fetching {link}")
            extracted_data_list.append(make_record(None))
            # Nothing to parse for this row: without this, ``text`` would be
            # undefined (first row) or still hold the previous row's page.
            continue

        extracted_data = extract_prices_staples(text, site)

        if extracted_data:
            old = extracted_data.get('old_price')
            current = extracted_data.get('current_price')
            price = [current, old] if old and current else [current]
            extracted_data_list.append(make_record(price))
        else:
            extracted_data_list.append(make_record(None))

    return extracted_data_list


In [None]:
# Staples / smartphones: extract the prices for every archived page and save them.
df = pd.read_csv('data/smartphones_arquivo.csv')
result_list = process_row_staples(df)
# Name the columns in the constructor: assigning seven labels afterwards raises
# ValueError when result_list is empty (no Staples rows in the input), whereas
# this yields a header-only table in that case.
df_result = pd.DataFrame(
    result_list,
    columns=['site', 'date', 'title', 'extractedData', 'linkToArchive', 'linkToExtractedText', 'snippet'],
)
df_result.to_csv('data/products_extracted_prices_STAPLES.csv', index=False)

1. 649,00€
1. 649,00€
1. 633,33
1. 779,67
1. 820,33
1. 673,98
1. 16,25
1. 16,25


No products were found for the Staples Black Friday campaign, so no product prices were extracted for it.

# PcDiga Product Prices

In [17]:

def extract_prices_pcdiga(text, site):
    """Extract price information from the HTML of a PcDiga page.

    Several page layouts are tried in order; a numbered debug line is
    printed for each match:

    1. ``div.product_special_container_fixer`` cards (listing); each card's
       name comes from ``div.product-card--title`` and its price from
       ``div.value--current-price > span.price``.
    2. ``span.new-price-label`` (single price).
    3. ``div.value--current-price`` with a ``span.price`` (single price).
    4. ``li`` elements with class ``item last`` (listing), using
       ``h2.product-name`` and ``span.price``.

    Args:
        text: HTML markup of the page.
        site: Host name of the shop. Not used by the parser.

    Returns:
        * layouts 1 and 4: a list of
          ``{'product_name': ..., 'current_price': ...}`` (layout 1 may
          return an empty list when no card has both fields);
        * layouts 2 and 3: ``{'current_price': ...}``;
        * ``None`` when nothing was found.
    """
    soup = BeautifulSoup(text, 'html.parser')

    # Layout 1: product cards.
    product_cards = soup.find_all("div", class_="product_special_container_fixer")
    if product_cards:
        products = []
        for card in product_cards:
            name_tag = card.find("div", class_="product-card--title")
            name = name_tag.get_text(strip=True) if name_tag else None

            # A card may lack the price wrapper; look it up in two steps so a
            # single incomplete card does not abort the whole page.
            price_wrapper = card.find("div", class_="value--current-price")
            price_tag = price_wrapper.find("span", class_="price") if price_wrapper else None
            price = price_tag.get_text(strip=True) if price_tag else None
            print('1.', name, price)
            if name and price:
                products.append({
                    'product_name': name,
                    'current_price': price
                })

        return products

    # Layout 2: dedicated price label.
    price_label = soup.find("span", class_="new-price-label")
    if price_label:
        current_price = price_label.get_text(strip=True)

        if current_price:
            print('2.', current_price)
            return {'current_price': current_price}

    # Layout 3: price wrapper on a single-product page. Only report a match
    # when a price was actually read, like the other layouts do.
    price_container = soup.find("div", class_="value--current-price")
    if price_container:
        span = price_container.find("span", class_="price")
        current_price = span.get_text(strip=True) if span else None
        if current_price:
            print('3.', current_price)
            return {'current_price': current_price}

    # Layout 4: list items.
    products = []
    product_items = soup.find_all("li", class_="item last")
    for item in product_items:
        name_tag = item.find("h2", class_="product-name")
        price_tag = item.find("span", class_="price")

        product_name = name_tag.get_text(strip=True) if name_tag else None
        current_price = price_tag.get_text(strip=True) if price_tag else None
        print('4.', product_name, current_price)
        if product_name and current_price:
            products.append({
                'product_name': product_name,
                'current_price': current_price
            })

    return products if products else None



def process_row_pcdiga(df):
    """Download every PcDiga page listed in ``df`` and collect its prices.

    Rows whose ``site`` is not ``www.pcdiga.com`` are ignored. For the others
    the archive link is rewritten so that ``mp_`` precedes the
    ``/https://<site>/`` part, the page is fetched and its HTML is passed to
    ``extract_prices_pcdiga``.

    Args:
        df: DataFrame with the columns ``linkToArchive``, ``site``,
            ``tstamp``, ``title``, ``snippet`` and ``linkToExtractedText``.

    Returns:
        A list of dicts with the keys ``site``, ``date`` (first 8 characters
        of ``tstamp``), ``title``, ``extractedData``, ``linkToArchive``,
        ``linkToExtractedText`` and ``snippet``. ``extractedData`` is
        ``[current]``, ``[current, old]`` or ``None`` (download failed or no
        price found). Listing pages contribute one dict per product, titled
        with the product name; a failed download contributes exactly one
        dict.
    """
    extracted_data_list = []

    def price_list(entry):
        # [current, old] when both are present, otherwise just [current].
        old = entry.get('old_price')
        current = entry.get('current_price')
        return [current, old] if old and current else [current]

    for index, row in df.iterrows():
        link = row['linkToArchive']
        site = row['site']
        link = link.replace(f'/https://{site}/', f'mp_/https://{site}/')
        date = str(row['tstamp'])[:8]
        title = row['title']
        snippet = row['snippet']

        if site != 'www.pcdiga.com':
            continue

        def make_record(record_title, prices):
            return {
                'site': site,
                'date': date,
                'title': record_title,
                'extractedData': prices,
                'linkToArchive': link,
                'linkToExtractedText': row['linkToExtractedText'],
                'snippet': snippet
            }

        try:
            response = requests.get(link)
            response.raise_for_status()
            text = response.text
        except requests.RequestException:
            print(f"Error while fetching {link}")
            extracted_data_list.append(make_record(title, None))
            # Nothing to parse for this row: without this, ``text`` would be
            # undefined (first row) or still hold the previous row's page.
            continue

        extracted_data = extract_prices_pcdiga(text, site)

        if not extracted_data:
            extracted_data_list.append(make_record(title, None))
        elif isinstance(extracted_data, list):
            for product in extracted_data:
                extracted_data_list.append(
                    make_record(product.get('product_name'), price_list(product))
                )
        else:
            extracted_data_list.append(make_record(title, price_list(extracted_data)))

    return extracted_data_list


In [None]:
# PcDiga / smartphones: extract the prices for every archived page and save them.
df = pd.read_csv('data/smartphones_arquivo.csv')
result_list = process_row_pcdiga(df)
# Name the columns in the constructor: assigning seven labels afterwards raises
# ValueError when result_list is empty (no PcDiga rows in the input), whereas
# this yields a header-only table in that case.
df_result = pd.DataFrame(
    result_list,
    columns=['site', 'date', 'title', 'extractedData', 'linkToArchive', 'linkToExtractedText', 'snippet'],
)
df_result.to_csv('data/products_extracted_prices_PcDiga.csv', index=False)

2. 39,90€
2. 44,90€
2. 1 019,00€
2. 1 019,00€
3. 929,90€
3. 929,90€
2. 469,90€
2. 649,00€
2. 619,00€
2. 619,00€
2. 1 199,00€
2. 1 349,00€
2. 1 049,00€
2. 799,00€
2. 999,00€
2. 1 349,00€
2. 1 179,00€
2. 1 349,00€
2. 1 349,00€
2. 1 349,00€
2. 1 179,00€
2. 1 349,00€
2. 1 049,00€
2. 799,00€
2. 729,00€
2. 1 049,00€
3. 409,90€
3. 459,90€
4. Consola Microsoft Xbox One 500 GB (5C5-00013) 239,90 €
4. Microsoft Xbox One 1TB Rainbow Six Siege 269,90 €
4. Consola Microsoft Xbox One S 500GB + Battlefield 1 289,90 €
4. Microsoft Xbox One 500GB + Quantum Break 299,90 €
4. Consola Microsoft Xbox One S 500GB + Fifa 17 299,90 €
4. Microsoft Xbox One 1TB Tom Clancy's The Division 349,90 €
4. Consola Microsoft Xbox One 500GB+Kinect+Assassin's Creed 389,90 €
4. Consola Microsoft Xbox One 1TB Call of Duty: Advanced Warfare 489,90 €
4. Microsoft Xbox One 1TB Rainbow Six Siege 269,90 €
4. Microsoft Xbox One 500GB + Quantum Break 299,90 €
4. Microsoft Xbox One 1TB Tom Clancy's The Division 349,90 €
4. Consola 

In [None]:
# PcDiga / Black Friday: extract the prices for every archived page and save them.
df = pd.read_csv('data/resultados_black_friday_arquivo.csv')
result_list = process_row_pcdiga(df)
# Name the columns in the constructor: assigning seven labels afterwards raises
# ValueError when result_list is empty (no PcDiga rows in the input), whereas
# this yields a header-only table in that case.
df_result = pd.DataFrame(
    result_list,
    columns=['site', 'date', 'title', 'extractedData', 'linkToArchive', 'linkToExtractedText', 'snippet'],
)
df_result.to_csv('data/products_extracted_prices_PcDiga_Black_Friday.csv', index=False)

1. Smartphone Oppo A15 6.52" 3GB/32GB Dual SIM Mystery Blue 169,90€
1. Smartphone Oppo A15 6.52" 3GB/32GB Dual SIM Dynamic Black 169,90€
1. Smartphone OnePlus Nord N100 6.52" 4GB/64GB Dual SIM Midnight Frost 199,00€
1. Smartphone TCL 10 Plus 6.47" 6GB/256GB Dual SIM Azul 399,90€
1. Smartphone TCL 10 Plus 6.47" 6GB/256GB Dual SIM Cinzento 399,90€
1. Película de Protecção 3MK Antimicrobiana Silver Protection+ Xiaomi Redmi Note 9 11,90€
1. Smartphone Huawei Mate 40 Pro 6.76" 8GB/256GB Dual SIM Prateado 1249,00€
1. Smartphone Oppo A53 6.5" 4GB/64GB Dual SIM Mint Cream 189,00€
1. Smartphone Oppo A53 6.5" 4GB/64GB Dual SIM Electric Black 189,00€
1. Smartphone Oppo A53s 6.5" 4GB/128GB Dual SIM Fancy Blue 219,00€
1. Smartphone Oppo A53s 6.5" 4GB/128GB Dual SIM Electric Black 219,00€
1. Smartphone Oppo Reno 4 Z 5G 6.57" 8GB/128GB Dual SIM Dew White 399,00€
1. Smartphone Oppo Reno 4 Z 5G 6.57" 8GB/128GB Dual SIM Ink Black 399,00€
1. Smartphone Oppo Reno 4 5G 6.43" 8GB/128GB Dual SIM Space Black 

# Radio Popular Product Prices

In [None]:

def extract_prices_radiopopular(text, site):
    """Read the current (and, if shown, old) price from a Rádio Popular page.

    The current price is the text of the ``div`` with class
    ``price notranslate``; the old price, when present, is the text of the
    ``div`` with class ``old-price notranslate``. Prints a debug line
    prefixed with ``1.`` when the current-price element exists.

    Args:
        text: HTML markup of the page.
        site: Host name of the shop. Not used by the parser.

    Returns:
        ``{'old_price': ..., 'current_price': ...}`` (``old_price`` may be
        None) or ``None`` when the current-price element is missing.
    """
    soup = BeautifulSoup(text, 'html.parser')

    previous_div = soup.find("div", class_="old-price notranslate")
    current_div = soup.find("div", class_="price notranslate")

    if not current_div:
        return None

    old_price = None
    if previous_div:
        old_price = previous_div.get_text(strip=True)

    current_price = current_div.get_text(strip=True)
    print('1.', old_price, current_price)
    return {'old_price': old_price, 'current_price': current_price}


def process_row_radiopopular(df):
    """Download every Rádio Popular page listed in ``df`` and collect its prices.

    Rows whose ``site`` is not ``www.radiopopular.pt`` are ignored. For the
    others the archive link is rewritten so that ``mp_`` precedes the
    ``/https://<site>/`` part, the page is fetched and its HTML is passed to
    ``extract_prices_radiopopular``.

    Args:
        df: DataFrame with the columns ``linkToArchive``, ``site``,
            ``tstamp``, ``title``, ``snippet`` and ``linkToExtractedText``.

    Returns:
        A list of dicts with the keys ``site``, ``date`` (first 8 characters
        of ``tstamp``), ``title``, ``extractedData``, ``linkToArchive``,
        ``linkToExtractedText`` and ``snippet``. ``extractedData`` is
        ``[current]``, ``[current, old]`` or ``None`` (download failed or no
        price found). A list result from the extractor contributes one dict
        per product; a failed download contributes exactly one dict.
    """
    extracted_data_list = []

    def price_list(entry):
        # [current, old] when both are present, otherwise just [current].
        old = entry.get('old_price')
        current = entry.get('current_price')
        return [current, old] if old and current else [current]

    for index, row in df.iterrows():
        link = row['linkToArchive']
        site = row['site']
        link = link.replace(f'/https://{site}/', f'mp_/https://{site}/')
        date = str(row['tstamp'])[:8]
        title = row['title']
        snippet = row['snippet']

        if site != 'www.radiopopular.pt':
            continue

        def make_record(record_title, prices):
            return {
                'site': site,
                'date': date,
                'title': record_title,
                'extractedData': prices,
                'linkToArchive': link,
                'linkToExtractedText': row['linkToExtractedText'],
                'snippet': snippet
            }

        try:
            response = requests.get(link)
            response.raise_for_status()
            text = response.text
        except requests.RequestException:
            print(f"Error while fetching {link}")
            extracted_data_list.append(make_record(title, None))
            # Nothing to parse for this row: without this, ``text`` would be
            # undefined (first row) or still hold the previous row's page.
            continue

        extracted_data = extract_prices_radiopopular(text, site)

        if not extracted_data:
            extracted_data_list.append(make_record(title, None))
        elif isinstance(extracted_data, list):
            for product in extracted_data:
                extracted_data_list.append(
                    make_record(product.get('product_name'), price_list(product))
                )
        else:
            extracted_data_list.append(make_record(title, price_list(extracted_data)))

    return extracted_data_list


In [None]:
# Rádio Popular / Black Friday: extract the prices for every archived page and save them.
df = pd.read_csv('data/resultados_black_friday_arquivo.csv')
result_list = process_row_radiopopular(df)
# Name the columns in the constructor: assigning seven labels afterwards raises
# ValueError when result_list is empty (no Rádio Popular rows in the input),
# whereas this yields a header-only table in that case.
df_result = pd.DataFrame(
    result_list,
    columns=['site', 'date', 'title', 'extractedData', 'linkToArchive', 'linkToExtractedText', 'snippet'],
)
df_result.to_csv('data/products_extracted_prices_RadioPopular_Black_Friday.csv', index=False)

1. 719,99 666,99
1. 1.129,00 846,75
1. 1.129,00 846,75
1. 99,00 22,00
