# Digital Shift: The Evolution of Products and Platforms in Portuguese E-commerce



### Data Extraction:



In [1]:
# Retailer domains whose archived homepages are collected below
sites = [
    "www.fnac.pt",
    "www.worten.pt",
    "www.elcorteingles.pt",
    "www.radiopopular.pt",
    "www.staples.pt",
    "www.pcdiga.com",
]

In [2]:
import urllib.parse
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Helper to query the Arquivo.pt text-search endpoint
def arquivo_search(query=None, max_items=500, from_year=None, to_year=None, site=None, doc_type=None, version_history_url=None):
    """Call ``https://arquivo.pt/textsearch`` and return the decoded JSON body.

    Parameters
    ----------
    query : str, optional
        Search terms sent as ``q``. Required unless ``version_history_url``
        is given.
    max_items : int
        Value sent as ``maxItems``.
    from_year, to_year : optional
        Sent as ``from`` / ``to``; only added when BOTH are provided.
    site : str, optional
        Sent as ``siteSearch``.
    doc_type : str, optional
        Sent as ``type``.
    version_history_url : str, optional
        When given, sent as ``versionHistory`` and ``query`` is ignored.

    Returns
    -------
    dict or None
        The parsed JSON on HTTP 200; ``None`` when the request fails, the
        status is not 200, or the body is not valid JSON (a message is printed).

    Raises
    ------
    ValueError
        If neither ``query`` nor ``version_history_url`` is supplied.
    """
    if version_history_url:
        encoded_url = urllib.parse.quote(version_history_url, safe='')
        base_url = f"https://arquivo.pt/textsearch?versionHistory={encoded_url}"
    elif query is not None:
        base_url = "https://arquivo.pt/textsearch?q=" + urllib.parse.quote(query)
    else:
        # Previously this fell through to quote(None) and failed with an
        # unrelated TypeError; fail with an explicit message instead.
        raise ValueError("arquivo_search requires either 'query' or 'version_history_url'")

    if from_year and to_year:
        base_url += f"&from={from_year}&to={to_year}"

    if site:
        # Percent-encode so characters such as '&' or '/' cannot break the query string
        base_url += f"&siteSearch={urllib.parse.quote(str(site), safe='')}"

    if doc_type:
        base_url += f"&type={urllib.parse.quote(str(doc_type), safe='')}"

    base_url += f"&maxItems={max_items}&prettyPrint=false"

    try:
        # Without a timeout a stalled connection would block the notebook forever
        response = requests.get(base_url, timeout=60)
    except requests.RequestException as exc:
        print(f"Erro na requisição: {exc}")
        return None

    if response.status_code != 200:
        print(f"Erro na requisição: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as exc:
        # A 200 response whose body is not JSON is treated like any other failure
        print(f"Erro na requisição: resposta JSON inválida ({exc})")
        return None

Extracting website links from Arquivo.pt (2005 to 2023) for the following retailers:
Worten, Fnac, Rádio Popular, El Corte Inglés, PCDiga, Staples

In [4]:
# Collect each site's archived homepage, one capture per year (2005 to 2023 by default)
def process_sites(sites, start_year=2005, end_year=2023):
    """Find one archived homepage link per site and per year.

    For every site and every year in ``start_year..end_year`` (inclusive) the
    version history of ``http://<site>/`` is requested through
    ``arquivo_search`` and the first returned item whose timestamp year equals
    the requested year and whose ``originalURL`` is exactly that root URL is kept.

    Parameters
    ----------
    sites : iterable of str
        Host names, e.g. ``"www.fnac.pt"``.
    start_year, end_year : int
        Inclusive year range; the defaults reproduce the original 2005-2023 span.

    Returns
    -------
    dict
        ``{site: {year: linkToArchive or None}}``; ``None`` marks years with
        no matching capture.
    """
    site_data = {}
    years = range(start_year, end_year + 1)

    for site in sites:
        print(f"Processando site: {site}")
        site_links_by_year = {year: None for year in years}
        # Root URL whose version history is queried
        version_history_url = f"http://{site}/"

        for year in years:
            response = arquivo_search(version_history_url=version_history_url, from_year=year, to_year=year + 1)
            if not response or 'response_items' not in response:
                continue

            for item in response['response_items']:
                item_year = int(item['tstamp'][0:4])  # the first 4 characters are the year

                if item_year == year and item['originalURL'] == version_history_url:
                    # Only the first match is kept, so stop scanning right away
                    site_links_by_year[year] = item['linkToArchive']
                    break

        site_data[site] = site_links_by_year

    return site_data



In [5]:
# Collect the archived homepage link of every retailer for each year
dados_sites = process_sites(sites)

# Dump the resulting {site: {year: link}} mapping for a quick visual check
print(dados_sites)

Processando site: www.fnac.pt
Processando site: www.worten.pt
Processando site: www.elcorteingles.pt
Processando site: www.radiopopular.pt
Processando site: www.staples.pt
Processando site: www.pcdiga.com
{'www.fnac.pt': {2005: None, 2006: 'https://arquivo.pt/wayback/20061118120805/http://www.fnac.pt/', 2007: 'https://arquivo.pt/wayback/20070928223117/http://www.fnac.pt/', 2008: 'https://arquivo.pt/wayback/20081027081756/http://www.fnac.pt/', 2009: 'https://arquivo.pt/wayback/20091218064527/http://www.fnac.pt/', 2010: 'https://arquivo.pt/wayback/20100804062306/http://www.fnac.pt/', 2011: 'https://arquivo.pt/wayback/20110702090458/http://www.fnac.pt/', 2012: 'https://arquivo.pt/wayback/20120122102914/http://www.fnac.pt/', 2013: 'https://arquivo.pt/wayback/20131106231750/http://www.fnac.pt/', 2014: 'https://arquivo.pt/wayback/20141127075233/http://www.fnac.pt/', 2015: 'https://arquivo.pt/wayback/20151124075844/http://www.fnac.pt/', 2016: 'https://arquivo.pt/wayback/20161106090812/http://

In [6]:
# Save the collected links to a CSV file (rows = years, columns = sites).
# The output folder is created first so the cell also works on a fresh
# checkout where "data/" does not exist yet.
Path("data").mkdir(parents=True, exist_ok=True)
df = pd.DataFrame(dados_sites)
df.to_csv("data/sites_links.csv")


 ## Extracting product categories from the websites (2007 | 2010 | 2015 | 2020 | 2023)

In [19]:
# Create the results file with its header row only.
# It must live under "data/": every extraction cell below reads and appends
# to "data/ecommerce_category_analysis_all.csv".
# NOTE: re-running this cell overwrites the file and discards collected rows.
header = ['ano', 'link', 'site', 'numero_categorias', 'lista_categorias', 'dicionario_subcategorias']
header_df = pd.DataFrame(columns=header)
header_df.to_csv("data/ecommerce_category_analysis_all.csv", index=False)


WORTEN

In [None]:
def extractor_2007_worten(url):
    """Extract the main categories from the 2007 Worten page layout.

    Fetches ``url``, locates ``<ul class="menu">`` and uses the text of the
    first link inside every ``<li>`` as a category name. This layout has no
    subcategories, so each category maps to an empty list.

    Parameters
    ----------
    url : str
        Address of the page to scrape.

    Returns
    -------
    tuple
        ``(number_of_categories, {category_name: []})``; ``(0, {})`` when the
        page cannot be fetched or parsed (the error is printed).
    """
    try:
        # Timeout so a stalled connection cannot hang the notebook
        request = requests.get(url, timeout=60)
        request.raise_for_status()

        soup = BeautifulSoup(request.text, 'html.parser')

        categories_container = soup.find('ul', class_='menu')

        categories_dict = {}

        for category in categories_container.find_all('li'):
            anchor = category.find('a')
            if anchor is None:
                # An entry without a link has no name; skip it instead of
                # discarding every category found on the page.
                continue
            categories_dict[anchor.get_text(strip=True)] = []

        return len(categories_dict), categories_dict
    except Exception as e:
        # Best-effort contract kept, but the reason is no longer hidden
        print(f"Error while extracting categories: {e}")
        return 0, {}
    
category_data = []

# sites_links.csv: first column ('Unnamed: 0') is the year, the others are one column per site
data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]
        if pd.isna(link) or site_column != 'www.worten.pt':
            continue

        # Insert 'mp_' after the capture timestamp and point at the page scraped for this layout
        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        if year <= 2012:
            link = link + 'Splash.aspx'

        # The CSV stores the rewritten link, so the duplicate check has to use
        # it too; comparing the raw link never matched and re-runs appended
        # the same row again.
        if link in category_analysis['link'].values:
            continue

        if year == 2007:
            num_categories, category_dict = extractor_2007_worten(link)

            category_data.append({
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            })

# Append the collected rows to the CSV
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [23]:
def get_categories_2010_worten(url):
    """Extract categories and subcategories from the 2010 Worten page layout.

    Main categories are the ``<li class="sub">`` items inside
    ``<ul class="menu menu-category">``; each is named by its
    ``<a class="label">``. Subcategories are the ``<a class="label">`` texts
    of the ``<li>`` items in the first nested ``<ul>``.

    Parameters
    ----------
    url : str
        Address of the page to scrape.

    Returns
    -------
    tuple
        ``(number_of_categories, {category: [subcategory, ...]})``;
        ``(0, {})`` when the page cannot be fetched or parsed (error printed).
    """
    try:
        # Timeout so a stalled connection cannot hang the notebook
        request = requests.get(url, timeout=60)
        request.raise_for_status()

        soup = BeautifulSoup(request.text, 'html.parser')

        categories_container = soup.find('ul', class_='menu menu-category')

        categories_dict = {}

        for category in categories_container.find_all('li', class_='sub'):
            main_label = category.find('a', class_='label')
            if main_label is None:
                # No label means no name; skip this item only
                continue

            subcategories = []
            subcategories_container = category.find('ul')
            if subcategories_container:
                for subcategory in subcategories_container.find_all('li'):
                    sub_label = subcategory.find('a', class_='label')
                    if sub_label is not None:
                        subcategories.append(sub_label.get_text(strip=True))

            categories_dict[main_label.get_text(strip=True)] = subcategories

        return len(categories_dict), categories_dict
    except Exception as e:
        # Best-effort contract kept, but the reason is no longer hidden
        print(f"Error while extracting categories: {e}")
        return 0, {}
        
category_data = []

# sites_links.csv: first column ('Unnamed: 0') is the year, the others are one column per site
data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]
        if pd.isna(link) or site_column != 'www.worten.pt':
            continue

        # Insert 'mp_' after the capture timestamp and point at default.aspx
        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        link = link + 'default.aspx'

        # The CSV stores the rewritten link, so the duplicate check has to use
        # it too; comparing the raw link never matched and re-runs appended
        # the same row again.
        if link in category_analysis['link'].values:
            continue

        if year == 2010:
            num_categories, category_dict = get_categories_2010_worten(link)
            if num_categories > 0:
                category_data.append({
                    'ano': year,
                    'link': link,
                    'site': site_column,
                    'numero_categorias': num_categories,
                    'lista_categorias': list(category_dict.keys()),
                    'dicionario_subcategorias': category_dict
                })

# Append the collected rows to the CSV
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [32]:
def get_categories_2015_worten(url):
    """Extract categories and subcategories from the 2015 Worten page layout.

    Main categories are the ``<li class="level1">`` items inside
    ``<ul id="nav">``, named by the ``<span>`` inside their first link.
    Subcategories are the ``<span>`` texts of the ``<li class="level2">``
    items in the first nested ``<ul>``. Categories that end up with no
    subcategories are dropped from the result.

    Parameters
    ----------
    url : str
        Address of the page to scrape.

    Returns
    -------
    tuple
        ``(number_of_categories, {category: [subcategory, ...]})``;
        ``(0, {})`` when the page cannot be fetched or parsed (error printed).
    """
    try:
        # Timeout so a stalled connection cannot hang the notebook
        request = requests.get(url, timeout=60)
        request.raise_for_status()

        soup = BeautifulSoup(request.text, 'html.parser')

        categories_container = soup.find('ul', id='nav')

        categories_dict = {}

        for category in categories_container.find_all('li', class_='level1'):
            anchor = category.find('a')
            name_span = anchor.find('span') if anchor is not None else None
            if name_span is None:
                # No readable name; skip this item only
                continue

            subcategories = []
            subcategories_container = category.find('ul')
            if subcategories_container:
                for subcategory in subcategories_container.find_all('li', class_='level2'):
                    sub_span = subcategory.find('span')
                    if sub_span is not None:
                        subcategories.append(sub_span.get_text(strip=True))

            categories_dict[name_span.get_text(strip=True)] = subcategories

        # Entries left with an empty list have no subcategories: drop them
        categories_dict = {name: subs for name, subs in categories_dict.items() if subs}

        return len(categories_dict), categories_dict
    except Exception as e:
        # Best-effort contract kept, but the reason is no longer hidden
        print(f"Error while extracting categories: {e}")
        return 0, {}
        
category_data = []

# sites_links.csv: first column ('Unnamed: 0') is the year, the others are one column per site
data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]
        if pd.isna(link) or site_column != 'www.worten.pt':
            continue

        # Years already stored for THIS site. The previous test
        # ("year not in ano OR site not in site") looked at the two columns
        # independently, so a 2015 row of any other retailer was enough to
        # skip Worten 2015.
        years_done = category_analysis[category_analysis['site'] == site_column]['ano'].values
        if year in years_done:
            continue

        # Insert 'mp_' after the capture timestamp
        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        if year == 2015:
            num_categories, category_dict = get_categories_2015_worten(link)
            if num_categories > 0:
                category_data.append({
                    'ano': year,
                    'link': link,
                    'site': site_column,
                    'numero_categorias': num_categories,
                    'lista_categorias': list(category_dict.keys()),
                    'dicionario_subcategorias': category_dict
                })

# Append the collected rows to the CSV
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)




In [86]:
def get_categories_worten_2020_23(url):
    """Extract categories and subcategories from the 2020/2023 Worten layout.

    Main categories are the ``<li class="nav-item nav-item-sub">`` items
    inside ``<ul class="nav-sub js-nav-sub">``, named by their
    ``<span class="nav-a">``. Subcategory names come from the
    ``<a class="nav-a">`` found inside the
    ``<label class="nav-trigger js-nav-trigger">`` of each nested
    ``<li class="nav-item-sub">`` and are lower-cased.

    Parameters
    ----------
    url : str
        Address of the page to scrape.

    Returns
    -------
    tuple
        ``(number_of_categories, {category: [subcategory, ...]})``;
        ``(0, {})`` when the page cannot be fetched or parsed (error printed).
    """
    try:
        # Timeout so a stalled connection cannot hang the notebook
        request = requests.get(url, timeout=60)
        request.raise_for_status()

        soup = BeautifulSoup(request.text, 'html.parser')

        categories_container = soup.find('ul', class_='nav-sub js-nav-sub')

        categories_dict = {}

        for category in categories_container.find_all('li', class_='nav-item nav-item-sub'):
            name_span = category.find('span', class_='nav-a')
            if name_span is None:
                # No readable name; skip this item only
                continue

            subcategories = []
            subcategories_container = category.find('ul', class_='nav nav-sub nav-sub-child js-nav-sub')
            # A category without a nested menu simply has no subcategories;
            # previously this raised and the whole page was reported as empty.
            if subcategories_container is not None:
                for sub in subcategories_container.find_all('li', class_='nav-item-sub'):
                    label = sub.find('label', class_='nav-trigger js-nav-trigger')
                    if label:
                        subcategory_name = label.find('a', class_='nav-a')
                        if subcategory_name:
                            subcategories.append(subcategory_name.get_text(strip=True).lower())

            categories_dict[name_span.get_text(strip=True)] = subcategories

        return len(categories_dict), categories_dict
    except Exception as e:
        # Best-effort contract kept, but the reason is no longer hidden
        print(f"Error while extracting categories: {e}")
        return 0, {}

        
category_data = []

# sites_links.csv: first column ('Unnamed: 0') is the year, the others are one column per site
data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]
        if pd.isna(link) or site_column != 'www.worten.pt':
            continue

        # Years already stored for THIS site. The previous test looked at the
        # 'ano' column of every retailer, so a 2020/2023 row from another
        # site was enough to skip Worten.
        years_done = category_analysis[category_analysis['site'] == site_column]['ano'].values
        if year in years_done:
            continue

        # Insert 'mp_' after the capture timestamp
        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        if year == 2020 or year == 2023:
            num_categories, category_dict = get_categories_worten_2020_23(link)
            if num_categories > 0:
                category_data.append({
                    'ano': year,
                    'link': link,
                    'site': site_column,
                    'numero_categorias': num_categories,
                    'lista_categorias': list(category_dict.keys()),
                    'dicionario_subcategorias': category_dict
                })

# Append the collected rows to the CSV
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)




Staples Extracted Categories:

In [2]:
def get_categories_staples_2007_10(url):
    """Scrape categories and subcategories from the 2007/2010 Staples layout.

    The category list lives in the first ``<span>`` whose ``id`` contains
    "categorias" (case-insensitive). Inside it, every
    ``<div class="tracos_centro">`` with a link is either a main category
    (link carries the class ``tit_centro_blue_bold``) or a subcategory of the
    most recent main category.

    Returns ``(count, {category: [subcategories]})``, or ``(0, {})`` after
    printing the error when anything fails.
    """
    def _id_mentions_categorias(value):
        # Same predicate as before: falsy ids never match
        return value and 'categorias' in value.lower()

    try:
        page = requests.get(url)
        page.raise_for_status()

        parsed = BeautifulSoup(page.text, 'html.parser')
        container = parsed.find('span', id=_id_mentions_categorias)

        result = {}
        active_main = None

        for block in container.find_all('div', class_='tracos_centro'):
            anchor = block.find('a')
            if not anchor:
                continue

            label = anchor.get_text(strip=True)
            is_main = 'tit_centro_blue_bold' in anchor.get('class', [])

            if is_main:
                # Start a new main category
                active_main = label
                result[active_main] = []
            elif active_main:
                # Attach to the main category seen last
                result[active_main].append(label)

        return len(result), result
    except Exception as err:
        print(f"Error while extracting categories: {err}")
        return 0, {}


category_data = []

# Link table (year rows x site columns) and the rows collected so far
data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]

        # Only Staples captures that actually exist are of interest
        if pd.isna(link) or site_column != 'www.staples.pt':
            continue
        # Skip years this site already has in the results file
        if year in category_analysis[category_analysis['site'] == site_column]['ano'].values:
            continue

        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        link += 'default.aspx'

        if year in (2007, 2010):
            num_categories, category_dict = get_categories_staples_2007_10(link)

            record = {
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            }
            category_data.append(record)

# Append the collected rows to the CSV
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [11]:
def get_categories_staples_15(url):
    """Scrape the main categories from the 2015 Staples layout.

    Every ``<span class="navItem">`` inside ``<div class="primaryNav">``
    contributes one category, named by its ``<a class="navLink">``. No
    subcategories are collected, so each category maps to an empty list.

    Returns ``(count, {category: []})``, or ``(0, {})`` after printing the
    error when anything fails.
    """
    try:
        page = requests.get(url)
        page.raise_for_status()

        parsed = BeautifulSoup(page.text, 'html.parser')
        # Top navigation bar holding the category links
        nav_bar = parsed.find('div', class_='primaryNav')

        labels = [
            item.find('a', class_='navLink').get_text(strip=True)
            for item in nav_bar.find_all('span', class_='navItem')
        ]
        result = {label: [] for label in labels}

        return len(result), result
    except Exception as err:
        print(f"Error while extracting categories: {err}")
        return 0, {}

category_data = []

# Link table (year rows x site columns) and the rows collected so far
data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]

        # Only Staples captures that actually exist are of interest
        if pd.isna(link) or site_column != 'www.staples.pt':
            continue
        # Skip years this site already has in the results file
        if year in category_analysis[category_analysis['site'] == site_column]['ano'].values:
            continue

        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')

        if year == 2015:
            num_categories, category_dict = get_categories_staples_15(link)

            record = {
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            }
            category_data.append(record)

# Append the collected rows to the CSV
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [26]:
def get_categories_staples_20(url):
    """Scrape the main categories from the 2020 Staples layout.

    Reads ``<div class="primaryNav">`` and records the text of the
    ``<a class="navLink">`` of each ``<span class="navItem">`` as a category
    with an empty subcategory list.

    Returns ``(count, {category: []})``, or ``(0, {})`` after printing the
    error when anything fails.
    """
    try:
        page = requests.get(url)
        page.raise_for_status()

        nav_bar = BeautifulSoup(page.text, 'html.parser').find('div', class_='primaryNav')

        result = {}
        for nav_item in nav_bar.find_all('span', class_='navItem'):
            nav_link = nav_item.find('a', class_='navLink')
            # No subcategories are collected for this layout
            result[nav_link.get_text(strip=True)] = []

        return len(result), result
    except Exception as err:
        print(f"Error while extracting categories: {err}")
        return 0, {}

category_data = []

# Link table (year rows x site columns) and the rows collected so far
data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]

        # Only Staples captures that actually exist are of interest
        if pd.isna(link) or site_column != 'www.staples.pt':
            continue
        # Skip years this site already has in the results file
        if year in category_analysis[category_analysis['site'] == site_column]['ano'].values:
            continue

        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')

        if year == 2020:
            num_categories, category_dict = get_categories_staples_20(link)

            record = {
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            }
            category_data.append(record)

# Append the collected rows to the CSV
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [23]:
def get_categories_staples_23(url):
    """Extract categories and subcategories from the 2023 Staples layout.

    Walks the ``<div class="children-cont">`` elements inside
    ``<div class="container-menu-children">``. An element holding a
    ``<span class="pr-name">`` starts a new main category; any other element
    contributes the text of its ``<a class="menu-link_a">`` as a subcategory
    of the most recent main category.

    Parameters
    ----------
    url : str
        Address of the page to scrape.

    Returns
    -------
    tuple
        ``(number_of_categories, {category: [subcategory, ...]})``;
        ``(0, {})`` when the page cannot be fetched or parsed (error printed).
    """
    try:
        # Timeout so a stalled connection cannot hang the notebook
        request = requests.get(url, timeout=60)
        request.raise_for_status()

        soup = BeautifulSoup(request.text, 'html.parser')
        # Container holding the category menu. The leftover debug print of
        # this element was removed: it dumped the menu HTML into the output.
        categories_container = soup.find('div', class_='container-menu-children')
        categories_dict = {}

        current_main_category = None
        for div in categories_container.find_all('div', class_='children-cont'):
            main_category_tag = div.find('span', class_='pr-name')
            if main_category_tag:
                current_main_category = main_category_tag.get_text(strip=True)
                categories_dict[current_main_category] = []
            elif current_main_category:
                subcategory_link = div.find('a', class_='menu-link_a')
                if subcategory_link:
                    categories_dict[current_main_category].append(subcategory_link.get_text(strip=True))

        return len(categories_dict), categories_dict
    except Exception as e:
        print(f"Error while extracting categories: {e}")
        return 0, {}

category_data = []

# Link table (year rows x site columns) and the rows collected so far
data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]

        # Only Staples captures that actually exist are of interest
        if pd.isna(link) or site_column != 'www.staples.pt':
            continue
        # Skip years this site already has in the results file
        if year in category_analysis[category_analysis['site'] == site_column]['ano'].values:
            continue

        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')

        if year == 2023:
            num_categories, category_dict = get_categories_staples_23(link)

            record = {
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            }
            category_data.append(record)

# Append the collected rows to the CSV
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

PcDiga Extracted Categories:
- Não é possível extrair categorias de 2007, então vamos extrair de 2008.
- Não consegui extrair categorias de 2023 (não conseguia aceder ao site); assumi que as diferenças de categorias entre 2020 e 2023 seriam mínimas.

In [42]:
def get_categories_pcdiga_08(url):
    """Extract the main categories from the 2008/2010 PcDiga page layout.

    Categories are read from the ``<area>`` elements of the image map
    ``<map name="Map">``: the name is whatever follows the last ``=`` in the
    area's ``href`` (e.g. ``...?Familia=name``). No subcategories are
    collected, so each category maps to an empty list.

    Parameters
    ----------
    url : str
        Address of the page to scrape.

    Returns
    -------
    tuple
        ``(number_of_categories, {category_name: []})``; ``(0, {})`` when the
        page cannot be fetched or parsed (error printed).
    """
    try:
        # Timeout so a stalled connection cannot hang the notebook
        request = requests.get(url, timeout=60)
        request.raise_for_status()

        soup = BeautifulSoup(request.text, 'html.parser')

        categories_container = soup.find('map', attrs={'name': 'Map'})

        categories_dict = {}

        for category in categories_container.find_all('area'):
            href = category.get('href')
            if not href:
                # An area without a target has no name to extract; skip it
                # instead of discarding every category found on the page.
                continue

            # Take the category name from the href (...?Familia=name)
            main_category_name = href.split('=')[-1]

            # Main categories have no subcategories in this layout
            categories_dict[main_category_name] = []

        return len(categories_dict), categories_dict
    except Exception as e:
        print(f"Error while extracting categories: {e}")
        return 0, {}

category_data = []

# sites_links.csv: first column ('Unnamed: 0') is the year, the others are one column per site
data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]
        if pd.isna(link) or site_column != 'www.pcdiga.com' or year not in (2008, 2010):
            continue

        # The 2008 capture is stored as the 2007 data point
        n_year = year - 1 if year == 2008 else year

        # Check the year that is actually WRITTEN to the CSV. Testing the
        # capture year (2008) never matched the stored 2007 row, so the 2008
        # capture was appended again on every run.
        years_done = category_analysis[category_analysis['site'] == site_column]['ano'].values
        if n_year in years_done:
            continue

        # Insert 'mp_' after the capture timestamp and point at the 'pcdiga/' page
        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        link += 'pcdiga/'

        num_categories, category_dict = get_categories_pcdiga_08(link)

        category_data.append({
            'ano': n_year,
            'link': link,
            'site': site_column,
            'numero_categorias': num_categories,
            'lista_categorias': list(category_dict.keys()),
            'dicionario_subcategorias': category_dict
        })

# Append the collected rows to the CSV
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)



In [None]:
def get_categories_pcdiga_15(url):
    """Scrape the main categories from the 2015 PcDiga layout.

    Each ``<table class="menu1">`` inside ``<div id="masterdiv">`` yields one
    category, named by its ``<a class="menu">``. No subcategories are
    collected, so each category maps to an empty list.

    Returns ``(count, {category: []})``, or ``(0, {})`` after printing the
    error when anything fails.
    """
    try:
        page = requests.get(url)
        page.raise_for_status()

        menu_root = BeautifulSoup(page.text, 'html.parser').find('div', id='masterdiv')

        found = {}
        for menu_table in menu_root.find_all('table', class_='menu1'):
            label = menu_table.find('a', class_='menu').get_text(strip=True)
            found[label] = []

        return len(found), found
    except Exception as err:
        print(f"Error while extracting categories: {err}")
        return 0, {}

category_data = []

# Link table (year rows x site columns) and the rows collected so far
data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]

        # Only PcDiga captures that actually exist are of interest
        if pd.isna(link) or site_column != 'www.pcdiga.com':
            continue
        # Skip years this site already has in the results file
        if year in category_analysis[category_analysis['site'] == site_column]['ano'].values:
            continue

        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')

        if year == 2015:
            num_categories, category_dict = get_categories_pcdiga_15(link)

            record = {
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            }
            category_data.append(record)

# Append the collected rows to the CSV
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)



In [25]:
def get_categories_pcdiga_20(url):
    """Scrape categories and subcategories from the 2020 PcDiga layout.

    Looks at every ``<li>`` of ``<ul class="megamenu">`` (inside
    ``<div class="megamenu-wrapper">``). An item counts as a main category
    only when it has both an ``<a class="i-link">`` and a
    ``<span class="svg-i svg-arrow">``; its name is the link's
    ``<span class="lnk-text">``. Subcategories are the first ``<span>`` of
    each ``<a class="i-link">`` found in the ``<li>`` items of the item's
    ``<div class="submenu">``.

    Returns ``(count, {category: [subcategories]})``, or ``(0, {})`` after
    printing the error when anything fails.
    """
    try:
        page = requests.get(url)
        page.raise_for_status()

        parsed = BeautifulSoup(page.text, 'html.parser')
        wrapper = parsed.find('div', class_='megamenu-wrapper')
        menu = wrapper.find('ul', class_='megamenu')

        found = {}
        for entry in menu.find_all('li'):
            top_link = entry.find('a', class_='i-link')
            arrow = entry.find('span', class_='svg-i svg-arrow')

            # Both the link and the arrow icon must be present
            if top_link and arrow:
                top_name = top_link.find('span', class_='lnk-text').get_text(strip=True)

                children = []
                submenu = entry.find('div', class_='submenu')
                if submenu:
                    for child in submenu.find_all('li'):
                        child_link = child.find('a', class_='i-link')
                        if not child_link:
                            continue
                        children.append(child_link.find('span').get_text(strip=True))

                found[top_name] = children

        return len(found), found
    except Exception as err:
        print(f"Error while extracting categories: {err}")
        return 0, {}

category_data = []

# Link table (year rows x site columns) and the rows collected so far
data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]

        # Only PcDiga captures that actually exist are of interest
        if pd.isna(link) or site_column != 'www.pcdiga.com':
            continue
        # Skip years this site already has in the results file
        if year in category_analysis[category_analysis['site'] == site_column]['ano'].values:
            continue

        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')

        if year == 2020:
            num_categories, category_dict = get_categories_pcdiga_20(link)

            record = {
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            }
            category_data.append(record)

# Append the collected rows to the CSV
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)



RadioPopular Extracted Categories:


In [43]:
def get_categories_2010_rp(url):
    """Extract the main categories from the 2007/2010 Rádio Popular layout.

    Every ``<td align="left">`` containing a link with an image contributes
    one category, named by the image's ``alt`` attribute. The entry named
    'Recrutamento' is removed. No subcategories are collected, so each
    category maps to an empty list.

    Parameters
    ----------
    url : str
        Address of the page to scrape.

    Returns
    -------
    tuple
        ``(number_of_categories, {category_name: []})``; ``(0, {})`` when the
        page cannot be fetched or parsed (error printed).
    """
    try:
        # Timeout so a stalled connection cannot hang the notebook
        request = requests.get(url, timeout=60)
        request.raise_for_status()

        soup = BeautifulSoup(request.text, 'html.parser')
        categories_dict = {}

        for category in soup.find_all('td', align='left'):
            a = category.find('a')
            if not a:
                continue
            img = a.find('img')
            if img is None:
                # A text-only link has no alt text to use as a name; skip it
                # instead of discarding every category found on the page.
                continue
            categories_dict[img.get('alt')] = []

        # 'Recrutamento' is excluded from the category list
        categories_dict.pop('Recrutamento', None)

        return len(categories_dict), categories_dict
    except Exception as e:
        # Best-effort contract kept, but the reason is no longer hidden
        print(f"Error while extracting categories: {e}")
        return 0, {}
        
category_data = []

# sites_links.csv: first column ('Unnamed: 0') is the year, the others are one column per site
data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]
        if pd.isna(link) or site_column != 'www.radiopopular.pt':
            continue

        # Insert 'mp_' after the capture timestamp
        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')

        # The CSV stores the rewritten link, so the duplicate check has to use
        # it too; comparing the raw link never matched and re-runs appended
        # the same row again.
        if link in category_analysis['link'].values:
            continue

        if year == 2010 or year == 2007:
            num_categories, category_dict = get_categories_2010_rp(link)
            if num_categories > 0:
                category_data.append({
                    'ano': year,
                    'link': link,
                    'site': site_column,
                    'numero_categorias': num_categories,
                    'lista_categorias': list(category_dict.keys()),
                    'dicionario_subcategorias': category_dict
                })

# Append the collected rows to the CSV
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [48]:
def get_categories_2015_rp(url, timeout=30):
    """Extract categories and subcategories from the ``div#nav`` menu.

    The notebook calls this for the www.radiopopular.pt snapshot of 2015.
    Every ``<li class="dir">`` inside ``<div id="nav">`` is read as a category
    whose name is the text of its first link; the ``<li class="dir">`` items of
    its first nested ``<ul>`` are its subcategories. Categories that end up
    without subcategories are dropped.

    Args:
        url: Address of the archived page to download.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            block the notebook indefinitely.

    Returns:
        A ``(count, categories)`` tuple where ``categories`` maps a category
        name to its list of subcategory names. ``(0, {})`` is returned when the
        menu is missing or when any error occurs (errors are printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Locate the container holding the main categories.
        nav = soup.find('div', id='nav')
        if nav is None:
            return 0, {}

        categories_dict = {}
        for entry in nav.find_all('li', class_='dir'):
            anchor = entry.find('a')
            if anchor is None:
                # An entry without a link has no name; skip it rather than
                # abort the whole extraction.
                continue

            subcategories = []
            submenu = entry.find('ul')
            if submenu is not None:
                for sub_entry in submenu.find_all('li', class_='dir'):
                    sub_anchor = sub_entry.find('a')
                    if sub_anchor is not None:
                        subcategories.append(sub_anchor.get_text(strip=True))

            categories_dict[anchor.get_text(strip=True)] = subcategories

        # Entries with no subcategories are not kept.
        categories_dict = {name: subs for name, subs in categories_dict.items() if subs}

        return len(categories_dict), categories_dict
    except Exception as e:
        # Best-effort scraping: report the failure instead of hiding it.
        print(f"Erro ao extrair categorias de {url}: {e}")
        return 0, {}
        
# Scrape the www.radiopopular.pt snapshot of 2015 and append the extracted
# categories to the shared analysis CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

# Links already recorded in the analysis file, used to avoid duplicate rows.
known_links = set(category_analysis['link'].dropna())

for index, row in data.iterrows():
    year = row['Unnamed: 0']  # first (unnamed) CSV column; compared against years below
    if year != 2015:
        continue

    for site_column in data.columns[1:]:
        if site_column != 'www.radiopopular.pt':
            continue
        original_link = row[site_column]
        if pd.isna(original_link):
            continue

        # Append "mp_" to the path segment that precedes the archived URL.
        link = original_link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        # Rows are stored with the rewritten link, so the duplicate check has to
        # consider it too; checking only the original link never matches and
        # re-running the cell would append the same row again.
        if original_link in known_links or link in known_links:
            continue

        num_categories, category_dict = get_categories_2015_rp(link)
        if num_categories > 0:
            category_data.append({
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            })
            known_links.add(link)

# Append the collected rows to the CSV; skip the write when nothing new was found.
category_data_df = pd.DataFrame(category_data)
if not category_data_df.empty:
    category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [None]:
def get_categories_2020_rp(url, timeout=30):
    """Extract categories and subcategories from the ``ul.categories`` menu.

    The notebook calls this for the www.radiopopular.pt snapshots of 2020 and
    2023. Each ``<li class="category link cb">`` is a category named after its
    first link; the ``<li class="subcategory family link">`` items inside its
    ``<div class="subcategories">`` are its subcategories. Categories without
    subcategories are kept with an empty list.

    Args:
        url: Address of the archived page to download.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            block the notebook indefinitely.

    Returns:
        A ``(count, categories)`` tuple where ``categories`` maps a category
        name to its list of subcategory names. ``(0, {})`` is returned when the
        menu is missing or when any error occurs (errors are printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Locate the list holding the main categories.
        category_list = soup.find('ul', class_='categories')
        if category_list is None:
            return 0, {}

        categories_dict = {}
        for item in category_list.find_all('li', class_='category link cb'):
            anchor = item.find('a')
            if anchor is None:
                # No link means no category name; skip the entry.
                continue

            subcategories = []
            panel = item.find('div', class_='subcategories')
            if panel is not None:
                subcategories = [
                    sub.get_text(strip=True)
                    for sub in panel.find_all('li', class_="subcategory family link")
                ]

            categories_dict[anchor.get_text(strip=True)] = subcategories

        return len(categories_dict), categories_dict
    except Exception as e:
        # Best-effort scraping: report the failure instead of hiding it.
        print(f"Erro ao extrair categorias de {url}: {e}")
        return 0, {}
        
# Scrape the www.radiopopular.pt snapshots of 2020 and 2023 and append the
# extracted categories to the shared analysis CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

# Links already recorded in the analysis file, used to avoid duplicate rows.
known_links = set(category_analysis['link'].dropna())

for index, row in data.iterrows():
    year = row['Unnamed: 0']  # first (unnamed) CSV column; compared against years below
    if year not in (2020, 2023):
        continue

    for site_column in data.columns[1:]:
        if site_column != 'www.radiopopular.pt':
            continue
        original_link = row[site_column]
        if pd.isna(original_link):
            continue

        # Append "mp_" to the path segment that precedes the archived URL.
        link = original_link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        # Rows are stored with the rewritten link, so the duplicate check has to
        # consider it too; checking only the original link never matches and
        # re-running the cell would append the same row again.
        if original_link in known_links or link in known_links:
            continue

        num_categories, category_dict = get_categories_2020_rp(link)
        if num_categories > 0:
            category_data.append({
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            })
            known_links.add(link)

# Append the collected rows to the CSV; skip the write when nothing new was found.
category_data_df = pd.DataFrame(category_data)
if not category_data_df.empty:
    category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

Como o El Corte Inglés não tem categorias de produtos, não vamos utilizar este site para fazer esta comparação.


Fnac Extracted Categories:


Como não é possível extrair as categorias de 2006 e 2008, não fazemos comparação com esses anos na Fnac.

**Erro**: ```O seu browser não aceita cookies, pelo que não é possível o acesso ao nosso site.```


In [22]:
def get_categories_2010_fnac(url, timeout=30):
    """Extract categories and subcategories from the ``div#menu`` list.

    The notebook calls this for the www.fnac.pt snapshot of 2010. The menu is
    the ``<ul style="margin-left: 40px">`` inside ``<div id="menu">``. Every
    ``<li>`` that contains a nested ``<ul>`` contributes one category, named
    after the most recently seen link text, with the nested ``<li>`` link texts
    as subcategories.

    Subcategory names are cleaned: the '»' marker and CRLF sequences are
    removed and runs of whitespace are collapsed. Categories without
    subcategories are dropped, and the 'Ver todos os produtos' entry is
    removed from each remaining list.

    Args:
        url: Address of the archived page to download.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            block the notebook indefinitely.

    Returns:
        A ``(count, categories)`` tuple where ``categories`` maps a category
        name to its list of subcategory names. ``(0, {})`` is returned when the
        menu is missing or when any error occurs (errors are printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        menu = soup.find('div', id='menu')
        listing = menu.find('ul', style="margin-left: 40px") if menu is not None else None
        if listing is None:
            return 0, {}

        categories_dict = {}
        main_category_name = None
        for item in listing.find_all('li'):
            anchor = item.find('a')
            if anchor is not None:
                # The name is carried over to later items that have no link.
                main_category_name = anchor.get_text(strip=True)

            submenu = item.find('ul')
            if submenu is None:
                continue

            subcategories = []
            for sub_item in submenu.find_all('li'):
                sub_anchor = sub_item.find('a')
                if sub_anchor is None:
                    # A list item without a link has no name; skip it rather
                    # than abort the whole extraction.
                    continue
                name = sub_anchor.get_text(strip=True)
                name = name.replace('»', '').replace('\r\n', '').strip()
                subcategories.append(' '.join(name.split()))

            categories_dict[main_category_name] = subcategories

        # Drop categories that ended up without subcategories.
        categories_dict = {name: subs for name, subs in categories_dict.items() if subs}

        # 'Ver todos os produtos' is a navigation link, not a subcategory.
        for subs in categories_dict.values():
            if 'Ver todos os produtos' in subs:
                subs.remove('Ver todos os produtos')

        return len(categories_dict), categories_dict
    except Exception as e:
        # Best-effort scraping: report the failure instead of hiding it.
        print(f"Erro ao extrair categorias de {url}: {e}")
        return 0, {}
        
# Scrape the www.fnac.pt snapshot of 2010 and append the extracted categories
# to the shared analysis CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

# Links already recorded in the analysis file, used to avoid duplicate rows.
known_links = set(category_analysis['link'].dropna())

for index, row in data.iterrows():
    year = row['Unnamed: 0']  # first (unnamed) CSV column; compared against years below
    if year != 2010:
        continue

    for site_column in data.columns[1:]:
        if site_column != 'www.fnac.pt':
            continue
        original_link = row[site_column]
        if pd.isna(original_link):
            continue

        # Append "mp_" to the path segment that precedes the archived URL.
        link = original_link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        # Rows are stored with the rewritten link, so the duplicate check has to
        # consider it too; checking only the original link never matches and
        # re-running the cell would append the same row again.
        if original_link in known_links or link in known_links:
            continue

        num_categories, category_dict = get_categories_2010_fnac(link)
        if num_categories > 0:
            category_data.append({
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            })
            known_links.add(link)

# Append the collected rows to the CSV; skip the write when nothing new was found.
category_data_df = pd.DataFrame(category_data)
if not category_data_df.empty:
    category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [33]:
def get_categories_2015_fnac(url, timeout=30):
    """Extract categories and subcategories from the ``ul#onglets`` tab menu.

    The notebook calls this for the www.fnac.pt snapshot of 2015. The tabs are
    the ``<li>`` items of ``<ul id="onglets">`` inside ``<div id="MENU">``. A
    tab that contains a ``<div class="megaMenu">`` contributes one category,
    named after the most recently seen link text, whose subcategories are the
    non-empty texts of the ``<dt>`` elements in that mega menu. Categories
    without subcategories are dropped.

    Args:
        url: Address of the archived page to download.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            block the notebook indefinitely.

    Returns:
        A ``(count, categories)`` tuple where ``categories`` maps a category
        name to its list of subcategory names. ``(0, {})`` is returned when the
        menu is missing or when any error occurs (errors are printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        menu = soup.find('div', id='MENU')
        tabs = menu.find('ul', id="onglets") if menu is not None else None
        if tabs is None:
            return 0, {}

        categories_dict = {}
        main_category_name = None
        for tab in tabs.find_all('li'):
            anchor = tab.find('a')
            if anchor is not None:
                # The name is carried over to later items that have no link.
                main_category_name = anchor.get_text(strip=True)

            mega_menu = tab.find('div', class_='megaMenu')
            if mega_menu is None:
                continue

            # A <dt> may hold the subcategory text directly or wrap it in a
            # link; get_text covers both cases.
            titles = (dt.get_text(strip=True) for dt in mega_menu.find_all('dt'))
            categories_dict[main_category_name] = [title for title in titles if title]

        # Drop categories that ended up without subcategories.
        categories_dict = {name: subs for name, subs in categories_dict.items() if subs}

        return len(categories_dict), categories_dict
    except Exception as e:
        # Best-effort scraping: report the failure instead of hiding it.
        print(f"Erro ao extrair categorias de {url}: {e}")
        return 0, {}
        
# Scrape the www.fnac.pt snapshot of 2015 and append the extracted categories
# to the shared analysis CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

# Links already recorded in the analysis file, used to avoid duplicate rows.
known_links = set(category_analysis['link'].dropna())

for index, row in data.iterrows():
    year = row['Unnamed: 0']  # first (unnamed) CSV column; compared against years below
    if year != 2015:
        continue

    for site_column in data.columns[1:]:
        if site_column != 'www.fnac.pt':
            continue
        original_link = row[site_column]
        if pd.isna(original_link):
            continue

        # Append "mp_" to the path segment that precedes the archived URL.
        link = original_link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        # Rows are stored with the rewritten link, so the duplicate check has to
        # consider it too; checking only the original link never matches and
        # re-running the cell would append the same row again.
        if original_link in known_links or link in known_links:
            continue

        num_categories, category_dict = get_categories_2015_fnac(link)
        if num_categories > 0:
            category_data.append({
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            })
            known_links.add(link)

# Append the collected rows to the CSV; skip the write when nothing new was found.
category_data_df = pd.DataFrame(category_data)
if not category_data_df.empty:
    category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [35]:
def get_categories_2020_fnac(url, timeout=30):
    """Extract the main categories from the ``CategoryNav`` sidebar.

    The notebook calls this for the www.fnac.pt snapshot of 2020. Categories
    are the ``<li class="CategoryNav-item js-CategoryNav-item">`` items of
    ``<ul class="CategoryNav js-CategoryNav">`` inside
    ``<div class="Sidebar-nano nano">``; each is named after its first link.
    No subcategories are read from this layout.

    Args:
        url: Address of the archived page to download.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            block the notebook indefinitely.

    Returns:
        A ``(count, categories)`` tuple where ``categories`` maps each category
        name to an empty list. ``(0, {})`` is returned when the sidebar is
        missing or when any error occurs (errors are printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        sidebar = soup.find('div', class_='Sidebar-nano nano')
        nav_list = sidebar.find('ul', class_="CategoryNav js-CategoryNav") if sidebar is not None else None
        if nav_list is None:
            return 0, {}

        categories_dict = {}
        for item in nav_list.find_all('li', class_='CategoryNav-item js-CategoryNav-item'):
            anchor = item.find('a')
            if anchor is None:
                # No link means no category name; skip the entry.
                continue
            categories_dict[anchor.get_text(strip=True)] = []

        return len(categories_dict), categories_dict
    except Exception as e:
        # Best-effort scraping: report the failure instead of hiding it.
        print(f"Erro ao extrair categorias de {url}: {e}")
        return 0, {}
        
# Scrape the www.fnac.pt snapshot of 2020 and append the extracted categories
# to the shared analysis CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

# Links already recorded in the analysis file, used to avoid duplicate rows.
known_links = set(category_analysis['link'].dropna())

for index, row in data.iterrows():
    year = row['Unnamed: 0']  # first (unnamed) CSV column; compared against years below
    if year != 2020:
        continue

    for site_column in data.columns[1:]:
        if site_column != 'www.fnac.pt':
            continue
        original_link = row[site_column]
        if pd.isna(original_link):
            continue

        # Append "mp_" to the path segment that precedes the archived URL.
        link = original_link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        # Rows are stored with the rewritten link, so the duplicate check has to
        # consider it too; checking only the original link never matches and
        # re-running the cell would append the same row again.
        if original_link in known_links or link in known_links:
            continue

        num_categories, category_dict = get_categories_2020_fnac(link)
        if num_categories > 0:
            category_data.append({
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            })
            known_links.add(link)

# Append the collected rows to the CSV; skip the write when nothing new was found.
category_data_df = pd.DataFrame(category_data)
if not category_data_df.empty:
    category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [43]:
def get_categories_2023_fnac(url, timeout=30):
    """Extract the main categories from the ``SideNavPanel`` menu.

    The notebook calls this for the www.fnac.pt snapshot of 2023. Categories
    are the ``<li class="SideNavPanel-listItem js-SideNavPanel-listItem">``
    items of ``<ul class="SideNavPanel-list">`` inside
    ``<div class="SideNavPanel-listWrapper">``; each is named after its first
    link. No subcategories are read from this layout.

    Args:
        url: Address of the archived page to download.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            block the notebook indefinitely.

    Returns:
        A ``(count, categories)`` tuple where ``categories`` maps each category
        name to an empty list. ``(0, {})`` is returned when the panel is
        missing or when any error occurs (errors are printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        wrapper = soup.find('div', class_='SideNavPanel-listWrapper')
        panel_list = wrapper.find('ul', class_="SideNavPanel-list") if wrapper is not None else None
        if panel_list is None:
            return 0, {}

        categories_dict = {}
        for item in panel_list.find_all('li', class_='SideNavPanel-listItem js-SideNavPanel-listItem'):
            anchor = item.find('a')
            if anchor is None:
                # No link means no category name; skip the entry.
                continue
            categories_dict[anchor.get_text(strip=True)] = []

        return len(categories_dict), categories_dict
    except Exception as e:
        # Best-effort scraping: report the failure instead of hiding it.
        print(f"Erro ao extrair categorias de {url}: {e}")
        return 0, {}
        
# Scrape the www.fnac.pt snapshot of 2023 and append the extracted categories
# to the shared analysis CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

# Links already recorded in the analysis file, used to avoid duplicate rows.
known_links = set(category_analysis['link'].dropna())

for index, row in data.iterrows():
    year = row['Unnamed: 0']  # first (unnamed) CSV column; compared against years below
    if year != 2023:
        continue

    for site_column in data.columns[1:]:
        if site_column != 'www.fnac.pt':
            continue
        original_link = row[site_column]
        if pd.isna(original_link):
            continue

        # Append "mp_" to the path segment that precedes the archived URL.
        link = original_link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        # Rows are stored with the rewritten link, so the duplicate check has to
        # consider it too; checking only the original link never matches and
        # re-running the cell would append the same row again.
        if original_link in known_links or link in known_links:
            continue

        num_categories, category_dict = get_categories_2023_fnac(link)
        if num_categories > 0:
            category_data.append({
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            })
            known_links.add(link)

# Append the collected rows to the CSV; skip the write when nothing new was found.
category_data_df = pd.DataFrame(category_data)
if not category_data_df.empty:
    category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

https://arquivo.pt/wayback/20230622161513mp_/http://www.fnac.pt/


## Comparação de diferentes preços em certos produtos em diferentes lojas

In [29]:
def coletar_dados_por_periodo(models, sites, periodos):
    """Search arquivo.pt for every model on every site, one query per period.

    For each (site, model, period) combination ``arquivo_search`` is called and
    the returned ``response_items`` are loaded into a DataFrame. Rows are kept
    only when the title contains the model name (case-insensitive, literal
    match) and does not mention accessories (capa, vidro, película, selfie).

    Args:
        models: Iterable of model names; each is used both as the search query
            and as the text that must appear in the result title.
        sites: Iterable of site names forwarded to ``arquivo_search`` as ``site``.
        periodos: Iterable of ``(from_year, to_year)`` pairs.

    Returns:
        A DataFrame with the filtered results of all queries, with the columns
        ``site``, ``model`` and ``periodo`` first, or ``None`` when no query
        returned items with a ``title`` column.
    """
    info = []

    # Query arquivo.pt once per site, model and period.
    for site in sites:
        print(f"Extraindo dados de {site}")
        for model in models:
            for periodo in periodos:
                print(f"Extraindo dados para {model} de {periodo[0]} a {periodo[1]}")
                data = arquivo_search(query=model, site=site, from_year=periodo[0], to_year=periodo[1])

                # Nothing to process when the response is missing or has no items.
                if not (data and 'response_items' in data and data['response_items']):
                    print(f"Nenhum resultado encontrado para '{model}' no site '{site}' de {periodo[0]} a {periodo[1]}.")
                    continue

                df = pd.DataFrame(data['response_items'])

                # The filters below need the page title.
                if 'title' not in df.columns:
                    print(f"A coluna 'title' não foi encontrada nos dados retornados para '{model}' no site '{site}' de {periodo[0]} a {periodo[1]}.")
                    continue

                df['site'] = site
                df['model'] = model
                df['periodo'] = f"{periodo[0]}-{periodo[1]}"
                cols = ['site', 'model', 'periodo'] + [col for col in df.columns if col not in ['site', 'model', 'periodo']]
                df = df[cols]

                # Drop accessory listings (cases, screen protectors, etc.).
                df = df[~df['title'].str.contains('capa|vidro|película|selfie', case=False, na=False)]
                # Keep only titles that mention the model. The name is matched
                # literally (regex=False) so characters such as '+' or '('
                # in a model name are not interpreted as a regular expression.
                df = df[df['title'].str.contains(model, case=False, na=False, regex=False)]

                info.append(df)

    # Combine the per-query frames into a single result.
    if info:
        result_df = pd.concat(info, ignore_index=True)
        return result_df
    else:
        print("Nenhum dado relevante foi encontrado.")
        return None

In [31]:

# Year ranges (from, to) that are queried separately for every model/site pair.
periodos = [
    (2005, 2009),
    (2010, 2015),
    (2016, 2020),
    (2021, 2024),
]

# Smartphone models to look up, in the order they are searched.
models = [
    # Huawei P series
    'huawei p8', 'huawei p20', 'huawei p30', 'huawei p40', 'huawei p50',
    # Samsung Galaxy S series
    'samsung galaxy s8', 'samsung galaxy s9', 'samsung galaxy s10',
    'samsung galaxy s20', 'samsung galaxy s21', 'samsung galaxy s22',
    # Apple iPhone
    'apple iphone 6', 'apple iphone 7', 'apple iphone 8', 'apple iphone x',
    'apple iphone xs', 'apple iphone xr', 'apple iphone 11', 'apple iphone 12',
    'apple iphone 13', 'apple iphone 14',
    # Huawei Mate series
    'huawei mate 20', 'huawei mate 30', 'huawei mate 40',
]

# Collect the search results for every model on every site, split by period.
resultados = coletar_dados_por_periodo(models, sites, periodos)

# Persist the results as CSV, or report that nothing was extracted.
if resultados is None:
    print("Nenhum resultado foi extraído.")
else:
    resultados.to_csv('data/resultados.csv', index=False)

Extraindo dados de www.fnac.pt
Extraindo dados para huawei p8 de 2005 a 2009
Nenhum resultado encontrado para 'huawei p8' no site 'www.fnac.pt' de 2005 a 2009.
Extraindo dados para huawei p8 de 2010 a 2015
Nenhum resultado encontrado para 'huawei p8' no site 'www.fnac.pt' de 2010 a 2015.
Extraindo dados para huawei p8 de 2016 a 2020
Extraindo dados para huawei p8 de 2021 a 2024
Nenhum resultado encontrado para 'huawei p8' no site 'www.fnac.pt' de 2021 a 2024.
Extraindo dados para huawei p20 de 2005 a 2009
Nenhum resultado encontrado para 'huawei p20' no site 'www.fnac.pt' de 2005 a 2009.
Extraindo dados para huawei p20 de 2010 a 2015
Nenhum resultado encontrado para 'huawei p20' no site 'www.fnac.pt' de 2010 a 2015.
Extraindo dados para huawei p20 de 2016 a 2020
Extraindo dados para huawei p20 de 2021 a 2024
Nenhum resultado encontrado para 'huawei p20' no site 'www.fnac.pt' de 2021 a 2024.
Extraindo dados para huawei p30 de 2005 a 2009
Nenhum resultado encontrado para 'huawei p30' no 

KeyboardInterrupt: 