# Digital Shift: The Evolution of Products and Platforms in Portuguese E-commerce



### Data Processor:



In [22]:
# Domains of the retailers covered by the study.
sites = [
    "www.fnac.pt",
    "www.worten.pt",
    "www.elcorteingles.pt",
    "www.radiopopular.pt",
    "www.staples.pt",
    "www.pcdiga.com",
]

In [3]:
import requests
import urllib.parse
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

 ## Extracting product categories from the websites (2007 | 2010 | 2015 | 2020 | 2023)

We use the requests library to download the HTML of each archived page and the BeautifulSoup library to extract the product categories from it.
A separate extractor is needed for each site and year, because the page structure changes from site to site and from year to year.

In [4]:
# Load the table of archived snapshot links (one row per year, one column per site).
df = pd.read_csv("data/sites_links.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,www.fnac.pt,www.worten.pt,www.elcorteingles.pt,www.radiopopular.pt,www.staples.pt,www.pcdiga.com
0,2005,,,https://arquivo.pt/wayback/20050725031922/http...,https://arquivo.pt/wayback/20050722223614/http...,https://arquivo.pt/wayback/20050722174753/http...,https://arquivo.pt/wayback/20050719124815/http...
1,2006,https://arquivo.pt/wayback/20061118120805/http...,,https://arquivo.pt/wayback/20060216170739/http...,,,
2,2007,https://arquivo.pt/wayback/20070928223117/http...,https://arquivo.pt/wayback/20070611190104/http...,https://arquivo.pt/wayback/20070929080902/http...,https://arquivo.pt/wayback/20070929122436/http...,https://arquivo.pt/wayback/20070610185333/http...,
3,2008,https://arquivo.pt/wayback/20081027081756/http...,https://arquivo.pt/wayback/20081022044251/http...,https://arquivo.pt/wayback/20081021184312/http...,https://arquivo.pt/wayback/20081022013802/http...,https://arquivo.pt/wayback/20081022031557/http...,https://arquivo.pt/wayback/20081022130111/http...
4,2009,https://arquivo.pt/wayback/20091218064527/http...,https://arquivo.pt/wayback/20091218174523/http...,https://arquivo.pt/wayback/20091218054927/http...,https://arquivo.pt/wayback/20091218134419/http...,https://arquivo.pt/wayback/20091218154357/http...,https://arquivo.pt/wayback/20091219171727/http...


In [None]:
# Column dtypes and non-null counts of the links table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            19 non-null     int64 
 1   www.fnac.pt           16 non-null     object
 2   www.worten.pt         14 non-null     object
 3   www.elcorteingles.pt  16 non-null     object
 4   www.radiopopular.pt   17 non-null     object
 5   www.staples.pt        18 non-null     object
 6   www.pcdiga.com        16 non-null     object
dtypes: int64(1), object(6)
memory usage: 1.2+ KB


In [None]:
# Number of missing links per column, followed by the cell-by-cell missing-value mask.
print(df.isna().sum())
df.isna()

Unnamed: 0              0
www.fnac.pt             3
www.worten.pt           5
www.elcorteingles.pt    3
www.radiopopular.pt     2
www.staples.pt          1
www.pcdiga.com          3
dtype: int64


Unnamed: 0.1,Unnamed: 0,www.fnac.pt,www.worten.pt,www.elcorteingles.pt,www.radiopopular.pt,www.staples.pt,www.pcdiga.com
0,False,True,True,False,False,False,False
1,False,False,True,False,True,True,True
2,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False
7,False,False,False,False,False,False,True
8,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False


In [None]:
# Column labels of the links table (the year column is labelled 'Unnamed: 0').
df.columns

Index(['Unnamed: 0', 'www.fnac.pt', 'www.worten.pt', 'www.elcorteingles.pt',
       'www.radiopopular.pt', 'www.staples.pt', 'www.pcdiga.com'],
      dtype='object')

In [None]:
# Create the results CSV containing only the header row; the extraction cells
# append their rows to this file afterwards.
# NOTE: to_csv is called with its default write mode, so re-running this cell
# replaces an existing results file.
header = ['ano', 'link', 'site', 'numero_categorias', 'lista_categorias', 'dicionario_subcategorias']
header_df = pd.DataFrame(columns=header)
header_df.to_csv("data/ecommerce_category_analysis_all.csv", index=False)


WORTEN

In [None]:
def extractor_2007_worten(url):
    """Scrape the top-level categories from the 2007 Worten snapshot.

    Downloads `url`, locates the first ``<ul>`` carrying the ``menu`` class and
    uses the text of the first link inside each of its ``<li>`` items as a
    category name. No subcategories are collected for this layout, so every
    category maps to an empty list.

    Args:
        url: Address of the archived page to download.

    Returns:
        A tuple ``(number_of_categories, {category_name: []})``; ``(0, {})``
        when the download or the parsing fails.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        menu = soup.find('ul', class_='menu')

        # This layout has no subcategories: each category gets an empty list.
        categories_dict = {
            item.find('a').get_text(strip=True): []
            for item in menu.find_all('li')
        }

        return len(categories_dict), categories_dict
    except Exception as e:
        # Report the failure (same message as the Staples/PcDiga extractors)
        # instead of hiding it behind a silent (0, {}).
        print(f"Error while extracting categories: {e}")
        return 0, {}
    
# Scrape the 2007 Worten snapshot and append its categories to the results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]
        # Only the Worten link of the 2007 row is handled by this cell.
        if site_column != 'www.worten.pt' or pd.isna(link) or year != 2007:
            continue

        # Rewrite the snapshot link and point it at the splash page
        # (applied to snapshots up to 2012, which includes 2007).
        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        link = link + 'Splash.aspx'

        # The results file stores the rewritten link, so the duplicate check
        # has to use it as well; comparing the raw link never matched and
        # every re-run appended the same row again.
        if link in category_analysis['link'].values:
            continue

        num_categories, category_dict = extractor_2007_worten(link)
        # Do not store failed extractions (0 categories), in line with the
        # other Worten cells.
        if num_categories > 0:
            category_data.append({
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            })

# Append the collected rows to the CSV (no header, no index column).
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [None]:
def get_categories_2010_worten(url):
    """Scrape categories and subcategories from the 2010 Worten snapshot.

    Downloads `url` and reads the ``<ul class="menu menu-category">`` list.
    Every ``<li class="sub">`` inside it is a main category named by its
    ``<a class="label">`` link; the ``<li>`` items of the first nested ``<ul>``
    provide the subcategory names.

    Args:
        url: Address of the archived page to download.

    Returns:
        A tuple ``(number_of_categories, {category: [subcategories]})``;
        ``(0, {})`` when the download or the parsing fails.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        menu = soup.find('ul', class_='menu menu-category')

        categories_dict = {}
        for category in menu.find_all('li', class_='sub'):
            main_category_name = category.find('a', class_='label').get_text(strip=True)

            # A category without a nested list simply has no subcategories.
            nested_list = category.find('ul')
            subcategories = []
            if nested_list:
                subcategories = [
                    entry.find('a', class_='label').get_text(strip=True)
                    for entry in nested_list.find_all('li')
                ]

            categories_dict[main_category_name] = subcategories

        return len(categories_dict), categories_dict
    except Exception as e:
        # Report the failure (same message as the Staples/PcDiga extractors)
        # instead of hiding it behind a silent (0, {}).
        print(f"Error while extracting categories: {e}")
        return 0, {}
        
# Scrape the 2010 Worten snapshot and append its categories to the results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]
        # Only the Worten link of the 2010 row is handled by this cell.
        if site_column != 'www.worten.pt' or pd.isna(link) or year != 2010:
            continue

        # Rewrite the snapshot link and point it at the default page.
        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        link = link + 'default.aspx'

        # The results file stores the rewritten link, so the duplicate check
        # has to use it as well; comparing the raw link never matched and
        # every re-run appended the same row again.
        if link in category_analysis['link'].values:
            continue

        num_categories, category_dict = get_categories_2010_worten(link)
        # Failed extractions (0 categories) are not stored.
        if num_categories > 0:
            category_data.append({
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            })

# Append the collected rows to the CSV (no header, no index column).
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [None]:
def get_categories_2015_worten(url):
    """Scrape categories and subcategories from the 2015 Worten snapshot.

    Downloads `url` and reads the ``<ul id="nav">`` list. Every
    ``<li class="level1">`` is a main category named by the ``<span>`` inside
    its first link; the ``<li class="level2">`` items of its first nested
    ``<ul>`` provide the subcategory names. Main categories that end up with
    no subcategories are removed from the result.

    Args:
        url: Address of the archived page to download.

    Returns:
        A tuple ``(number_of_categories, {category: [subcategories]})``;
        ``(0, {})`` when the download or the parsing fails.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        nav = soup.find('ul', id='nav')

        categories_dict = {}
        for category in nav.find_all('li', class_='level1'):
            main_category_name = category.find('a').find('span').get_text(strip=True)

            nested_list = category.find('ul')
            subcategories = []
            if nested_list:
                subcategories = [
                    entry.find('span').get_text(strip=True)
                    for entry in nested_list.find_all('li', class_='level2')
                ]
            categories_dict[main_category_name] = subcategories

        # Entries with an empty list have no subcategories: drop them.
        categories_dict = {
            name: subs for name, subs in categories_dict.items() if len(subs) > 0
        }

        return len(categories_dict), categories_dict
    except Exception as e:
        # Report the failure (same message as the Staples/PcDiga extractors)
        # instead of hiding it behind a silent (0, {}).
        print(f"Error while extracting categories: {e}")
        return 0, {}
        
# Scrape the 2015 Worten snapshot and append its categories to the results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]
        # Only the Worten link of the 2015 row is handled by this cell.
        if site_column != 'www.worten.pt' or pd.isna(link) or year != 2015:
            continue

        # Skip the year only when it is already stored for THIS site. The
        # previous test looked at the 'ano' and 'site' columns independently,
        # so a 2015 row of any other site made this cell skip Worten 2015.
        site_rows = category_analysis[category_analysis['site'] == site_column]
        if year in site_rows['ano'].values:
            continue

        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        num_categories, category_dict = get_categories_2015_worten(link)
        # Failed extractions (0 categories) are not stored.
        if num_categories > 0:
            category_data.append({
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            })

# Append the collected rows to the CSV (no header, no index column).
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)




In [None]:
def get_categories_worten_2020_23(url):
    """Scrape categories and subcategories from the 2020/2023 Worten snapshots.

    Downloads `url` and reads the ``<ul class="nav-sub js-nav-sub">`` list.
    Every ``<li class="nav-item nav-item-sub">`` is a main category named by
    its ``<span class="nav-a">``. Subcategory names are the lower-cased texts
    of the ``<a class="nav-a">`` links found inside the trigger labels of the
    category's child list.

    Args:
        url: Address of the archived page to download.

    Returns:
        A tuple ``(number_of_categories, {category: [subcategories]})``;
        ``(0, {})`` when the download or the parsing fails (this includes a
        main category that has no child list).
    """
    try:
        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        menu = soup.find('ul', class_='nav-sub js-nav-sub')

        categories_dict = {}
        for category in menu.find_all('li', class_='nav-item nav-item-sub'):
            main_category_name = category.find('span', class_='nav-a').get_text(strip=True)
            subcategories = []

            child_list = category.find('ul', class_='nav nav-sub nav-sub-child js-nav-sub')
            for entry in child_list.find_all('li', class_='nav-item-sub'):
                label = entry.find('label', class_='nav-trigger js-nav-trigger')
                if not label:
                    continue
                anchor = label.find('a', class_='nav-a')
                if anchor:
                    subcategories.append(anchor.get_text(strip=True).lower())

            categories_dict[main_category_name] = subcategories

        return len(categories_dict), categories_dict
    except Exception as e:
        # Report the failure (same message as the Staples/PcDiga extractors)
        # instead of hiding it behind a silent (0, {}).
        print(f"Error while extracting categories: {e}")
        return 0, {}

        
# Scrape the 2020 and 2023 Worten snapshots and append their categories to the results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]
        # Only the Worten links of the 2020 and 2023 rows are handled here.
        if site_column != 'www.worten.pt' or pd.isna(link) or year not in (2020, 2023):
            continue

        # Skip the year only when it is already stored for THIS site. The
        # previous test looked at every row of the results file, so a 2020 or
        # 2023 row of any other site made this cell skip Worten.
        site_rows = category_analysis[category_analysis['site'] == site_column]
        if year in site_rows['ano'].values:
            continue

        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        num_categories, category_dict = get_categories_worten_2020_23(link)
        # Failed extractions (0 categories) are not stored.
        if num_categories > 0:
            category_data.append({
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            })

# Append the collected rows to the CSV (no header, no index column).
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)




Staples Extracted Categories:

In [None]:
def get_categories_staples_2007_10(url):
    """Scrape categories and subcategories from the 2007/2010 Staples snapshots.

    Downloads `url` and looks for the first ``<span>`` whose id contains
    "categorias" (case-insensitive). Inside it, each
    ``<div class="tracos_centro">`` with a link is one entry: a link carrying
    the ``tit_centro_blue_bold`` class starts a new main category, any other
    link is a subcategory of the most recent main category.

    Args:
        url: Address of the archived page to download.

    Returns:
        A tuple ``(number_of_categories, {category: [subcategories]})``;
        ``(0, {})`` (after printing the error) when anything fails.
    """
    def id_mentions_categorias(id_value):
        # Same truthiness as the inline predicate: missing ids never match.
        return id_value and 'categorias' in id_value.lower()

    try:
        response = requests.get(url)
        response.raise_for_status()

        page = BeautifulSoup(response.text, 'html.parser')
        container = page.find('span', id=id_mentions_categorias)

        categories_dict = {}
        current_main_category = None

        for block in container.find_all('div', class_='tracos_centro'):
            anchor = block.find('a')
            if not anchor:
                continue

            label = anchor.get_text(strip=True)
            is_main = 'tit_centro_blue_bold' in anchor.get('class', [])

            if is_main:
                # A bold title opens a new main category.
                current_main_category = label
                categories_dict[label] = []
            elif current_main_category:
                # Plain links belong to the category opened last.
                categories_dict[current_main_category].append(label)

        return len(categories_dict), categories_dict
    except Exception as e:
        print(f"Error while extracting categories: {e}")
        return 0, {}


# Gather the Staples categories for 2007 and 2010 and append them to the results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]

        # Ignore every column except Staples, and rows without a snapshot link.
        if site_column != 'www.staples.pt' or pd.isna(link):
            continue

        # Skip years that already have a Staples row in the results file.
        staples_rows = category_analysis[category_analysis['site'] == site_column]
        if year in staples_rows['ano'].values:
            continue

        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        link = link + 'default.aspx'
        if year not in (2007, 2010):
            continue

        num_categories, category_dict = get_categories_staples_2007_10(link)

        # The row is stored even when the extractor returned (0, {}).
        category_data.append(dict(
            ano=year,
            link=link,
            site=site_column,
            numero_categorias=num_categories,
            lista_categorias=list(category_dict),
            dicionario_subcategorias=category_dict,
        ))

# Append the collected rows to the CSV (no header, no index column).
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [None]:
def get_categories_staples_15(url):
    """Scrape the top-level categories from the 2015 Staples snapshot.

    Downloads `url`, finds the ``<div>`` with the ``primaryNav`` class and
    takes the text of the ``<a class="navLink">`` inside each
    ``<span class="navItem">`` as a category name. No subcategories are
    collected, so every category maps to an empty list.

    Args:
        url: Address of the archived page to download.

    Returns:
        A tuple ``(number_of_categories, {category_name: []})``; ``(0, {})``
        (after printing the error) when anything fails.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()

        page = BeautifulSoup(response.text, 'html.parser')
        # Primary navigation block that lists the main categories.
        nav_block = page.find('div', class_='primaryNav')

        categories_dict = {
            nav_item.find('a', class_='navLink').get_text(strip=True): []
            for nav_item in nav_block.find_all('span', class_='navItem')
        }

        return len(categories_dict), categories_dict
    except Exception as e:
        print(f"Error while extracting categories: {e}")
        return 0, {}

# Gather the Staples categories for 2015 and append them to the results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]

        # Ignore every column except Staples, and rows without a snapshot link.
        if site_column != 'www.staples.pt' or pd.isna(link):
            continue

        # Skip years that already have a Staples row in the results file.
        staples_rows = category_analysis[category_analysis['site'] == site_column]
        if year in staples_rows['ano'].values:
            continue

        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        if year != 2015:
            continue

        num_categories, category_dict = get_categories_staples_15(link)

        # The row is stored even when the extractor returned (0, {}).
        category_data.append(dict(
            ano=year,
            link=link,
            site=site_column,
            numero_categorias=num_categories,
            lista_categorias=list(category_dict),
            dicionario_subcategorias=category_dict,
        ))

# Append the collected rows to the CSV (no header, no index column).
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [None]:
def get_categories_staples_20(url):
    """Scrape the top-level categories from the 2020 Staples snapshot.

    Downloads `url`, finds the ``<div>`` with the ``primaryNav`` class and
    reads one category name from the ``<a class="navLink">`` of each
    ``<span class="navItem">``. Every category maps to an empty list because
    no subcategories are collected.

    Args:
        url: Address of the archived page to download.

    Returns:
        A tuple ``(number_of_categories, {category_name: []})``; ``(0, {})``
        (after printing the error) when anything fails.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()

        document = BeautifulSoup(response.text, 'html.parser')
        primary_nav = document.find('div', class_='primaryNav')

        categories_dict = {}
        for nav_item in primary_nav.find_all('span', class_='navItem'):
            nav_link = nav_item.find('a', class_='navLink')
            # Each category gets its own, separate empty list.
            categories_dict[nav_link.get_text(strip=True)] = []

        return len(categories_dict), categories_dict
    except Exception as e:
        print(f"Error while extracting categories: {e}")
        return 0, {}

# Gather the Staples categories for 2020 and append them to the results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]

        # Ignore every column except Staples, and rows without a snapshot link.
        if site_column != 'www.staples.pt' or pd.isna(link):
            continue

        # Skip years that already have a Staples row in the results file.
        staples_rows = category_analysis[category_analysis['site'] == site_column]
        if year in staples_rows['ano'].values:
            continue

        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        if year != 2020:
            continue

        num_categories, category_dict = get_categories_staples_20(link)

        # The row is stored even when the extractor returned (0, {}).
        category_data.append(dict(
            ano=year,
            link=link,
            site=site_column,
            numero_categorias=num_categories,
            lista_categorias=list(category_dict),
            dicionario_subcategorias=category_dict,
        ))

# Append the collected rows to the CSV (no header, no index column).
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [None]:
def get_categories_staples_23(url):
    """Scrape categories and subcategories from the 2023 Staples snapshot.

    Downloads `url` and reads the ``<div class="container-menu-children">``
    block. Each ``<div class="children-cont">`` inside it is inspected in
    document order: one containing a ``<span class="pr-name">`` starts a new
    main category; otherwise the text of its ``<a class="menu-link_a">`` link
    (if any) is added as a subcategory of the most recent main category.

    Args:
        url: Address of the archived page to download.

    Returns:
        A tuple ``(number_of_categories, {category: [subcategories]})``;
        ``(0, {})`` (after printing the error) when anything fails.
    """
    try:
        request = requests.get(url)
        request.raise_for_status()

        soup = BeautifulSoup(request.text, 'html.parser')
        # Menu block holding the category entries. (The debug print that
        # dumped this whole container into the cell output was removed.)
        categories_container = soup.find('div', class_='container-menu-children')
        categories_dict = {}

        current_main_category = None
        for div in categories_container.find_all('div', class_='children-cont'):
            main_category_tag = div.find('span', class_='pr-name')
            if main_category_tag:
                # A "pr-name" span opens a new main category.
                current_main_category = main_category_tag.get_text(strip=True)
                categories_dict[current_main_category] = []
            elif current_main_category:
                # Anything else is a subcategory of the category opened last.
                subcategory_link = div.find('a', class_='menu-link_a')
                if subcategory_link:
                    subcategory_name = subcategory_link.get_text(strip=True)
                    categories_dict[current_main_category].append(subcategory_name)

        return len(categories_dict), categories_dict
    except Exception as e:
        print(f"Error while extracting categories: {e}")
        return 0, {}

# Gather the Staples categories for 2023 and append them to the results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]

        # Ignore every column except Staples, and rows without a snapshot link.
        if site_column != 'www.staples.pt' or pd.isna(link):
            continue

        # Skip years that already have a Staples row in the results file.
        staples_rows = category_analysis[category_analysis['site'] == site_column]
        if year in staples_rows['ano'].values:
            continue

        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        if year != 2023:
            continue

        num_categories, category_dict = get_categories_staples_23(link)

        # The row is stored even when the extractor returned (0, {}).
        category_data.append(dict(
            ano=year,
            link=link,
            site=site_column,
            numero_categorias=num_categories,
            lista_categorias=list(category_dict),
            dicionario_subcategorias=category_dict,
        ))

# Append the collected rows to the CSV (no header, no index column).
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

PcDiga Extracted Categories:
- It is not possible to extract PcDiga categories for 2007, so we extract them from the 2008 snapshot and record them under 2007, assuming that the differences between the two years are minimal.
- It is not possible to extract PcDiga categories for 2023 either; we assume they are essentially the same as in 2020.


In [None]:
def get_categories_pcdiga_08(url):
    """Scrape the top-level categories from an early PcDiga snapshot.

    Downloads `url` and reads the ``<map name="Map">`` image map. Each
    ``<area>`` contributes one category whose name is whatever follows the
    last ``=`` in its ``href`` (links of the form ``...?Familia=name``). No
    subcategories are collected, so every category maps to an empty list.

    Args:
        url: Address of the archived page to download.

    Returns:
        A tuple ``(number_of_categories, {category_name: []})``; ``(0, {})``
        (after printing the error) when anything fails.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()

        page = BeautifulSoup(response.text, 'html.parser')
        image_map = page.find('map', attrs={'name': 'Map'})

        categories_dict = {}
        for area in image_map.find_all('area'):
            # The category name is the value of the last query parameter.
            name = area.get('href').rsplit('=', 1)[-1]
            # Main categories have no subcategories in this layout.
            categories_dict[name] = []

        return len(categories_dict), categories_dict
    except Exception as e:
        print(f"Error while extracting categories: {e}")
        return 0, {}

# Scrape the 2008 and 2010 PcDiga snapshots and append their categories to the
# results CSV. The 2008 snapshot is stored under the year 2007.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]
        # Only the PcDiga links of the 2008 and 2010 rows are handled here.
        if site_column != 'www.pcdiga.com' or pd.isna(link) or year not in (2008, 2010):
            continue

        # Year under which the row is stored: 2008 stands in for 2007.
        n_year = year - 1 if year == 2008 else year

        # The duplicate check must use the stored year. Checking the snapshot
        # year (2008) never matched the stored 2007 row, so every re-run
        # appended that row again.
        site_rows = category_analysis[category_analysis['site'] == site_column]
        if n_year in site_rows['ano'].values:
            continue

        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        link += 'pcdiga/'
        num_categories, category_dict = get_categories_pcdiga_08(link)

        category_data.append({
            'ano': n_year,
            'link': link,
            'site': site_column,
            'numero_categorias': num_categories,
            'lista_categorias': list(category_dict.keys()),
            'dicionario_subcategorias': category_dict
        })

# Append the collected rows to the CSV (no header, no index column).
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)



In [None]:
def get_categories_pcdiga_15(url):
    """Scrape the top-level categories from the 2015 PcDiga snapshot.

    Downloads `url` and reads the ``<div id="masterdiv">`` block. Each
    ``<table class="menu1">`` inside it contributes one category, named by the
    text of its ``<a class="menu">`` link. No subcategories are collected, so
    every category maps to an empty list.

    Args:
        url: Address of the archived page to download.

    Returns:
        A tuple ``(number_of_categories, {category_name: []})``; ``(0, {})``
        (after printing the error) when anything fails.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()

        page = BeautifulSoup(response.text, 'html.parser')
        master_div = page.find('div', id='masterdiv')

        categories_dict = {
            table.find('a', class_='menu').get_text(strip=True): []
            for table in master_div.find_all('table', class_='menu1')
        }

        return len(categories_dict), categories_dict
    except Exception as e:
        print(f"Error while extracting categories: {e}")
        return 0, {}

# Gather the PcDiga categories for 2015 and append them to the results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]

        # Ignore every column except PcDiga, and rows without a snapshot link.
        if site_column != 'www.pcdiga.com' or pd.isna(link):
            continue

        # Skip years that already have a PcDiga row in the results file.
        pcdiga_rows = category_analysis[category_analysis['site'] == site_column]
        if year in pcdiga_rows['ano'].values:
            continue

        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        if year != 2015:
            continue

        num_categories, category_dict = get_categories_pcdiga_15(link)

        # The row is stored even when the extractor returned (0, {}).
        category_data.append(dict(
            ano=year,
            link=link,
            site=site_column,
            numero_categorias=num_categories,
            lista_categorias=list(category_dict),
            dicionario_subcategorias=category_dict,
        ))

# Append the collected rows to the CSV (no header, no index column).
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)



In [None]:
def get_categories_pcdiga_20(url):
    """Scrape categories and subcategories from the 2020 PcDiga snapshot.

    Downloads `url` and reads the ``<ul class="megamenu">`` inside the
    ``<div class="megamenu-wrapper">``. An ``<li>`` counts as a main category
    only when it holds both an ``<a class="i-link">`` and a
    ``<span class="svg-i svg-arrow">``; its name is the text of the link's
    ``<span class="lnk-text">``. Subcategory names come from the first
    ``<span>`` of every ``<a class="i-link">`` found in the ``<li>`` items of
    the category's ``<div class="submenu">``.

    Args:
        url: Address of the archived page to download.

    Returns:
        A tuple ``(number_of_categories, {category: [subcategories]})``;
        ``(0, {})`` (after printing the error) when anything fails.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()

        page = BeautifulSoup(response.text, 'html.parser')
        megamenu = page.find('div', class_='megamenu-wrapper').find('ul', class_='megamenu')

        categories_dict = {}
        for item in megamenu.find_all('li'):
            main_link = item.find('a', class_='i-link')
            arrow = item.find('span', class_='svg-i svg-arrow')
            # Entries lacking either the link or the arrow icon are skipped.
            if not (main_link and arrow):
                continue

            main_name = main_link.find('span', class_='lnk-text').get_text(strip=True)

            sub_names = []
            submenu = item.find('div', class_='submenu')
            if submenu:
                sub_links = (entry.find('a', class_='i-link') for entry in submenu.find_all('li'))
                sub_names = [
                    sub_link.find('span').get_text(strip=True)
                    for sub_link in sub_links
                    if sub_link
                ]

            categories_dict[main_name] = sub_names

        return len(categories_dict), categories_dict
    except Exception as e:
        print(f"Error while extracting categories: {e}")
        return 0, {}

# Gather the PcDiga categories for 2020 and append them to the results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]

        # Ignore every column except PcDiga, and rows without a snapshot link.
        if site_column != 'www.pcdiga.com' or pd.isna(link):
            continue

        # Skip years that already have a PcDiga row in the results file.
        pcdiga_rows = category_analysis[category_analysis['site'] == site_column]
        if year in pcdiga_rows['ano'].values:
            continue

        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')
        if year != 2020:
            continue

        num_categories, category_dict = get_categories_pcdiga_20(link)

        # The row is stored even when the extractor returned (0, {}).
        category_data.append(dict(
            ano=year,
            link=link,
            site=site_column,
            numero_categorias=num_categories,
            lista_categorias=list(category_dict),
            dicionario_subcategorias=category_dict,
        ))

# Append the collected rows to the CSV (no header, no index column).
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)



RadioPopular Extracted Categories:


In [None]:
def get_categories_2010_rp(url):
    """Scrape the top-level categories from the 2007/2010 RadioPopular snapshots.

    Downloads `url` and inspects every ``<td align="left">`` cell. For each
    cell that contains a link, the ``alt`` text of the image inside that link
    becomes a category name. The "Recrutamento" entry is removed from the
    result. No subcategories are collected, so every category maps to an
    empty list.

    Args:
        url: Address of the archived page to download.

    Returns:
        A tuple ``(number_of_categories, {category_name: []})``; ``(0, {})``
        when the download or the parsing fails.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        categories_dict = {}

        for cell in soup.find_all('td', align='left'):
            anchor = cell.find('a')
            if anchor:
                categories_dict[anchor.find('img').get('alt')] = []

        # "Recrutamento" is excluded from the category list.
        categories_dict.pop('Recrutamento', None)

        return len(categories_dict), categories_dict
    except Exception as e:
        # Report the failure (same message as the Staples/PcDiga extractors)
        # instead of hiding it behind a silent (0, {}).
        print(f"Error while extracting categories: {e}")
        return 0, {}
        
# Scrape the 2007 and 2010 RadioPopular snapshots and append their categories to the results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

for index, row in data.iterrows():
    year = row['Unnamed: 0']

    for site_column in data.columns[1:]:
        link = row[site_column]
        # Only the RadioPopular links of the 2007 and 2010 rows are handled here.
        if site_column != 'www.radiopopular.pt' or pd.isna(link) or year not in (2007, 2010):
            continue

        link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')

        # The results file stores the rewritten link, so the duplicate check
        # has to use it as well; comparing the raw link never matched and
        # every re-run appended the same rows again.
        if link in category_analysis['link'].values:
            continue

        num_categories, category_dict = get_categories_2010_rp(link)
        # Failed extractions (0 categories) are not stored.
        if num_categories > 0:
            category_data.append({
                'ano': year,
                'link': link,
                'site': site_column,
                'numero_categorias': num_categories,
                'lista_categorias': list(category_dict.keys()),
                'dicionario_subcategorias': category_dict
            })

# Append the collected rows to the CSV (no header, no index column).
category_data_df = pd.DataFrame(category_data)
category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [None]:
def get_categories_2015_rp(url):
    """Extract categories and subcategories from a Rádio Popular page (2015 layout).

    The menu is read from ``<div id="nav">``. Every ``<li class="dir">`` inside
    it is a candidate category named after its first link; its subcategories
    are the ``<li class="dir">`` items found inside its first ``<ul>``.
    Categories that end up without subcategories are discarded.

    Args:
        url: Address of the page to download and parse.

    Returns:
        A ``(count, categories)`` tuple where ``categories`` maps each category
        name to its list of subcategory names. ``(0, {})`` is returned when the
        menu is missing or any error occurs.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'html.parser')

        # Container holding the main categories.
        nav = page.find('div', id='nav')
        if nav is None:
            return 0, {}

        collected = {}
        for entry in nav.find_all('li', class_='dir'):
            title = entry.find('a').get_text(strip=True)
            submenu = entry.find('ul')
            children = submenu.find_all('li', class_='dir') if submenu is not None else []
            collected[title] = [child.find('a').get_text(strip=True) for child in children]

        # Keep only the categories that actually have subcategories.
        categories_dict = {title: names for title, names in collected.items() if names}
        return len(categories_dict), categories_dict
    except Exception:
        return 0, {}
        
# Collect the Rádio Popular categories for 2015 and append them to the shared
# results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

site_column = 'www.radiopopular.pt'
target_years = (2015,)
# Links already present in the results CSV.
stored_links = set(category_analysis['link'].dropna())

for index, row in data.iterrows():
    year = row['Unnamed: 0']
    link = row.get(site_column)
    if year not in target_years or pd.isna(link):
        continue

    # Insert "mp_" right after the snapshot timestamp of the archive URL.
    # NOTE(review): presumably this makes Arquivo.pt serve the archived page
    # without its replay wrapper - confirm against the Arquivo.pt documentation.
    archived_link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')

    # The results CSV stores the rewritten link, so checking only the original
    # link never matches and every re-run of this cell would append duplicates.
    if link in stored_links or archived_link in stored_links:
        continue

    num_categories, category_dict = get_categories_2015_rp(archived_link)
    if num_categories > 0:
        category_data.append({
            'ano': year,
            'link': archived_link,
            'site': site_column,
            'numero_categorias': num_categories,
            'lista_categorias': list(category_dict.keys()),
            'dicionario_subcategorias': category_dict
        })
        stored_links.add(archived_link)

# Append the collected rows to the CSV (skipped when nothing new was found).
category_data_df = pd.DataFrame(category_data)
if not category_data_df.empty:
    category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [None]:
def get_categories_2020_rp(url):
    """Extract categories and subcategories from a Rádio Popular page (2020/2023 layout).

    Categories are the ``<li class="category link cb">`` items of
    ``<ul class="categories">``, named after their first link. Subcategories
    are the ``<li class="subcategory family link">`` items inside the
    category's ``<div class="subcategories">``; a category without that
    container is kept with an empty list.

    Args:
        url: Address of the page to download and parse.

    Returns:
        A ``(count, categories)`` tuple where ``categories`` maps each category
        name to its list of subcategory names. ``(0, {})`` is returned when the
        category list is missing or any error occurs.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'html.parser')

        # List holding the main categories.
        menu = page.find('ul', class_='categories')
        if menu is None:
            return 0, {}

        categories_dict = {}
        for item in menu.find_all('li', class_='category link cb'):
            title = item.find('a').get_text(strip=True)
            panel = item.find('div', class_='subcategories')
            families = panel.find_all('li', class_="subcategory family link") if panel is not None else []
            categories_dict[title] = [family.get_text(strip=True) for family in families]
        return len(categories_dict), categories_dict
    except Exception:
        return 0, {}
        
# Collect the Rádio Popular categories for 2020 and 2023 and append them to the
# shared results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

site_column = 'www.radiopopular.pt'
target_years = (2020, 2023)
# Links already present in the results CSV.
stored_links = set(category_analysis['link'].dropna())

for index, row in data.iterrows():
    year = row['Unnamed: 0']
    link = row.get(site_column)
    if year not in target_years or pd.isna(link):
        continue

    # Insert "mp_" right after the snapshot timestamp of the archive URL.
    # NOTE(review): presumably this makes Arquivo.pt serve the archived page
    # without its replay wrapper - confirm against the Arquivo.pt documentation.
    archived_link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')

    # The results CSV stores the rewritten link, so checking only the original
    # link never matches and every re-run of this cell would append duplicates.
    if link in stored_links or archived_link in stored_links:
        continue

    num_categories, category_dict = get_categories_2020_rp(archived_link)
    if num_categories > 0:
        category_data.append({
            'ano': year,
            'link': archived_link,
            'site': site_column,
            'numero_categorias': num_categories,
            'lista_categorias': list(category_dict.keys()),
            'dicionario_subcategorias': category_dict
        })
        stored_links.add(archived_link)

# Append the collected rows to the CSV (skipped when nothing new was found).
category_data_df = pd.DataFrame(category_data)
if not category_data_df.empty:
    category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

As El Corte Inglés (www.elcorteingles.pt) does not have product categories, we will not use this site in this comparison.

Fnac Extracted Categories:


As it is not possible to extract categories from the 2006 and 2008 snapshots, we will not use the year 2007 for Fnac in the comparison.

Error message presented: `O seu browser não aceita cookies, pelo que não é possível o acesso ao nosso site.` ("Your browser does not accept cookies, so it is not possible to access our site.")


In [None]:
def get_categories_2010_fnac(url):
    """Extract categories and subcategories from a Fnac page (2010 layout).

    The menu is the ``<ul style="margin-left: 40px">`` inside
    ``<div id="menu">``. Every ``<li>`` that has both a link and a nested
    ``<ul>`` is a category named after its first link; its subcategories are
    the link texts of the nested ``<li>`` items, with ``'»'`` markers and
    surplus whitespace removed. The ``'Ver todos os produtos'`` entry is not a
    subcategory and is dropped. Categories left without subcategories are
    discarded.

    Args:
        url: Address of the page to download and parse.

    Returns:
        A ``(count, categories)`` tuple where ``categories`` maps each category
        name to its list of subcategory names. ``(0, {})`` is returned when the
        menu is missing or the page cannot be fetched or parsed.
    """
    try:
        # A timeout keeps one unresponsive page from blocking the notebook forever.
        response = requests.get(url, timeout=60)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        menu = soup.find('div', id='menu')
        top_list = menu.find('ul', style="margin-left: 40px") if menu is not None else None
        if top_list is None:
            return 0, {}

        collected = {}
        for category in top_list.find_all('li'):
            anchor = category.find('a')
            nested_list = category.find('ul')
            # Items without a nested list are leaves. Items without a link have
            # no name of their own and must not reuse the previous item's name.
            if anchor is None or nested_list is None:
                continue
            main_category_name = anchor.get_text(strip=True)

            subcategories = []
            for subcategory in nested_list.find_all('li'):
                sub_anchor = subcategory.find('a')
                # Skip link-less items (e.g. separators) instead of letting an
                # AttributeError discard the whole page.
                if sub_anchor is None:
                    continue
                subcategory_name = sub_anchor.get_text(strip=True)
                subcategory_name = subcategory_name.replace('»', '').replace('\r\n', '').strip()
                subcategory_name = ' '.join(subcategory_name.split())
                # Filter the "see all products" link before the emptiness check
                # so a category that only has that link is not counted.
                if subcategory_name != 'Ver todos os produtos':
                    subcategories.append(subcategory_name)
            collected[main_category_name] = subcategories

        # Keep only the categories that actually have subcategories.
        categories_dict = {name: subs for name, subs in collected.items() if subs}
        return len(categories_dict), categories_dict
    except Exception as error:
        # Still best-effort, but report the reason so failures are not invisible.
        print(f"get_categories_2010_fnac: could not extract categories from {url}: {error!r}")
        return 0, {}
        
# Collect the Fnac categories for 2010 and append them to the shared results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

site_column = 'www.fnac.pt'
target_years = (2010,)
# Links already present in the results CSV.
stored_links = set(category_analysis['link'].dropna())

for index, row in data.iterrows():
    year = row['Unnamed: 0']
    link = row.get(site_column)
    if year not in target_years or pd.isna(link):
        continue

    # Insert "mp_" right after the snapshot timestamp of the archive URL.
    # NOTE(review): presumably this makes Arquivo.pt serve the archived page
    # without its replay wrapper - confirm against the Arquivo.pt documentation.
    archived_link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')

    # The results CSV stores the rewritten link, so checking only the original
    # link never matches and every re-run of this cell would append duplicates.
    if link in stored_links or archived_link in stored_links:
        continue

    num_categories, category_dict = get_categories_2010_fnac(archived_link)
    if num_categories > 0:
        category_data.append({
            'ano': year,
            'link': archived_link,
            'site': site_column,
            'numero_categorias': num_categories,
            'lista_categorias': list(category_dict.keys()),
            'dicionario_subcategorias': category_dict
        })
        stored_links.add(archived_link)

# Append the collected rows to the CSV (skipped when nothing new was found).
category_data_df = pd.DataFrame(category_data)
if not category_data_df.empty:
    category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [None]:
def get_categories_2015_fnac(url):
    """Extract categories and subcategories from a Fnac page (2015 layout).

    The menu is the ``<ul id="onglets">`` inside ``<div id="MENU">``. Each
    ``<li>`` containing a ``<div class="megaMenu">`` contributes a category
    whose subcategories are the non-empty texts of that container's ``<dt>``
    elements. Categories left without subcategories are discarded.

    Args:
        url: Address of the page to download and parse.

    Returns:
        A ``(count, categories)`` tuple where ``categories`` maps each category
        name to its list of subcategory names. ``(0, {})`` is returned on any
        error, including a missing menu.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'html.parser')
        tabs = page.find('div', id='MENU').find('ul', id="onglets")

        found = {}
        current_name = None
        for tab in tabs.find_all('li'):
            link = tab.find('a')
            # An <li> without its own link keeps the name of the previous entry.
            if link is not None:
                current_name = link.get_text(strip=True)

            mega_menu = tab.find('div', class_='megaMenu')
            if mega_menu is None:
                continue

            # A <dt> may hold the subcategory text directly or wrap it in a link.
            titles = [heading.get_text(strip=True) for heading in mega_menu.find_all('dt')]
            found[current_name] = [title for title in titles if title]

        # Keep only the categories that actually have subcategories.
        categories_dict = {name: subs for name, subs in found.items() if subs}
        return len(categories_dict), categories_dict
    except Exception:
        return 0, {}
        
# Collect the Fnac categories for 2015 and append them to the shared results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

site_column = 'www.fnac.pt'
target_years = (2015,)
# Links already present in the results CSV.
stored_links = set(category_analysis['link'].dropna())

for index, row in data.iterrows():
    year = row['Unnamed: 0']
    link = row.get(site_column)
    if year not in target_years or pd.isna(link):
        continue

    # Insert "mp_" right after the snapshot timestamp of the archive URL.
    # NOTE(review): presumably this makes Arquivo.pt serve the archived page
    # without its replay wrapper - confirm against the Arquivo.pt documentation.
    archived_link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')

    # The results CSV stores the rewritten link, so checking only the original
    # link never matches and every re-run of this cell would append duplicates.
    if link in stored_links or archived_link in stored_links:
        continue

    num_categories, category_dict = get_categories_2015_fnac(archived_link)
    if num_categories > 0:
        category_data.append({
            'ano': year,
            'link': archived_link,
            'site': site_column,
            'numero_categorias': num_categories,
            'lista_categorias': list(category_dict.keys()),
            'dicionario_subcategorias': category_dict
        })
        stored_links.add(archived_link)

# Append the collected rows to the CSV (skipped when nothing new was found).
category_data_df = pd.DataFrame(category_data)
if not category_data_df.empty:
    category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [None]:
def get_categories_2020_fnac(url):
    """Extract the main categories from a Fnac page (2020 layout).

    Categories are the ``<li class="CategoryNav-item js-CategoryNav-item">``
    items of ``<ul class="CategoryNav js-CategoryNav">`` inside
    ``<div class="Sidebar-nano nano">``, named after their first link.
    Subcategories are not collected, so every category maps to an empty list.

    Args:
        url: Address of the page to download and parse.

    Returns:
        A ``(count, categories)`` tuple where ``categories`` maps each category
        name to ``[]``. ``(0, {})`` is returned on any error, including a
        missing sidebar or list.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'html.parser')

        sidebar = page.find('div', class_='Sidebar-nano nano')
        nav = sidebar.find('ul', class_="CategoryNav js-CategoryNav")
        items = nav.find_all('li', class_='CategoryNav-item js-CategoryNav-item')

        categories_dict = {item.find('a').get_text(strip=True): [] for item in items}
        return len(categories_dict), categories_dict
    except Exception:
        return 0, {}
        
# Collect the Fnac categories for 2020 and append them to the shared results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

site_column = 'www.fnac.pt'
target_years = (2020,)
# Links already present in the results CSV.
stored_links = set(category_analysis['link'].dropna())

for index, row in data.iterrows():
    year = row['Unnamed: 0']
    link = row.get(site_column)
    if year not in target_years or pd.isna(link):
        continue

    # Insert "mp_" right after the snapshot timestamp of the archive URL.
    # NOTE(review): presumably this makes Arquivo.pt serve the archived page
    # without its replay wrapper - confirm against the Arquivo.pt documentation.
    archived_link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')

    # The results CSV stores the rewritten link, so checking only the original
    # link never matches and every re-run of this cell would append duplicates.
    if link in stored_links or archived_link in stored_links:
        continue

    num_categories, category_dict = get_categories_2020_fnac(archived_link)
    if num_categories > 0:
        category_data.append({
            'ano': year,
            'link': archived_link,
            'site': site_column,
            'numero_categorias': num_categories,
            'lista_categorias': list(category_dict.keys()),
            'dicionario_subcategorias': category_dict
        })
        stored_links.add(archived_link)

# Append the collected rows to the CSV (skipped when nothing new was found).
category_data_df = pd.DataFrame(category_data)
if not category_data_df.empty:
    category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)

In [None]:
def get_categories_2023_fnac(url):
    """Extract the main categories from a Fnac page (2023 layout).

    Categories are the
    ``<li class="SideNavPanel-listItem js-SideNavPanel-listItem">`` items of
    ``<ul class="SideNavPanel-list">`` inside
    ``<div class="SideNavPanel-listWrapper">``, named after their first link.
    Subcategories are not collected, so every category maps to an empty list.

    Args:
        url: Address of the page to download and parse.

    Returns:
        A ``(count, categories)`` tuple where ``categories`` maps each category
        name to ``[]``. ``(0, {})`` is returned on any error, including a
        missing wrapper or list.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'html.parser')

        wrapper = page.find('div', class_='SideNavPanel-listWrapper')
        if wrapper is None:
            return 0, {}
        listing = wrapper.find('ul', class_="SideNavPanel-list")
        if listing is None:
            return 0, {}

        categories_dict = {}
        for entry in listing.find_all('li', class_='SideNavPanel-listItem js-SideNavPanel-listItem'):
            label = entry.find('a').get_text(strip=True)
            categories_dict[label] = []
        return len(categories_dict), categories_dict
    except Exception:
        return 0, {}
        
# Collect the Fnac categories for 2023 and append them to the shared results CSV.
category_data = []

data = pd.read_csv('data/sites_links.csv')
category_analysis = pd.read_csv('data/ecommerce_category_analysis_all.csv')

site_column = 'www.fnac.pt'
target_years = (2023,)
# Links already present in the results CSV.
stored_links = set(category_analysis['link'].dropna())

for index, row in data.iterrows():
    year = row['Unnamed: 0']
    link = row.get(site_column)
    if year not in target_years or pd.isna(link):
        continue

    # Insert "mp_" right after the snapshot timestamp of the archive URL.
    # NOTE(review): presumably this makes Arquivo.pt serve the archived page
    # without its replay wrapper - confirm against the Arquivo.pt documentation.
    archived_link = link.replace(f'/http://{site_column}/', f'mp_/http://{site_column}/')

    # The results CSV stores the rewritten link, so checking only the original
    # link never matches and every re-run of this cell would append duplicates.
    if link in stored_links or archived_link in stored_links:
        continue

    num_categories, category_dict = get_categories_2023_fnac(archived_link)
    if num_categories > 0:
        category_data.append({
            'ano': year,
            'link': archived_link,
            'site': site_column,
            'numero_categorias': num_categories,
            'lista_categorias': list(category_dict.keys()),
            'dicionario_subcategorias': category_dict
        })
        stored_links.add(archived_link)

# Append the collected rows to the CSV (skipped when nothing new was found).
category_data_df = pd.DataFrame(category_data)
if not category_data_df.empty:
    category_data_df.to_csv("data/ecommerce_category_analysis_all.csv", mode='a', header=False, index=False)