# Digital Shift: The Evolution of Products and Platforms in Portuguese E-commerce



### Data Extraction:



In [9]:
# Retailer domains queried throughout the notebook.
sites = [
    "www.fnac.pt",
    "www.worten.pt",
    "www.elcorteingles.pt",
    "www.radiopopular.pt",
    "www.staples.pt",
    "www.pcdiga.com",
]

In [32]:
import ast
import urllib.parse

import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup

Function to extract data from the arquivo.pt API. The function receives a query and returns the response of the API in JSON format.

In [4]:
def arquivo_search(query=None, max_items=500, from_year=None, to_year=None, site=None, doc_type=None, version_history_url=None, timeout=30):
    """Call the Arquivo.pt ``textsearch`` endpoint and return the decoded JSON.

    Args:
        query: Full-text search terms. Ignored when ``version_history_url``
            is given.
        max_items: Value sent as ``maxItems``.
        from_year: Value sent as ``from`` (the notebook passes either a year
            or a ``YYYYMMDD`` string). Omitted when falsy.
        to_year: Value sent as ``to``. Omitted when falsy.
        site: Value sent as ``siteSearch``. Omitted when falsy.
        doc_type: Value sent as ``type``. Omitted when falsy.
        version_history_url: When given, the request asks for the version
            history of this URL instead of running a text query.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed JSON body on HTTP 200; ``None`` when the request fails,
        the status code is not 200, or the body is not valid JSON (a message
        is printed in each of those cases).

    Raises:
        ValueError: If neither ``query`` nor ``version_history_url`` is given.
    """
    if version_history_url:
        params = [("versionHistory", version_history_url)]
    elif query is not None:
        params = [("q", query)]
    else:
        raise ValueError("arquivo_search requires either 'query' or 'version_history_url'")

    # Each bound is sent on its own so a single-sided range is not silently dropped.
    if from_year:
        params.append(("from", from_year))
    if to_year:
        params.append(("to", to_year))
    if site:
        params.append(("siteSearch", site))
    if doc_type:
        params.append(("type", doc_type))
    params.append(("maxItems", max_items))
    params.append(("prettyPrint", "false"))

    # quote_via=quote keeps spaces as %20 (not '+') and percent-encodes every value.
    url = "https://arquivo.pt/textsearch?" + urllib.parse.urlencode(
        params, quote_via=urllib.parse.quote, safe=""
    )

    try:
        # A timeout prevents one stalled request from hanging a long scraping loop.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Erro na requisição: {exc}")
        return None

    if response.status_code != 200:
        print(f"Erro na requisição: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as exc:
        # Body was not valid JSON; report it the same way as other failures.
        print(f"Erro na requisição: {exc}")
        return None

Extracting website links from Arquivo.pt (2005 to 2023) for the following retailers:
worten, fnac, radio popular, el corte ingles, pc diga, staples

In [4]:
def process_sites(sites, start_year=2005, end_year=2023):
    """Find one archived home-page link per year for each site.

    For every site and every year in ``start_year..end_year`` (inclusive),
    the version history of ``http://<site>/`` is requested through
    ``arquivo_search`` and the first returned item whose timestamp starts
    with that year and whose ``originalURL`` equals the home-page URL is kept.

    Args:
        sites: Iterable of host names (e.g. ``"www.fnac.pt"``).
        start_year: First year to look up.
        end_year: Last year to look up (inclusive).

    Returns:
        ``{site: {year: linkToArchive or None}}``; a year maps to ``None``
        when no matching capture was returned.
    """
    years = range(start_year, end_year + 1)
    site_data = {}

    for site in sites:
        print(f"Processando site: {site}")
        # Version history is requested for the site's home page.
        version_history_url = f"http://{site}/"
        site_links_by_year = dict.fromkeys(years)

        for year in years:
            response = arquivo_search(version_history_url=version_history_url, from_year=year, to_year=year + 1)
            if not response or 'response_items' not in response:
                continue

            for item in response['response_items']:
                # The first 4 characters of the timestamp are the year; the
                # request window also covers year + 1, so filter here.
                captured_in_year = str(item.get('tstamp', ''))[:4] == str(year)
                link = item.get('linkToArchive')
                if captured_in_year and link and item.get('originalURL') == version_history_url:
                    site_links_by_year[year] = link
                    # The first match is the one kept; no need to scan further.
                    break

        site_data[site] = site_links_by_year

    return site_data



In [5]:
# Collect, for every retailer, one archived home-page link per year, then print the mapping.
dados_sites = process_sites(sites)

print(dados_sites)

Processando site: www.fnac.pt
Processando site: www.worten.pt
Processando site: www.elcorteingles.pt
Processando site: www.radiopopular.pt
Processando site: www.staples.pt
Processando site: www.pcdiga.com
{'www.fnac.pt': {2005: None, 2006: 'https://arquivo.pt/wayback/20061118120805/http://www.fnac.pt/', 2007: 'https://arquivo.pt/wayback/20070928223117/http://www.fnac.pt/', 2008: 'https://arquivo.pt/wayback/20081027081756/http://www.fnac.pt/', 2009: 'https://arquivo.pt/wayback/20091218064527/http://www.fnac.pt/', 2010: 'https://arquivo.pt/wayback/20100804062306/http://www.fnac.pt/', 2011: 'https://arquivo.pt/wayback/20110702090458/http://www.fnac.pt/', 2012: 'https://arquivo.pt/wayback/20120122102914/http://www.fnac.pt/', 2013: 'https://arquivo.pt/wayback/20131106231750/http://www.fnac.pt/', 2014: 'https://arquivo.pt/wayback/20141127075233/http://www.fnac.pt/', 2015: 'https://arquivo.pt/wayback/20151124075844/http://www.fnac.pt/', 2016: 'https://arquivo.pt/wayback/20161106090812/http://


## Comparison of the prices of certain products across different stores
worten, fnac, radio popular, el corte ingles, pc diga, staples


In [4]:
def coletar_dados_por_periodo(models, sites, periodos):
    """Search Arquivo.pt for each model on each site and period.

    Every (site, model, period) combination is queried through
    ``arquivo_search``. Results are kept only when their title contains the
    model name and does not mention accessories (cases, screen protectors,
    selfie sticks).

    Args:
        models: Iterable of model names used both as the search query and as
            the literal text that must appear in the result title.
        sites: Iterable of host names to restrict the search to.
        periodos: Iterable of ``(from_year, to_year)`` pairs.

    Returns:
        A DataFrame with ``site``, ``model`` and ``periodo`` as the leading
        columns followed by the fields returned by the API, or ``None`` when
        no relevant row was found (a message is printed in that case).
    """
    lead_cols = ['site', 'model', 'periodo']
    info = []

    for site in sites:
        print(f"Extraindo dados de {site}")
        for model in models:
            for periodo in periodos:
                data = arquivo_search(query=model, site=site, from_year=periodo[0], to_year=periodo[1])
                if not data or not data.get('response_items'):
                    continue

                df = pd.DataFrame(data['response_items'])
                if 'title' not in df.columns:
                    continue

                df['site'] = site
                df['model'] = model
                df['periodo'] = f"{periodo[0]}-{periodo[1]}"
                df = df[lead_cols + [col for col in df.columns if col not in lead_cols]]

                # Drop unwanted results (cases, glass, screen protectors, selfie sticks).
                is_accessory = df['title'].str.contains('capa|vidro|película|selfie', case=False, na=False)
                # Require the model name in the title; regex=False so the name
                # is matched literally even if it contains regex metacharacters.
                has_model = df['title'].str.contains(model, case=False, na=False, regex=False)
                df = df[~is_accessory & has_model]

                # Skip frames that were filtered down to nothing so that an
                # all-empty run takes the "nothing found" path below.
                if not df.empty:
                    info.append(df)

    if info:
        return pd.concat(info, ignore_index=True)

    print("Nenhum dado relevante foi encontrado.")
    return None

To make the search more effective, we split the data extraction into time periods: 2005-2009, 2010-2015, 2016-2020 and 2021-2024.
We also decided to search only for smartphones, because it is easier to compare the prices of the same product in different stores and to extract that data from the websites.

In [None]:

# (from_year, to_year) windows passed to each search.
periodos = [
    (2005, 2009),
    (2010, 2015),
    (2016, 2020),
    (2021, 2024),
]

# Smartphone models to search for, grouped by product line.
models = [
    # Huawei P series
    'huawei p8',
    'huawei p20',
    'huawei p30',
    'huawei p40',
    'huawei p50',
    # Samsung Galaxy S series
    'samsung galaxy s8',
    'samsung galaxy s9',
    'samsung galaxy s10',
    'samsung galaxy s20',
    'samsung galaxy s21',
    'samsung galaxy s22',
    # Apple iPhone
    'apple iphone 6',
    'apple iphone 7',
    'apple iphone 8',
    'apple iphone x',
    'apple iphone xs',
    'apple iphone xr',
    'apple iphone 11',
    'apple iphone 12',
    'apple iphone 13',
    'apple iphone 14',
    # Huawei Mate series
    'huawei mate 20',
    'huawei mate 30',
    'huawei mate 40',
]

resultados = coletar_dados_por_periodo(models, sites, periodos)

# Write the results to a CSV file only when the search returned data.
if resultados is not None:
    resultados.to_csv('data/smartphones_arquivo.csv', index=False)


In [21]:
# Load the saved smartphone search results and preview the first rows.
df = pd.read_csv("data/smartphones_arquivo.csv")
df.head() 

Unnamed: 0,site,model,periodo,title,originalURL,linkToArchive,tstamp,contentLength,digest,mimeType,...,date,linkToScreenshot,linkToNoFrame,linkToExtractedText,linkToMetadata,linkToOriginalFile,snippet,fileName,collection,offset
0,www.fnac.pt,huawei p8,2016-2020,Huawei P8 Lite Huawei - Tecnologia - Fnac.pt,https://www.fnac.pt/Smartphones-e-Telemoveis/S...,https://arquivo.pt/wayback/20170819112525/http...,20170819112525,188977,d129cd257066d54e046fd147e20e50a0,text/html,...,1503141925,https://arquivo.pt/screenshot?url=https%3A%2F%...,https://arquivo.pt/noFrame/replay/201708191125...,https://arquivo.pt/textextracted?m=https%3A%2F...,https://arquivo.pt/textsearch?metadata=https%3...,https://arquivo.pt/noFrame/replay/201708191125...,<em>Huawei</em> <em>P8</em> Lite <em>Huawei</e...,IAH-20170819112306-68006-p81.arquivo.pt,AWP24,3773684
1,www.fnac.pt,huawei p8,2016-2020,"HUAWEI TELEMOVEL HUAWEI P8 DS 64GB, SmartPhone...",http://www.fnac.pt/HUAWEI-TELEMOVEL-HUAWEI-P8-...,https://arquivo.pt/wayback/20160211013245/http...,20160211013245,168566,e7cba63ab0ad2ff865ee01fc24aaeb21,text/html,...,1455154365,https://arquivo.pt/screenshot?url=https%3A%2F%...,https://arquivo.pt/noFrame/replay/201602110132...,https://arquivo.pt/textextracted?m=http%3A%2F%...,https://arquivo.pt/textsearch?metadata=http%3A...,https://arquivo.pt/noFrame/replay/201602110132...,<em>HUAWEI</em> TELEMOVEL <em>HUAWEI</em> <em>...,IAH-20160211012526-43988-p81.arquivo.pt,AWP20,44697354
2,www.fnac.pt,huawei p20,2016-2020,Smartphone Huawei P20 - 128GB - Black - SmartP...,https://www.fnac.pt/Smartphone-Huawei-P20-128G...,https://arquivo.pt/wayback/20190422222720/http...,20190422222720,311043,85b8ba9669ccbedeae1fcd8fe1d10abd,text/html,...,1555972040,https://arquivo.pt/screenshot?url=https%3A%2F%...,https://arquivo.pt/noFrame/replay/201904222227...,https://arquivo.pt/textextracted?m=https%3A%2F...,https://arquivo.pt/textsearch?metadata=https%3...,https://arquivo.pt/noFrame/replay/201904222227...,Smartphone <em>Huawei</em> <em>P20</em> - 128G...,WEB-20190422222653112-p81.arquivo.pt,AWP29,63201824
3,www.fnac.pt,huawei p20,2016-2020,Smartphone Huawei P20 Pro - 128GB - Black - Sm...,https://www.fnac.pt/Smartphone-Huawei-P20-Pro-...,https://arquivo.pt/wayback/20190422222711/http...,20190422222711,293053,2530c422432b25e633797fa5469eb6b5,text/html,...,1555972031,https://arquivo.pt/screenshot?url=https%3A%2F%...,https://arquivo.pt/noFrame/replay/201904222227...,https://arquivo.pt/textextracted?m=https%3A%2F...,https://arquivo.pt/textsearch?metadata=https%3...,https://arquivo.pt/noFrame/replay/201904222227...,Smartphone <em>Huawei</em> <em>P20</em> Pro - ...,WEB-20190422222653112-p81.arquivo.pt,AWP29,19579119
4,www.fnac.pt,huawei p30,2016-2020,Novos Huawei P30 - Sabe mais em Fnac.pt,https://www.fnac.pt/novos-huawei,https://arquivo.pt/wayback/20190323202914/http...,20190323202914,57729,38aaee64d7aa7b777d18759f0de09b53,text/html,...,1553372954,https://arquivo.pt/screenshot?url=https%3A%2F%...,https://arquivo.pt/noFrame/replay/201903232029...,https://arquivo.pt/textextracted?m=https%3A%2F...,https://arquivo.pt/textsearch?metadata=https%3...,https://arquivo.pt/noFrame/replay/201903232029...,Novos <em>Huawei</em> <em>P30</em> - Sabe mais...,WEB-20190323202843497-p81.arquivo.pt,AWP29,87173605


In [22]:
# Summarize each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   site                 128 non-null    object
 1   model                128 non-null    object
 2   periodo              128 non-null    object
 3   title                128 non-null    object
 4   originalURL          128 non-null    object
 5   linkToArchive        128 non-null    object
 6   tstamp               128 non-null    int64 
 7   contentLength        128 non-null    int64 
 8   digest               128 non-null    object
 9   mimeType             128 non-null    object
 10  encoding             128 non-null    object
 11  date                 128 non-null    int64 
 12  linkToScreenshot     128 non-null    object
 13  linkToNoFrame        128 non-null    object
 14  linkToExtractedText  128 non-null    object
 15  linkToMetadata       128 non-null    object
 16  linkToOr

In [23]:
# Count missing values per column.
print(df.isna().sum())

site                   0
model                  0
periodo                0
title                  0
originalURL            0
linkToArchive          0
tstamp                 0
contentLength          0
digest                 0
mimeType               0
encoding               0
date                   0
linkToScreenshot       0
linkToNoFrame          0
linkToExtractedText    0
linkToMetadata         0
linkToOriginalFile     0
snippet                0
fileName               0
collection             0
offset                 0
dtype: int64


In [24]:
# List the column names of the loaded results.
df.columns

Index(['site', 'model', 'periodo', 'title', 'originalURL', 'linkToArchive',
       'tstamp', 'contentLength', 'digest', 'mimeType', 'encoding', 'date',
       'linkToScreenshot', 'linkToNoFrame', 'linkToExtractedText',
       'linkToMetadata', 'linkToOriginalFile', 'snippet', 'fileName',
       'collection', 'offset'],
      dtype='object')

## Comparing the Black Friday Campaigns from 2018 to 2023

One of the main problems is that the API has no information after 2020. Since there is too much information to extract with web scraping, we decided to use only the information from the API and compare the Black Friday campaigns of 2018 to 2020.

In [21]:
# Load the Black Friday dates and their product lists, then display the table.
data = pd.read_csv('data/black_friday_portugal.csv')
data

Unnamed: 0,data,produtos
0,23-11-2018,"['Apple iPhone 8','Xbox One', 'PlayStation 4',..."
1,29-11-2019,"['MacBook Air', 'Apple iPhone 11', 'PlayStatio..."
2,27-11-2020,"['Nespresso', 'AirPods', 'Samsung Galaxy S10',..."
3,26-11-2021,"['Apple iPhone 12', 'Xbox One S', 'Samsung Gal..."
4,25-11-2022,"['Apple iPhone 13', 'MacBook Air', 'Sony WH-10..."
5,24-11-2023,"['Apple iPhone 14', 'Samsung Galaxy S22', 'Pla..."


To compare Black Friday from 2018 to 2020, we decided to research some products that were popular on each Black Friday date, such as smartphones, laptops, consoles, etc.


In [None]:
# The 'produtos' column stores Python list literals. ast.literal_eval parses
# them without executing code; the previous `eval` converter would have run
# any expression found in the CSV.
data = pd.read_csv('data/black_friday_portugal.csv', converters={'produtos': ast.literal_eval})

resultados_totais = []
leading_cols = ['site', 'data', 'produto']

for index, row in data.iterrows():
    date = row['data']
    # Dates in the CSV are day-first (e.g. 23-11-2018); parse once and derive
    # the one-day search window [Black Friday, next day] as YYYYMMDD strings.
    black_friday = pd.to_datetime(date, dayfirst=True)
    formatted_date = black_friday.strftime('%Y%m%d')
    to_date = (black_friday + pd.Timedelta(days=1)).strftime('%Y%m%d')

    # Product list for this Black Friday
    produtos = row['produtos']

    for site in sites:
        for produto in produtos:
            print(f"Extraindo dados para '{produto}' no site {site} na data {date}")
            result_data = arquivo_search(query=produto, site=site, from_year=formatted_date, to_year=to_date)

            if not result_data or not result_data.get('response_items'):
                continue

            df = pd.DataFrame(result_data['response_items'])
            if 'title' not in df.columns:
                continue

            df['site'] = site
            df['data'] = date
            df['produto'] = produto

            # Put the identifying columns first, followed by the API fields.
            cols = leading_cols + [col for col in df.columns if col not in leading_cols]
            df = df[cols]
            resultados_totais.append(df)

# Save everything to one CSV only when at least one search returned results.
if resultados_totais:
    resultados_df = pd.concat(resultados_totais, ignore_index=True)
    resultados_df.to_csv('data/resultados_black_friday_arquivo.csv', index=False)

General analysis of the evolution of Cyber Monday and Black Friday on the retailers' websites.
We call the API for each retailer's website and search for the terms "black friday" and "cyber monday" in the text of its pages. We then compare the estimated number of results returned for these terms on each retailer's website across the selected time periods.

In [None]:
# (from_year, to_year) windows used for the campaign-term counts.
# NOTE(review): 2021 appears in the last two windows — confirm the overlap is intended.
periodos = [
    (2006, 2009), (2010, 2013), (2014, 2015), (2016, 2017),
    (2018, 2019), (2020, 2021), (2021, 2024),
]

# (search query, label stored in the 'type' column), in the order they are queried.
campaign_terms = (
    ('black friday', 'black_friday'),
    ('cyber monday', 'cyber_monday'),
)

data_entries = []

for site in sites:
    for periodo in periodos:
        print(f"Extraindo dados para o site {site} no periodo {periodo[0]}-{periodo[1]}")

        for search_term, campaign_type in campaign_terms:
            result_data = arquivo_search(query=search_term, site=site, from_year=periodo[0], to_year=periodo[1])

            # The estimate only counts when the response actually carries items.
            if result_data and 'response_items' in result_data and result_data['response_items']:
                estimated_results = result_data.get('estimated_nr_results', 0)
            else:
                estimated_results = 0

            data_entries.append({
                'date': f"{periodo[0]}-{periodo[1]}",
                'type': campaign_type,
                'site': site,
                'number': estimated_results
            })

# One row per (site, period, campaign type), written out as CSV.
results_summary_df = pd.DataFrame(data_entries)
results_summary_df.to_csv('data/black_friday_summary.csv', index=False)