## Data Scraping from Business Insider Markets and Wikipedia

In [3]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [1]:
def scrape_index_components_to_csv(
    url,
    csv_filename,
    output_dir='C:/Users/Victor/Documents/GitHub/hslu-cip/data/raw_data_stage1',
    timeout=30,
):
    """Scrape an index-components table from a web page and save it as CSV.

    The page is expected to contain a ``<table class="table">`` with a
    ``<thead>`` holding the column titles and a ``<tbody>`` holding one
    ``<tr>`` per company.

    Args:
        url: Address of the page to download.
        csv_filename: File name (not a full path) of the CSV to write.
        output_dir: Folder the CSV is written into. Defaults to the folder
            that used to be hard-coded, so existing calls behave as before.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        pandas.DataFrame: The scraped table, identical to what was written
        to ``{output_dir}/{csv_filename}``.

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts, or a 4xx/5xx HTTP status.
        ValueError: If the page has no ``table.table`` header cells.
    """
    # Without a timeout a stalled server would block the notebook forever.
    response = requests.get(url, timeout=timeout)
    # Fail early on error pages instead of trying to parse them as a table.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the header names for columns.
    header_section = soup.select_one('table.table thead')
    header_cells = header_section.find_all('th') if header_section is not None else []
    if not header_cells:
        raise ValueError(f"No 'table.table' header cells found at {url}")
    # NOTE: header cells that stack two labels are joined without a separator
    # (e.g. "Latest PricePrevious Close"). This is kept as-is so the column
    # names of already produced CSV files stay stable.
    columns = [header.get_text(strip=True) for header in header_cells]

    # One list of cell texts per table row; the parts of a multi-line cell
    # are joined with a single space.
    companies_data = [
        [col.get_text(" ", strip=True) for col in row.find_all('td')]
        for row in soup.select('table.table tbody tr')
    ]

    # Create the DataFrame
    df = pd.DataFrame(companies_data, columns=columns)

    # Remove rows that have no value in the first column
    # (assumed to be the company name).
    df = df.dropna(subset=[columns[0]])

    # Save the DataFrame as CSV
    df.to_csv(f'{output_dir}/{csv_filename}', index=False)

    return df


In [2]:
def scrape_wikipedia_to_csv(
    url,
    csv_filename,
    table_position,
    output_dir='C:/Users/Victor/Documents/GitHub/hslu-cip/data/raw_data_stage1',
    timeout=30,
):
    """Scrape one sortable wikitable from a web page and save it as CSV.

    Args:
        url: Address of the page to download.
        csv_filename: File name (not a full path) of the CSV to write.
        table_position: Zero-based index of the wanted table among the
            tables found with the class string ``'wikitable sortable'``.
        output_dir: Folder the CSV is written into. Defaults to the folder
            that used to be hard-coded, so existing calls behave as before.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        pandas.DataFrame: The scraped table, identical to what was written
        to ``{output_dir}/{csv_filename}``.

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts, or a 4xx/5xx HTTP status.
        IndexError: If ``table_position`` does not refer to a table found
            on the page.
        ValueError: If the selected table has no header cells.
    """
    # Without a timeout a stalled server would block the notebook forever.
    response = requests.get(url, timeout=timeout)
    # Fail early on error pages instead of trying to parse them as a table.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # The candidate tables share a common class; the caller picks one by index.
    # NOTE(review): a multi-word class string is presumably compared against
    # the exact class attribute value, so tables carrying extra classes are
    # not counted. This is left unchanged because the callers' table_position
    # values depend on it.
    tables = soup.find_all('table', {'class': 'wikitable sortable'})

    try:
        table = tables[table_position]
    except IndexError:
        raise IndexError(
            f"table_position {table_position} is out of range: "
            f"{len(tables)} matching table(s) found at {url}"
        ) from None

    # Extract the header names for columns.
    # NOTE: this gathers every <th> in the table, so a table that also uses
    # <th> for row labels would yield more names than there are data columns.
    header_cells = table.find_all('th')
    if not header_cells:
        raise ValueError(f"Table {table_position} at {url} has no header cells")
    columns = [header.get_text(strip=True) for header in header_cells]

    # One list of cell texts per data row; rows without any <td> (such as the
    # header row) are skipped.
    companies_data = []
    for row in table.find_all('tr'):
        cols = row.find_all('td')
        if cols:
            companies_data.append([col.get_text(" ", strip=True) for col in cols])

    # Create the DataFrame
    df = pd.DataFrame(companies_data, columns=columns)

    # Remove rows that have no value in the first column
    # (assumed to be the company name).
    df = df.dropna(subset=[columns[0]])

    # Save the DataFrame as CSV
    df.to_csv(f'{output_dir}/{csv_filename}', index=False)

    return df

### AEX. Amsterdam Exchange Index

In [4]:
# Download the AEX component table from Business Insider Markets; the
# scraper also writes it to the stage-1 CSV named below.
csv_filename = 'aex_stock_data_stage1.csv'
url = "https://markets.businessinsider.com/index/components/aex"
df = scrape_index_components_to_csv(url=url, csv_filename=csv_filename)
print(df)

                         Name Latest PricePrevious Close    LowHigh  \
0                    ABN Amro                15.62 15.62  0.00 0.00   
1   Adyen B.V. Parts Sociales          1,442.60 1,442.60  0.00 0.00   
2              Ahold Delhaize                27.14 27.14  0.00 0.00   
3                  Akzo Nobel                66.10 66.10  0.00 0.00   
4               ArcelorMittal                23.80 23.80  0.00 0.00   
5                     ASML NV              840.30 840.30  0.00 0.00   
6           ASR Nederland N.V                45.50 45.50  0.00 0.00   
7                    Heineken                88.00 88.00  0.00 0.00   
8                        IMCD              153.10 153.10  0.00 0.00   
9                   ING Group                15.17 15.17  0.00 0.00   
10                        KPN                  3.38 3.38  0.00 0.00   
11                   NN Group                42.69 42.69  0.00 0.00   
12                    Philips                18.90 18.90  0.00 0.00   
13    

In [5]:
# Download the AEX table from Wikipedia; table_position=1 picks the second
# matching sortable wikitable on the page. The scraper also writes the CSV.
csv_filename = 'aex_wikipedia_data_stage1.csv'
url = "https://en.wikipedia.org/wiki/AEX_index"
df = scrape_wikipedia_to_csv(url=url, csv_filename=csv_filename, table_position=1)
print(df)

                      Company                                   ICBSector  \
0                       Adyen  support services, financial administration   
1                       Aegon                              life insurance   
2              Ahold Delhaize                food retailers & wholesalers   
3                   AkzoNobel                         specialty chemicals   
4               ArcelorMittal                                iron & steel   
5           ASM International                              semiconductors   
6                ASML Holding                              semiconductors   
7       Universal Music Group                               entertainment   
8           BE Semiconductors                              semiconductors   
9            DSM Firmenich AG                         specialty chemicals   
10                   Heineken                                     brewers   
11                       IMCD                         specialty chemicals   

### BEL 20. Brussels Stock Exchange

In [6]:
# Download the BEL 20 component table from Business Insider Markets; the
# scraper also writes it to the stage-1 CSV named below.
csv_filename = 'bel_20_stock_data_stage1.csv'
url = "https://markets.businessinsider.com/index/components/bel_20"
df = scrape_index_components_to_csv(url=url, csv_filename=csv_filename)
print(df)

                                     Name Latest PricePrevious Close  \
0   AB InBev SA-NV (Anheuser-Busch InBev)                54.88 54.88   
1             Ackermans & van Haaren S.A.              156.70 156.70   
2                                ageas NV                42.98 42.98   
3                  Ahold Delhaize (Ahold)                27.22 27.22   
4                          Cofinimmo S.A.                61.50 61.50   
5                            D'Ieteren NV              202.80 202.80   
6              Elia System Operator SA-NV                91.50 91.50   
7                     Engie (ex GDF Suez)                15.90 15.90   
8        Etablissementen Franz Colruyt NV                40.34 40.34   
9     Groupe Bruxelles Lambert S.A. (GBL)                68.85 68.85   
10                         KBC Groep N.V.                69.02 69.02   
11                          NV Bekaert SA                46.00 46.00   
12                 Proximus (ex Belgacom)                  7.25 

In [7]:
# Download the BEL 20 table from Wikipedia; table_position=1 picks the second
# matching sortable wikitable on the page. The scraper also writes the CSV.
csv_filename = 'bel_20_wikipedia_data_stage1.csv'
url = "https://en.wikipedia.org/wiki/BEL_20"
df = scrape_wikipedia_to_csv(url=url, csv_filename=csv_filename, table_position=1)
print(df)

                   Company             ICBSector              Ticker symbol  \
0                 AB InBev       Food & Beverage    Euronext Brussels : ABI   
1   Ackermans & van Haaren    Financial Services   Euronext Brussels : ACKB   
2                 Aedifica           Real Estate    Euronext Brussels : AED   
3                    Ageas             Insurance    Euronext Brussels : AGS   
4                   Aperam       Basic Resources   Euronext Brussels : APAM   
5           arGEN-X [ nl ]           Health Care   Euronext Brussels : ARGX   
6                Cofinimmo           Real Estate   Euronext Brussels : COFB   
7               Elia Group             Utilities    Euronext Brussels : ELI   
8                Galapagos           Health Care   Euronext Brussels : GLPG   
9                      GBL    Financial Services   Euronext Brussels : GBLB   
10                     KBC                 Banks    Euronext Brussels : KBC   
11          Melexis [ nl ]  Electronic Equipment   E

### CAC 40. Cotation Assistée en Continu
Benchmark French Stock Market Index

In [8]:
# Download the CAC 40 component table from Business Insider Markets; the
# scraper also writes it to the stage-1 CSV named below.
csv_filename = 'cac_40_stock_data_stage1.csv'
url = "https://markets.businessinsider.com/index/components/cac_40"
df = scrape_index_components_to_csv(url=url, csv_filename=csv_filename)
print(df)

                                Name Latest PricePrevious Close    LowHigh  \
0                        Air Liquide              186.72 186.72  0.00 0.00   
1                             Airbus              160.66 160.66  0.00 0.00   
2                      ArcelorMittal                23.80 23.80  0.00 0.00   
3                                AXA                33.82 33.82  0.00 0.00   
4                        BNP Paribas                65.46 65.46  0.00 0.00   
5                           Bouygues                36.09 36.09  0.00 0.00   
6                         Cap Gemini              200.50 200.50  0.00 0.00   
7                          Carrefour                15.72 15.72  0.00 0.00   
8                    Crédit Agricole                13.91 13.91  0.00 0.00   
9                             Danone                59.06 59.06  0.00 0.00   
10                             Engie                15.82 15.82  0.00 0.00   
11                  EssilorLuxottica              204.70 204.70 

In [9]:
# Download the CAC 40 table from Wikipedia; table_position=1 picks the second
# matching sortable wikitable on the page. The scraper also writes the CSV.
csv_filename = 'cac_40_wikipedia_data_stage1.csv'
url = "https://en.wikipedia.org/wiki/CAC_40"
df = scrape_wikipedia_to_csv(url=url, csv_filename=csv_filename, table_position=1)
print(df)

                      Company                  Sector  \
0                 Air Liquide         Basic Materials   
1                      Airbus             Industrials   
2                      Alstom             Industrials   
3               ArcelorMittal         Basic Materials   
4                         AXA      Financial Services   
5                 BNP Paribas      Financial Services   
6                    Bouygues             Industrials   
7                   Capgemini              Technology   
8                   Carrefour      Consumer Defensive   
9             Crédit Agricole      Financial Services   
10                     Danone      Consumer Defensive   
11          Dassault Systèmes              Technology   
12                    Edenred             Industrials   
13                      Engie               Utilities   
14           EssilorLuxottica              Healthcare   
15        Eurofins Scientific              Healthcare   
16                     Hermès  

### ISEQ 20. Ireland Overall Stock Exchange Index

In [10]:
# Download the ISEQ component table from Business Insider Markets; the
# scraper also writes it to the stage-1 CSV named below.
csv_filename = 'iseq_20_stock_data_stage1.csv'
url = "https://markets.businessinsider.com/index/components/iseq"
df = scrape_index_components_to_csv(url=url, csv_filename=csv_filename)
print(df)

                 Name Latest PricePrevious Close      LowHigh          +/-%  \
0             CRH plc                72.18 72.18  71.48 72.18  -0.94 -1.29%   
1     Kerry Group plc                78.15 78.15  78.15 78.15  -1.15 -1.45%   
2  Kingspan Group plc                79.60 79.60  79.60 79.60  -0.40 -0.50%   

                                            TimeDate     3 Mo.+/-%  \
0  05:52 AM 04/21/2024 05:52:53 AM UTC-0400 19.04...  13.30 21.33%   
1  02:06 AM 04/21/2024 02:06:36 AM UTC-0400 19.04...    3.80 4.97%   
2  02:26 AM 04/21/2024 02:26:01 AM UTC-0400 19.04...  10.82 14.99%   

      6 Mo.+/-%      1 Year+/-%  
0  11.63 62.97%    30.46 67.39%  
1    0.00 0.00%  -17.70 -18.06%  
2    0.00 0.00%    22.78 37.83%  


In [11]:
# Download the ISEQ 20 table from Wikipedia; table_position=0 picks the first
# matching sortable wikitable on the page. The scraper also writes the CSV.
csv_filename = 'iseq_20_wikipedia_data_stage1.csv'
url = "https://en.wikipedia.org/wiki/ISEQ_20"
df = scrape_wikipedia_to_csv(url=url, csv_filename=csv_filename, table_position=0)
print(df)

                 MNEM code                            Company        Domicile
0    Euronext Dublin : A5G                          AIB Group         Ireland
1   Euronext Dublin : BIRG                    Bank of Ireland         Ireland
2    Euronext Dublin : C5H                        Cairn Homes         Ireland
3    Euronext Dublin : DHG                 Dalata Hotel Group         Ireland
4    Euronext Dublin : EG7                       FBD Holdings         Ireland
5    Euronext Dublin : GYQ                    FD Technologies  United Kingdom
6    Euronext Dublin : GL9                            Glanbia         Ireland
7    Euronext Dublin : GVR               Glenveagh Properties         Ireland
8    Euronext Dublin : GRP               Greencoat Renewables         Ireland
9   Euronext Dublin : IR5B            Irish Continental Group         Ireland
10  Euronext Dublin : IRES  Irish Residential Properties REIT         Ireland
11   Euronext Dublin : KMR                  Kenmare Resources   

### OBX. Oslo Stock Exchange

In [12]:
# Download the OBX component table from Business Insider Markets; the
# scraper also writes it to the stage-1 CSV named below.
csv_filename = 'obx_stock_data_stage1.csv'
url = "https://markets.businessinsider.com/index/components/obx"
df = scrape_index_components_to_csv(url=url, csv_filename=csv_filename)
print(df)

                                    Name Latest PricePrevious Close  \
0                     Aker Solutions ASA                  3.25 3.25   
1                             BW LPG Ltd                12.29 12.29   
2             Det Norske Oljeselskap ASA                23.80 23.80   
3              DNO International ASA (A)                  0.88 0.88   
4                                Equinor                25.52 25.52   
5              Gjensidige Forsikring ASA                13.63 13.63   
6                                   Mowi                15.50 15.50   
7               Nordic Semiconductor ASA                  7.59 7.59   
8                        Norsk Hydro ASA                  6.04 6.04   
9                  Norwegian Air Shuttle                  1.41 1.41   
10                              Orkla AS                  6.10 6.10   
11                Otello Corporation ASA                  0.67 0.67   
12            Petroleum Geo-Services ASA                  0.73 0.73   
13    

In [13]:
# Download the OBX table from Wikipedia; table_position=0 picks the first
# matching sortable wikitable on the page. The scraper also writes the CSV.
csv_filename = 'obx_wikipedia_data_stage1.csv'
url = "https://en.wikipedia.org/wiki/OBX_Index"
df = scrape_wikipedia_to_csv(url=url, csv_filename=csv_filename, table_position=0)
print(df)

                  Company                                ICBsubsector  \
0                 Aker BP                        oil: crude producers   
1                  BW LPG                       marine transportation   
2                DNB Bank                                       banks   
3                 Equinor                      integrated oil and gas   
4               Frontline                       marine transportation   
5      Golden Ocean Group                       marine transportation   
6          Hafnia Limited                       marine transportation   
7    Höegh Autoliners ASA                       marine transportation   
8       Kongsberg Gruppen                     diversified industrials   
9                    Mowi  farming, fishing, ranching and plantations   
10    MPC Container Ships                       marine transportation   
11                    NEL                  renewable energy equipment   
12   Nordic Semiconductor                          

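### OSEBX. Oslo Børs Benchmark Index
Note: the Wikipedia data in this section is scraped from the OSEAX (Oslo Børs All-Share Index) page.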

In [14]:
# Download the OSEBX component table from Business Insider Markets; the
# scraper also writes it to the stage-1 CSV named below.
csv_filename = 'osebx_stock_data_stage1.csv'
url = "https://markets.businessinsider.com/index/components/osebx"
df = scrape_index_components_to_csv(url=url, csv_filename=csv_filename)
print(df)

                                    Name Latest PricePrevious Close  \
0              ABG Sundal Collier ASAShs                  0.47 0.47   
1                     Af Gruppen Asa (A)                11.00 11.00   
2                        Aker ASAShs -A-                51.70 51.70   
3                     Aker Solutions ASA                  3.25 3.25   
4          American Shipping Company ASA                  2.42 2.42   
5                               Atea ASA                10.98 10.98   
6                 Avance Gas Holding Ltd                11.70 11.70   
7                             Bakkafrost                54.90 54.90   
8                Biotec Pharmacon ASAShs                  2.25 2.25   
9                             BW LPG Ltd                12.29 12.29   
10            Det Norske Oljeselskap ASA                23.80 23.80   
11             DNO International ASA (A)                  0.88 0.88   
12          Entra ASA Unitary 144A-Reg S                  8.56 8.56   
13    

In [15]:
# Download the table from the OSEAX Wikipedia page; table_position=1 picks the
# second matching sortable wikitable. The scraper also writes the CSV.
csv_filename = 'osebx_wikipedia_data_stage1.csv'
url = "https://en.wikipedia.org/wiki/OSEAX"
df = scrape_wikipedia_to_csv(url=url, csv_filename=csv_filename, table_position=1)
print(df)

    Ticker                   Name Mkt Cap (Million NOK)
0     ACTA           Acta Holding                659.28
1      ACY                 Acergy              24124.35
2      AFG             AF Gruppen               2840.85
3      AFK  Arendals Fossekompani               3538.59
4      AGR              AGR Group               1758.13
..     ...                    ...                   ...
174    WRL    Wentworth Resources                178.99
175  WWASA       Wilh. Wilhelmsen                  7458
176    WWI   Wilh. Wilhelmsen ...               4727.96
177   WWIB   Wilh. Wilhelmsen ...               1619.81
178    YAR     Yara International              83428.89

[179 rows x 3 columns]


### PSI. Portuguese Stock Index

In [16]:
# Download the PSI 20 component table from Business Insider Markets; the
# scraper also writes it to the stage-1 CSV named below.
csv_filename = 'psi_20_stock_data_stage1.csv'
url = "https://markets.businessinsider.com/index/components/psi20"
df = scrape_index_components_to_csv(url=url, csv_filename=csv_filename)
print(df)

                                                 Name  \
0                                    ALTRI SGPS SAShs   
1                                      Cofina SGPS SA   
2                                  EDP Renovaveis, SA   
3                                        GALP Energia   
4     Grupo EDP S.A. (Electricidade de Portugal S.A.)   
5                          Jeronimo Martins SGPS S.A.   
6                                  Mota-Engil SGPS SA   
7                                        NOS SGPS, SA   
8                               Portugal Telecom S.A.   
9       REN - Redes Energeticas Nacionais SGPS, SAShs   
10  Sociedade de Investimento e Gestao SGPS SA SEMAPA   
11                                      Sonae SGPS SA   
12                                   Sonaecom SGPS SA   
13                              The Navigator Company   

   Latest PricePrevious Close      LowHigh          +/-%  \
0                   4.85 4.85    4.85 4.88  -0.05 -1.10%   
1                   0.39

In [17]:
# Download the PSI-20 table from Wikipedia; table_position=1 picks the second
# matching sortable wikitable on the page. The scraper also writes the CSV.
csv_filename = 'psi_20_wikipedia_data_stage1.csv'
url = "https://en.wikipedia.org/wiki/PSI-20"
df = scrape_wikipedia_to_csv(url=url, csv_filename=csv_filename, table_position=1)
print(df)

                        Company                              Industry Ticker  \
0                         Altri                       Basic Resources   ALTR   
1     Banco Comercial Português                                 Banks    BCP   
2             Corticeira Amorim           Industrial Goods & Services    COR   
3      CTT Correios de Portugal           Industrial Goods & Services    CTT   
4                EDP Renováveis                             Utilities   EDPR   
5          Energias de Portugal                             Utilities    EDP   
6                  Galp Energia                                Energy   GALP   
7                       Ibersol                      Travel & Leisure    IBS   
8              Jerónimo Martins  Personal Care, Drug & Grocery Stores    JMT   
9                    Mota-Engil              Construction & Materials    EGL   
10                          NOS                    Telecommunications    NOS   
11                     Novabase         