<a href="https://colab.research.google.com/github/francesco-scomazzon/CapstoneProject/blob/main/WorldNuclearPowerReactors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Data scraping**

In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

# Page on world-nuclear.org that hosts the reactors / uranium requirements table.
link = "https://world-nuclear.org/information-library/facts-and-figures/world-nuclear-power-reactors-and-uranium-requireme"

# A timeout keeps the cell from hanging forever if the server never answers.
request = requests.get(link, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page,
# which would otherwise only surface later as an empty or malformed table.
request.raise_for_status()
soup = BeautifulSoup(request.text, "html.parser")

In [3]:
# Non-empty text of every <td> inside the table's <thead>, in document order.
old_table_header = [
    label
    for label in (cell.get_text(strip=True) for cell in soup.select('thead td'))
    if label
]

# Each scraped group header is expanded into one column per unit:
# (position in old_table_header, suffix appended to that header).
column_suffixes = [
    (1, " (TWh)"),
    (1, " (% e)"),
    (2, " (No.)"),
    (2, " (MWe net)"),
    (3, " (No.)"),
    (3, " (MWe gross)"),
    (4, " (No.)"),
    (4, " (MWe gross)"),
    (5, " (No.)"),
    (5, " (MWe gross)"),
    (6, " (tonnes U)"),
]

# The first column keeps only the part of the leading header before any "(".
table_header = [old_table_header[0].split("(")[0].strip()]
table_header += [old_table_header[position] + suffix for position, suffix in column_suffixes]

print(table_header)

['COUNTRY', 'NUCLEAR ELECTRICITY GENERATION 2022 (TWh)', 'NUCLEAR ELECTRICITY GENERATION 2022 (% e)', 'REACTORS OPERABLE (No.)', 'REACTORS OPERABLE (MWe net)', 'REACTORS UNDER CONSTRUCTION (No.)', 'REACTORS UNDER CONSTRUCTION (MWe gross)', 'REACTORS PLANNED (No.)', 'REACTORS PLANNED (MWe gross)', 'REACTORS PROPOSED (No.)', 'REACTORS PROPOSED (MWe gross)', 'URANIUM REQUIRED 2024 (tonnes U)']


In [4]:
# Build one list of cell texts per <tr> found inside <tbody>.
table_rows = []
for table_row in soup.select('tbody tr'):
    # The <th> cell (country name) leads the row and is kept even when its text is blank.
    header_cell = table_row.find('th')
    cells = [header_cell.get_text(strip=True)] if header_cell else []

    # Data cells: only <td> elements with non-empty text are kept.
    cells.extend(
        value
        for value in (data_cell.get_text(strip=True) for data_cell in table_row.find_all('td'))
        if value
    )

    # Rows that produced no cells at all are skipped.
    if cells:
        table_rows.append(cells)

print(table_rows)

[['Argentina', '7.5', '5.4', '3', '1641', '1', '29', '1', '1150', '1', '750', '219'], ['Armenia', '2.6', '31.0', '1', '416', '0', '0', '0', '0', '1', '1060', '55'], ['Bangladesh', '0', '0', '0', '0', '2', '2400', '0', '0', '2', '2400', '371'], ['Belarus', '4.4', '11.9', '2', '2220', '0', '0', '0', '0', '0', '0', '357'], ['Belgium', '41.7', '46.4', '5', '3916', '0', '0', '0', '0', '0', '0', '516'], ['Brazil', '13.7', '2.5', '2', '1884', '1', '1405', '0', '0', '8', '8000', '339'], ['Bulgaria', '15.8', '32.6', '2', '2006', '0', '0', '2', '2300', '0', '0', '334'], ['Canada', '81.7', '12.9', '19', '13,661', '0', '0', '2', '400', '9', '5700', '1455'], ['China', '395.4', '5.0', '56', '54,362', '26', '29,755', '41', '44,660', '158', '186,450', '13,132'], ['Czech Republic', '29.3', '36.7', '6', '4212', '0', '0', '1', '1200', '3', '3600', '715'], ['Egypt', '0', '0', '0', '0', '4', '4800', '0', '0', '0', '0', '0'], ['Finland', '24.2', '35.0', '5', '4369', '0', '0', '0', '0', '0', '0', '616'], ['F

In [5]:
# Assemble the scraped rows into a DataFrame labelled with the expanded headers.
# Every value is still the raw text taken from the page (no numeric conversion yet).
df = pd.DataFrame(data=table_rows, columns=table_header)

df

Unnamed: 0,COUNTRY,NUCLEAR ELECTRICITY GENERATION 2022 (TWh),NUCLEAR ELECTRICITY GENERATION 2022 (% e),REACTORS OPERABLE (No.),REACTORS OPERABLE (MWe net),REACTORS UNDER CONSTRUCTION (No.),REACTORS UNDER CONSTRUCTION (MWe gross),REACTORS PLANNED (No.),REACTORS PLANNED (MWe gross),REACTORS PROPOSED (No.),REACTORS PROPOSED (MWe gross),URANIUM REQUIRED 2024 (tonnes U)
0,Argentina,7.5,5.4,3,1641,1,29,1,1150,1,750,219
1,Armenia,2.6,31.0,1,416,0,0,0,0,1,1060,55
2,Bangladesh,0.0,0,0,0,2,2400,0,0,2,2400,371
3,Belarus,4.4,11.9,2,2220,0,0,0,0,0,0,357
4,Belgium,41.7,46.4,5,3916,0,0,0,0,0,0,516
5,Brazil,13.7,2.5,2,1884,1,1405,0,0,8,8000,339
6,Bulgaria,15.8,32.6,2,2006,0,0,2,2300,0,0,334
7,Canada,81.7,12.9,19,13661,0,0,2,400,9,5700,1455
8,China,395.4,5.0,56,54362,26,29755,41,44660,158,186450,13132
9,Czech Republic,29.3,36.7,6,4212,0,0,1,1200,3,3600,715


**Data cleaning**

In [6]:
def clean_column_header(header):
    """Return ``header`` in sentence case with whitespace normalised.

    Runs of whitespace collapse to single spaces, leading/trailing whitespace
    is dropped, the first character is capitalised and everything after it is
    lower-cased (so unit labels such as "MWe" become "mwe").
    """
    normalised = " ".join(header.split())
    return normalised.capitalize()

# Apply clean_column_header to every column label; rename returns a new frame.
df = df.rename(columns=clean_column_header)
df



Unnamed: 0,Country,Nuclear electricity generation 2022 (twh),Nuclear electricity generation 2022 (% e),Reactors operable (no.),Reactors operable (mwe net),Reactors under construction (no.),Reactors under construction (mwe gross),Reactors planned (no.),Reactors planned (mwe gross),Reactors proposed (no.),Reactors proposed (mwe gross),Uranium required 2024 (tonnes u)
0,Argentina,7.5,5.4,3,1641,1,29,1,1150,1,750,219
1,Armenia,2.6,31.0,1,416,0,0,0,0,1,1060,55
2,Bangladesh,0.0,0,0,0,2,2400,0,0,2,2400,371
3,Belarus,4.4,11.9,2,2220,0,0,0,0,0,0,357
4,Belgium,41.7,46.4,5,3916,0,0,0,0,0,0,516
5,Brazil,13.7,2.5,2,1884,1,1405,0,0,8,8000,339
6,Bulgaria,15.8,32.6,2,2006,0,0,2,2300,0,0,334
7,Canada,81.7,12.9,19,13661,0,0,2,400,9,5700,1455
8,China,395.4,5.0,56,54362,26,29755,41,44660,158,186450,13132
9,Czech Republic,29.3,36.7,6,4212,0,0,1,1200,3,3600,715


**Merge to keep only the European countries**

In [7]:
# Country names kept by the filter below; they must match the scraped
# "Country" values exactly for a row to survive the inner merge.
# NOTE(review): Belarus appears in the scraped table but is not in this list —
# confirm the exclusion is intended.
Country = [
    "Albania", "Andorra", "Armenia", "Austria", "Azerbaijan",
    "Belgium", "Bosnia and Herzegovina", "Bulgaria", "Croatia", "Cyprus",
    "Czech Republic", "Denmark", "Estonia", "Finland", "France",
    "Georgia", "Germany", "Greece", "Hungary", "Iceland",
    "Ireland", "Italy", "Latvia", "Liechtenstein", "Lithuania",
    "Luxembourg", "Malta", "Moldova", "Monaco", "Montenegro",
    "Netherlands", "North Macedonia", "Norway", "Poland", "Portugal",
    "Romania", "San Marino", "Serbia", "Slovakia", "Slovenia",
    "Spain", "Sweden", "Switzerland", "Turkey", "Ukraine",
    "United Kingdom",
]

# Single-column frame used purely as the right-hand side of the merge.
Country_df = pd.DataFrame({"Country": Country})

# Inner merge on "Country": only rows whose country is in the list are kept.
european_nuclear_df = pd.merge(df, Country_df, on="Country", how="inner")
european_nuclear_df

Unnamed: 0,Country,Nuclear electricity generation 2022 (twh),Nuclear electricity generation 2022 (% e),Reactors operable (no.),Reactors operable (mwe net),Reactors under construction (no.),Reactors under construction (mwe gross),Reactors planned (no.),Reactors planned (mwe gross),Reactors proposed (no.),Reactors proposed (mwe gross),Uranium required 2024 (tonnes u)
0,Armenia,2.6,31.0,1,416,0,0,0,0,1,1060,55
1,Belgium,41.7,46.4,5,3916,0,0,0,0,0,0,516
2,Bulgaria,15.8,32.6,2,2006,0,0,2,2300,0,0,334
3,Czech Republic,29.3,36.7,6,4212,0,0,1,1200,3,3600,715
4,Finland,24.2,35.0,5,4369,0,0,0,0,0,0,616
5,France,282.1,62.5,56,61370,1,1650,0,0,6,9900,8232
6,Germany,31.9,5.8,0,0,0,0,0,0,0,0,0
7,Hungary,15.0,47.0,4,1916,0,0,2,2400,0,0,320
8,Netherlands,3.9,3.3,1,482,0,0,0,0,2,2000,69
9,Poland,0.0,0.0,0,0,0,0,3,3750,26,10000,0


**Export the DataFrame from Google Colab as a CSV file**

In [8]:
# Write the filtered table to the working directory, without the index column.
european_nuclear_df.to_csv("EuropeanNuclearPowerReactors.csv", index=False)

# google.colab is only importable inside a Colab runtime. Guard the import so
# the cell still completes elsewhere, where the CSV written above is the result.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab: EuropeanNuclearPowerReactors.csv was saved to the working directory.")
else:
    # In Colab, trigger a browser download of the file just written.
    files.download("EuropeanNuclearPowerReactors.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>