In [1]:
import pandas as pd
import os
import csv


In [73]:
# Reference table of European countries and their ISO3 codes
# (columns `Country` and `ISO3 Code`, per the .info() output further down).
european_countries_iso3 = pd.read_csv("../data/raw/European_Countries_ISO3_Codes.csv")

In [74]:
# Saved HTML page that the next cell parses for the `cntry` code table.
# NOTE(review): presumably the ESS (European Social Survey) codebook — confirm the source.
path_to_ess = '../data/raw/ESS.html'

In [75]:
from bs4 import BeautifulSoup

# Destination of the extracted code -> name table. Defined up front so it
# exists regardless of what the HTML contains.
csv_filename = '../data/raw/country_mapping.csv'

# Load the HTML file
with open(path_to_ess, 'r', encoding='utf-8') as file:
    soup = BeautifulSoup(file, 'html.parser')

# Locate the text node whose content is exactly 'cntry'; the table that
# follows it in document order is taken to hold the country codes.
# Adjust the lookup if the HTML structure changes.
cntry_section = soup.find(string='cntry')
if cntry_section is None:
    # Fail loudly instead of hitting a NameError (or silently reusing stale
    # kernel variables) in the CSV-writing step below.
    raise ValueError(f"No 'cntry' section found in {path_to_ess!r}")

table = cntry_section.find_next('table')
if table is None:
    raise ValueError(f"No <table> follows the 'cntry' section in {path_to_ess!r}")

# Extract rows of the table
rows = table.find_all('tr')

# Parse each row for country codes and names: first cell is the code,
# second cell is the name. Rows with fewer than two <td> cells are skipped.
country_mapping = {}
for row in rows[1:]:  # Skip header row
    columns = row.find_all('td')
    if len(columns) >= 2:
        code = columns[0].text.strip()
        name = columns[1].text.strip()
        country_mapping[code] = name

if not country_mapping:
    # An empty CSV reported as "created successfully" would only surface
    # later as an all-NaN merge, so stop here.
    raise ValueError(f"The 'cntry' table in {path_to_ess!r} contained no code/name rows")

with open(csv_filename, mode='w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    # Write header
    csv_writer.writerow(['Country', 'ISO2 Code'])
    # Write rows (name first, then code, matching the header order)
    for code, name in country_mapping.items():
        csv_writer.writerow([name, code])

print(f"CSV file '{csv_filename}' has been created successfully.")


CSV file '../data/raw/country_mapping.csv' has been created successfully.


In [76]:
# Preview the first rows of the ISO3 reference table.
european_countries_iso3.head()

Unnamed: 0,Country,ISO3 Code
0,Albania,ALB
1,Andorra,AND
2,Austria,AUT
3,Belarus,BLR
4,Belgium,BEL


In [77]:
# Load the country-name / ISO2-code table from country_mapping.csv
# (columns `Country` and `ISO2 Code`, per the .info() output below).
european_countries_iso2 = pd.read_csv("../data/raw/country_mapping.csv")

In [78]:
# Check row count, column names and null counts of the ISO2 table.
european_countries_iso2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Country    40 non-null     object
 1   ISO2 Code  40 non-null     object
dtypes: object(2)
memory usage: 768.0+ bytes


In [79]:
# Same check for the ISO3 table, to compare its row count with the ISO2 table.
european_countries_iso3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47 entries, 0 to 46
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Country    47 non-null     object
 1   ISO3 Code  47 non-null     object
dtypes: object(2)
memory usage: 880.0+ bytes


In [80]:
# Outer join on the country name: every row from both tables is kept, and a
# country that appears in only one table (or whose name is spelled differently
# in the two sources) gets NaN in the other table's code column.
eruopean_countries = pd.merge(european_countries_iso3, european_countries_iso2, on='Country', how='outer')  # 'outer' keeps unmatched rows from both sides


In [81]:
# Inspect the first 20 merged rows to spot countries that are missing a code.
eruopean_countries.head(20)


Unnamed: 0,Country,ISO3 Code,ISO2 Code
0,Albania,ALB,AL
1,Andorra,AND,
2,Austria,AUT,AT
3,Belarus,BLR,
4,Belgium,BEL,BE
5,Bosnia & Herzegovina,BIH,
6,Bulgaria,BGR,BG
7,Croatia,HRV,HR
8,Cyprus,CYP,CY
9,Czech Republic,CZE,


In [82]:
# Non-null counts per column show how many countries lack an ISO3 or ISO2 code after the merge.
eruopean_countries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Country    51 non-null     object
 1   ISO3 Code  47 non-null     object
 2   ISO2 Code  40 non-null     object
dtypes: object(3)
memory usage: 1.3+ KB


In [151]:

# Remove the row whose index label is 20 from the merged table.
# NOTE(review): which country this row holds is not visible in this notebook;
# consider selecting it by its `Country` value instead of a hard-coded label.
# NOTE(review): re-running this cell raises KeyError because label 20 no longer exists.
eruopean_countries = eruopean_countries.drop([20])

In [183]:
# Inspect the first 30 rows of the merged table after the row removal above.
eruopean_countries.head(30)

Unnamed: 0,Country,ISO3 Code,ISO2 Code
0,Albania,ALB,AL
1,Andorra,AND,AD
2,Austria,AUT,AT
3,Belarus,BLR,BY
4,Belgium,BEL,BE
5,Bosnia & Herzegovina,BIH,BA
6,Bulgaria,BGR,BG
7,Croatia,HRV,HR
8,Cyprus,CYP,CY
9,Czech Republic,CZE,CZ


In [153]:
# Renumber the rows 0..n-1 after the row removal above. `drop=True` discards
# the old index instead of inserting it as an `index` / `level_0` column, so
# no follow-up column drop is needed and the cell is safe to re-run.
# (Replaces the former reset_index -> drop "level_0" -> reset_index sequence,
# which raised KeyError when executed top to bottom on a fresh kernel.)
eruopean_countries = eruopean_countries.reset_index(drop=True)

In [184]:
# Persist the merged country table for downstream use.
output_path = "../data/processed/europe_countries.csv"
# Create the target directory on a fresh checkout; to_csv does not create it.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# index=False already omits the index column, so index_label is not needed.
eruopean_countries.to_csv(output_path, index=False)