Building file paths for local use

In [1]:
import pandas as pd
import os

# Define the base directory where all your Eurostat files are stored
BASE_DIR = os.path.expanduser(r"~\OneDrive\Desktop\TIL Programming\6020 Group project\Project data_Freight") # change this to your folder path

# Build file paths safely using os.path.join
files = {
    "rail_go_total": os.path.join(BASE_DIR, "rail_go_total__custom_18309054_linear_2_0.csv"),
    "rail_if_line_na": os.path.join(BASE_DIR, "rail_if_line_na__custom_18309004_linear_2_0.csv"),
    "tran_hv_frmod": os.path.join(BASE_DIR, "tran_hv_frmod__custom_18309026_linear_2_0.csv"),
    "ttr00006": os.path.join(BASE_DIR, "ttr00006__custom_18309048_linear_2_0.csv"),
    "rail_go_consgmt": os.path.join(BASE_DIR, "rail_go_consgmt__custom_18308929_linear_2_0.csv"),
    "rail_go_grpgood": os.path.join(BASE_DIR, "rail_go_grpgood__custom_18309135_linear_2_0.csv")}

# EU27 country list (2020 definition)
EU27 = [
    "BE","BG","CZ","DK","DE","EE","IE","EL","ES","FR","HR","IT","CY",
    "LV","LT","LU","HU","MT","NL","AT","PL","PT","RO","SI","SK","FI","SE"]

Cleaning Functions

In [2]:
#Cleaning Geopolitcal data
def clean_geo(df):
    """Remove EU aggregates and keep only EU27 countries."""
    aggregates_pattern = r'EU|EA|EFTA|EWR|EUR|TOT'
    df = df[~df['geo'].astype(str).str.contains(aggregates_pattern, case=False, na=False)]
    df = df[df['geo'].isin(EU27)]
    return df

#Verify if each version is readable
def read_eurostat(path):
    if not os.path.exists(path):
        raise FileNotFoundError(f"File not found: {path}")
    print(f"Loading: {os.path.basename(path)}")
    return pd.read_csv(path, engine="python")

Loading and cleaning seperate Data sets

In [3]:
# Goods transported by rail (total)
rail_go_total = read_eurostat(files["rail_go_total"])
rail_go_total = rail_go_total[["geo", "TIME_PERIOD", "OBS_VALUE"]]
rail_go_total = clean_geo(rail_go_total)
rail_go_total.rename(columns={"OBS_VALUE": "rail_f_total_Nat_mio_tkm"}, inplace=True)

# Rail infrastructure length
rail_if_line_na = read_eurostat(files["rail_if_line_na"])
rail_if_line_na = rail_if_line_na[
    (rail_if_line_na["tra_infr"] == "TOTAL") &
    (rail_if_line_na["tra_meas"] == "FR_ONL")
]
rail_if_line_na = rail_if_line_na[["geo", "TIME_PERIOD", "OBS_VALUE"]]
rail_if_line_na = clean_geo(rail_if_line_na)
rail_if_line_na.rename(columns={"OBS_VALUE": "rail_length_km"}, inplace=True)

# Modal split (rail share)
tran_hv_frmod = read_eurostat(files["tran_hv_frmod"])
tran_hv_frmod = tran_hv_frmod[tran_hv_frmod["tra_mode"] == "RAIL"]
tran_hv_frmod = tran_hv_frmod[["geo", "TIME_PERIOD", "OBS_VALUE"]]
tran_hv_frmod = clean_geo(tran_hv_frmod)
tran_hv_frmod.rename(columns={"OBS_VALUE": "rail_modal_share_pc"}, inplace=True)

# Total rail freight (for ranking)
ttr00006 = read_eurostat(files["ttr00006"])
ttr00006 = ttr00006[["geo", "TIME_PERIOD", "OBS_VALUE"]]
ttr00006 = clean_geo(ttr00006)
ttr00006.rename(columns={"OBS_VALUE": "rail_f_total_INT_mio_tkm"}, inplace=True)

# Consignment types
rail_go_consgmt = read_eurostat(files["rail_go_consgmt"])
rail_go_consgmt = rail_go_consgmt[["geo", "TIME_PERIOD", "consign", "OBS_VALUE"]]
rail_go_consgmt = clean_geo(rail_go_consgmt)
# Pivot to columns per consignment type
rail_go_consgmt = rail_go_consgmt.pivot_table(
    index=["geo", "TIME_PERIOD"], columns="consign", values="OBS_VALUE"
).reset_index()
rail_go_consgmt.columns.name = None
rail_go_consgmt.rename(columns={
    "FT": "full_train_tkm",
    "FW": "full_wagon_tkm",
    "OT": "other_tkm",
    "TOTAL": "total_tkm"
}, inplace=True)

# Commodity groups (NST 2007)
rail_go_grpgood = read_eurostat(files["rail_go_grpgood"])
rail_go_grpgood.columns = [c.strip().replace('"', '') for c in rail_go_grpgood.columns]
if "nst07" in rail_go_grpgood.columns:
    rail_go_grpgood = rail_go_grpgood[["geo", "TIME_PERIOD", "nst07", "OBS_VALUE"]]
else:
    print("NST07 column not parsed cleanly, manual inspection needed.")
rail_go_grpgood = clean_geo(rail_go_grpgood)
rail_go_grpgood.rename(columns={"OBS_VALUE": "commodity_tkm"}, inplace=True)

Loading: rail_go_total__custom_18309054_linear_2_0.csv
Loading: rail_if_line_na__custom_18309004_linear_2_0.csv
Loading: tran_hv_frmod__custom_18309026_linear_2_0.csv
Loading: ttr00006__custom_18309048_linear_2_0.csv
Loading: rail_go_consgmt__custom_18308929_linear_2_0.csv
Loading: rail_go_grpgood__custom_18309135_linear_2_0.csv


Merge datasets

In [4]:
merged = (
    rail_go_total
    .merge(rail_if_line_na, on=["geo", "TIME_PERIOD"], how="left")
    .merge(tran_hv_frmod, on=["geo", "TIME_PERIOD"], how="left")
    .merge(ttr00006, on=["geo", "TIME_PERIOD"], how="left")
    .merge(rail_go_consgmt, on=["geo", "TIME_PERIOD"], how="left"))

Creating merged .csv file

In [5]:
output_path = os.path.join(BASE_DIR, "merged_eurostat_clean.csv")
merged.to_csv(output_path, index=False)
print(f"Merged dataset saved as: {output_path}")

Merged dataset saved as: C:\Users\youri\OneDrive\Desktop\TIL Programming\6020 Group project\Project data_Freight\merged_eurostat_clean.csv


Quick Check:

In [6]:
# Path to your merged dataset
DIR = os.path.expanduser(r"~\OneDrive\Desktop\TIL Programming\6020 Group project\Project data_Freight")
merged_path = os.path.join(DIR, "merged_eurostat_clean.csv")

# Load into pandas
df = pd.read_csv(merged_path)

# Quick preview
df.head()

Unnamed: 0,geo,TIME_PERIOD,rail_f_total_Nat_mio_tkm,rail_length_km,rail_modal_share_pc,rail_f_total_INT_mio_tkm,total_tkm,TRN_FUL,WAG_FUL
0,AT,2008,21915.0,0.0,33.6,,,,
1,AT,2009,17767.0,0.0,32.0,,,,
2,AT,2010,19833.0,0.0,33.0,,,,
3,AT,2011,20345.0,0.0,33.1,,,,
4,AT,2012,19499.0,0.0,32.7,,,,


Check structure

In [7]:
df.info()
df.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1448 entries, 0 to 1447
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   geo                       1448 non-null   object 
 1   TIME_PERIOD               1448 non-null   int64  
 2   rail_f_total_Nat_mio_tkm  1376 non-null   float64
 3   rail_length_km            1078 non-null   float64
 4   rail_modal_share_pc       1348 non-null   float64
 5   rail_f_total_INT_mio_tkm  1128 non-null   float64
 6   total_tkm                 562 non-null    float64
 7   TRN_FUL                   596 non-null    float64
 8   WAG_FUL                   550 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 101.9+ KB


Unnamed: 0,geo,TIME_PERIOD,rail_f_total_Nat_mio_tkm,rail_length_km,rail_modal_share_pc,rail_f_total_INT_mio_tkm,total_tkm,TRN_FUL,WAG_FUL
count,1448,1448.0,1376.0,1078.0,1348.0,1128.0,562.0,596.0,550.0
unique,25,,,,,,,,
top,AT,,,,,,,,
freq,58,,,,,,,,
mean,,2017.041436,39440.710029,439.669625,23.576113,39613.358156,45510.197509,29158.770134,18518.423636
std,,4.530288,62991.249683,838.580087,18.005468,63418.656122,59327.43529,47598.172566,17746.123546
min,,2008.0,67.0,0.0,0.6,67.0,0.0,0.0,0.0
25%,,2014.0,6532.0,0.0,10.8,6532.0,10376.5,0.0,2961.125
50%,,2017.0,16291.5,46.0,21.85,16347.0,27612.0,7406.5,20418.0
75%,,2021.0,48867.0,340.0,30.4,48900.25,44278.0,28254.5,27550.375


Check for missing data

In [8]:
(df.notna().sum() / len(df) * 100).round(1)

geo                         100.0
TIME_PERIOD                 100.0
rail_f_total_Nat_mio_tkm     95.0
rail_length_km               74.4
rail_modal_share_pc          93.1
rail_f_total_INT_mio_tkm     77.9
total_tkm                    38.8
TRN_FUL                      41.2
WAG_FUL                      38.0
dtype: float64

Diagnose what went wrong with network length data

In [9]:
# rail_if_line_na = read_eurostat(files["rail_if_line_na"])
# print(rail_if_line_na.columns.tolist())
# print(rail_if_line_na["tra_infr"].unique())
# print(rail_if_line_na["tra_meas"].unique())
# rail_if_line_na.head()