In [2]:
import pandas as pd
import numpy as np
import duckdb

In [3]:
import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this blanket filter also hides pandas warnings such as
# SettingWithCopyWarning and FutureWarning — consider narrowing it.
warnings.filterwarnings('ignore')

In [4]:
# Base folders used to build the input paths below.
# data_folder holds the simulation CSVs (presumably not shipped with the app, per its name);
# static_folder holds the socio-economic CSV that is read further down.
data_folder = "staticNotDeployed/"
static_folder = "static/"

In [5]:
# Previous input locations, kept for reference:
# scenario_BNZ_path = "simulations_new/BNZ.csv"
# scenario_holder_path = "simulations_new/BNZ.csv"

# New data (March 2025). The file name is reproduced as-is, including its "annualy" spelling.
# Presumably the values are annual £ millions — confirm with the data owner.
scenario_BNZ_path = data_folder + "simulations_new/march2025/BNZ_£millions_annualy.csv"
# File name only; it is prefixed with static_folder when the CSV is read.
socio_factors_path = "UK_Archetypes_global_measures.csv"

## Create pandas tables

In [7]:
# Load the socio-economic measures table from the static folder.
socio_csv = static_folder + socio_factors_path
df_socio = pd.read_csv(socio_csv)

In [8]:
# Discard incomplete records: any row with at least one missing value is removed
# (a single row in the current file).
df_socio = df_socio.dropna(axis=0, how='any')

In [9]:
# Let pandas pick the best nullable extension dtype per column (string, Int64, Float64, ...).
# Columns with mixed content stay `object` and are cleaned up explicitly further down.
df_socio = df_socio.convert_dtypes()

In [10]:
df_socio

Unnamed: 0,Nation,LAD,MSOA,LSOA.DZ.CD,LSOA.DZ.NM,Under.35,Over.65,EPC,Median.Income,Tenure,...,Rurality,House.value,Fuel.Type,Fuel.consumption.total,Floor.area,Gas.flag,Number.cars,Urban.trips,Total.vkm,Urban.vkm
0,NI,N09000001,-,N20000001,Dunsilly_A1,0.75,0.25,4,23318,1,...,2,137688,3,37665,180,0,2,414,12923,2584
1,NI,N09000001,-,N20000002,Dunsilly_B1,0.65,0.35,4,23318,1,...,2,149950,3,35381,167,0,2,376,11730,2346
2,NI,N09000001,-,N20000003,Dunsilly_A2,0.66,0.34,4,23318,1,...,2,137688,3,32461,143,0,2,369,11504,2301
3,NI,N09000001,-,N20000004,Dunsilly_A3,0.73,0.27,5,23318,1,...,2,137688,3,30303,126,0,2,387,12075,2415
4,NI,N09000001,-,N20000005,Dunsilly_B2,0.7,0.3,4,23318,1,...,2,134250,3,35166,182,0,2,408,12719,2544
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46547,Scotland,S12000040,-,S01013477,Broxburn South - 06,0.34,0.23,3.0,23951,1,...,1,175000,1,31622,97,1.0,2,392,15827,10491
46548,Scotland,S12000040,-,S01013478,Broxburn East - 01,0.42,0.14,3.0,34382,1,...,1,130000,1,12784,68,1.0,2,699,15224,10091
46549,Scotland,S12000040,-,S01013479,Broxburn East - 02,0.34,0.26,3.0,21767,1,...,1,72500,1,15801,69,1.0,2,461,14259,9452
46550,Scotland,S12000040,-,S01013480,Broxburn East - 03,0.4,0.23,3.0,23972,1,...,1,72500,1,17025,75,1.0,2,360,13406,8886


In [11]:
# Column names such as "LSOA.DZ.CD" contain dots; swap every dot for an underscore.
df_socio = df_socio.rename(columns=lambda name: name.replace('.', '_'))

In [12]:
df_socio.columns

Index(['Nation', 'LAD', 'MSOA', 'LSOA_DZ_CD', 'LSOA_DZ_NM', 'Under_35',
       'Over_65', 'EPC', 'Median_Income', 'Tenure', 'Typology', 'Unemployment',
       'Rurality', 'House_value', 'Fuel_Type', 'Fuel_consumption_total',
       'Floor_area', 'Gas_flag', 'Number_cars', 'Urban_trips', 'Total_vkm',
       'Urban_vkm'],
      dtype='object')

In [13]:
# EPC and gas_flag have weird values
df_socio.dtypes

Nation                    string[python]
LAD                       string[python]
MSOA                      string[python]
LSOA_DZ_CD                string[python]
LSOA_DZ_NM                string[python]
Under_35                         Float64
Over_65                          Float64
EPC                               object
Median_Income                      Int64
Tenure                             Int64
Typology                           Int64
Unemployment                     Float64
Rurality                           Int64
House_value                        Int64
Fuel_Type                          Int64
Fuel_consumption_total             Int64
Floor_area                         Int64
Gas_flag                          object
Number_cars                        Int64
Urban_trips                        Int64
Total_vkm                          Int64
Urban_vkm                          Int64
dtype: object

In [14]:
set(df_socio.EPC)

{1.0, '2', 2.0, '3', 3.0, '4', 4.0, '5', 5.0, '6', 6.0, '7', 7.0, 'd'}

In [15]:
set(df_socio.Gas_flag)

{'0', 0.0, '1', 1.0, 'Y'}

In [16]:
# Gas_flag mixes floats, numeric strings and the marker 'Y'.
# Step 1: turn the 'Y' marker into a missing value.
# NOTE(review): 'Y' might mean "yes" (i.e. 1) rather than "unknown" — confirm with the data owner.
gas_flag = df_socio['Gas_flag'].replace('Y', pd.NA)

# Step 2: parse everything else as numbers ('1' -> 1.0); anything unparseable becomes NaN.
gas_flag = pd.to_numeric(gas_flag, errors='coerce')

# Step 3: store as a nullable 16-bit integer so the missing values survive.
df_socio['Gas_flag'] = gas_flag.astype('Int16')

In [17]:
# EPC mixes floats, numeric strings and the marker 'd'.
# Step 1: turn the 'd' marker into a missing value.
# NOTE(review): the meaning of 'd' is not documented here — confirm it really denotes "no data".
epc = df_socio['EPC'].replace('d', pd.NA)

# Step 2: parse everything else as numbers ('3' -> 3.0); anything unparseable becomes NaN.
epc = pd.to_numeric(epc, errors='coerce')

# Step 3: store as a nullable 16-bit integer so the missing values survive.
df_socio['EPC'] = epc.astype('Int16')

### Cobenefit table

In [18]:
# Load the BNZ scenario co-benefit table from the March 2025 CSV.
df = pd.read_csv(scenario_BNZ_path)

In [19]:
# Drop every row in which any column holds the spreadsheet error marker '#DIV/0!'
# (the new dataset contains such rows).
# .copy() detaches the filtered result, so later column assignments act on an
# independent DataFrame instead of triggering SettingWithCopyWarning on a slice.
has_div_error = df.isin(['#DIV/0!']).any(axis=1)
df = df[~has_div_error].copy()

In [20]:
# Tag every row with its scenario name so the app's existing queries keep working unchanged.
df = df.assign(scenario="BNZ")

In [21]:
df.dtypes

Lookup Value    object
Coben           object
2025            object
2026            object
2027            object
2028            object
2029            object
2030            object
2031            object
2032            object
2033            object
2034            object
2035            object
2036            object
2037            object
2038            object
2039            object
2040            object
2041            object
2042            object
2043            object
2044            object
2045            object
2046            object
2047            object
2048            object
2049            object
2050            object
scenario        object
dtype: object

In [22]:
# The yearly columns ("2025" ... "2050") are loaded as `object`; cast them all to 32-bit floats.
year_columns = list(map(str, range(2025, 2051)))
df = df.astype({column: np.float32 for column in year_columns})

In [23]:
df.dtypes

Lookup Value     object
Coben            object
2025            float32
2026            float32
2027            float32
2028            float32
2029            float32
2030            float32
2031            float32
2032            float32
2033            float32
2034            float32
2035            float32
2036            float32
2037            float32
2038            float32
2039            float32
2040            float32
2041            float32
2042            float32
2043            float32
2044            float32
2045            float32
2046            float32
2047            float32
2048            float32
2049            float32
2050            float32
scenario         object
dtype: object

In [24]:
# Row-wise total over every yearly column, 2025 through 2050 inclusive.
# (An earlier version of the data used "<year> (£m)" column names.)
all_year_columns = [str(year) for year in range(2025, 2051)]
df["total (£m)"] = df[all_year_columns].sum(axis=1)

In [25]:
# Strip the " (£m)" unit suffix from column names (e.g. "total (£m)" -> "total") so that
# no column name contains special characters.
# regex=False is explicit because the pattern contains parentheses: as a regular
# expression it would be a capture group and would not match the literal suffix.
df.columns = df.columns.str.replace(' (£m)', '', regex=False)

In [26]:
# Make column names identifier-friendly: every space becomes an underscore
# (e.g. "Lookup Value" -> "Lookup_Value").
df.columns = [column.replace(' ', '_') for column in df.columns]

In [27]:
df.head()

Unnamed: 0,Lookup_Value,Coben,2025,2026,2027,2028,2029,2030,2031,2032,...,2043,2044,2045,2046,2047,2048,2049,2050,scenario,total
0,N20000001,Air quality,0.003495,0.004546,0.005847,0.007228,0.008179,0.008061,0.008427,0.008942,...,0.010453,0.010551,0.010555,0.010573,0.010527,0.010476,0.010389,0.010268,BNZ,0.237583
1,N20000001,Noise,0.001345,0.002123,0.003191,0.003107,0.003973,0.006073,0.006876,0.006949,...,0.0104,0.010248,0.010086,0.009848,0.009746,0.009516,0.009293,0.00916,BNZ,0.207463
2,N20000001,Congestion,0.002332,0.002218,0.00203,0.001956,0.001207,0.000835,0.00133,0.001141,...,0.006464,0.007158,0.009144,0.009932,0.010757,0.011624,0.012209,0.015007,BNZ,0.122029
3,N20000001,Road repairs,0.002734,0.002986,0.003332,0.0035,0.003507,0.003968,0.003941,0.00357,...,0.002681,0.00253,0.002736,0.002624,0.002646,0.002455,0.002444,0.002503,BNZ,0.079201
4,N20000001,Road safety,0.000656,0.000484,0.000399,0.000309,2.2e-05,-8.9e-05,-0.000157,-0.000281,...,0.002021,0.002387,0.003246,0.003709,0.004182,0.004577,0.005,0.00587,BNZ,0.036578


In [28]:
df.total

0         2.375832e-01
1         2.074626e-01
2         1.220294e-01
3         7.920101e-02
4         3.657842e-02
              ...     
557107    1.804824e-01
557108    9.130131e-07
557109    4.796808e-03
557110    6.356870e-01
557111    3.583661e+00
Name: total, Length: 554641, dtype: float32

In [29]:
np.max(df.total)

149.6426239013672

In [30]:
np.mean(df.total)

0.93391794

In [31]:
df.dtypes

Lookup_Value     object
Coben            object
2025            float32
2026            float32
2027            float32
2028            float32
2029            float32
2030            float32
2031            float32
2032            float32
2033            float32
2034            float32
2035            float32
2036            float32
2037            float32
2038            float32
2039            float32
2040            float32
2041            float32
2042            float32
2043            float32
2044            float32
2045            float32
2046            float32
2047            float32
2048            float32
2049            float32
2050            float32
scenario         object
total           float32
dtype: object

## Join socio-economic (SE) factors and co-benefits

In [32]:
# Attach the socio-economic measures to each co-benefit row by matching the area code.
# Inner join: rows whose Lookup_Value has no match in df_socio are dropped.
df = df.merge(
    df_socio,
    how='inner',
    left_on='Lookup_Value',
    right_on='LSOA_DZ_CD',
)

In [33]:
df.head()

Unnamed: 0,Lookup_Value,Coben,2025,2026,2027,2028,2029,2030,2031,2032,...,Rurality,House_value,Fuel_Type,Fuel_consumption_total,Floor_area,Gas_flag,Number_cars,Urban_trips,Total_vkm,Urban_vkm
0,N20000001,Air quality,0.003495,0.004546,0.005847,0.007228,0.008179,0.008061,0.008427,0.008942,...,2,137688,3,37665,180,0,2,414,12923,2584
1,N20000001,Noise,0.001345,0.002123,0.003191,0.003107,0.003973,0.006073,0.006876,0.006949,...,2,137688,3,37665,180,0,2,414,12923,2584
2,N20000001,Congestion,0.002332,0.002218,0.00203,0.001956,0.001207,0.000835,0.00133,0.001141,...,2,137688,3,37665,180,0,2,414,12923,2584
3,N20000001,Road repairs,0.002734,0.002986,0.003332,0.0035,0.003507,0.003968,0.003941,0.00357,...,2,137688,3,37665,180,0,2,414,12923,2584
4,N20000001,Road safety,0.000656,0.000484,0.000399,0.000309,2.2e-05,-8.9e-05,-0.000157,-0.000281,...,2,137688,3,37665,180,0,2,414,12923,2584


In [34]:
# The co-benefit column name varies between data versions; normalise it to a stable name.
df = df.rename(columns={'Coben': 'co_benefit_type'})

## Aggregating the time

In [35]:
# Width, in years, of each aggregation window used when the yearly columns are summed together.
time_step = 5

In [36]:
# Calendar years covered by the yearly columns: 2025 through 2050 inclusive.
years = [year for year in range(2025, 2051)]

In [37]:
len(years)

26

In [38]:
df[["2025", "2026"]]

Unnamed: 0,2025,2026
0,3.495246e-03,4.545631e-03
1,1.345039e-03,2.122763e-03
2,2.332261e-03,2.217988e-03
3,2.733760e-03,2.985896e-03
4,6.563730e-04,4.837550e-04
...,...,...
556117,1.039560e-04,1.210047e-03
556118,7.523670e-10,-1.061550e-08
556119,6.166430e-08,3.779780e-06
556120,1.951576e-02,3.200329e-02


In [39]:
# Aggregate the yearly values into consecutive windows of `time_step` years.
# Each window becomes one column named Y<first>_<last> (e.g. Y2025_2029) that holds the
# row-wise sum of the yearly columns it covers; the yearly columns are then removed.
# NOTE(review): only complete windows are generated, so with 26 years and a 5-year step
# the final year (2050) falls in no window even though it is included in `total`.
if True:  # set to False to skip the aggregation and keep the yearly columns
    for i in range(0, len(years) - (time_step - 1), time_step):
        # Slice with time_step (not a hard-coded 5) so the window width follows the setting.
        window_years = [str(year) for year in years[i:i + time_step]]
        print(window_years)
        df[f'Y{window_years[0]}_{window_years[-1]}'] = df[window_years].sum(axis=1)

    # Drop the single-year columns to save space
    df = df.drop(columns=[str(year) for year in years])

['2025', '2026', '2027', '2028', '2029']
['2030', '2031', '2032', '2033', '2034']
['2035', '2036', '2037', '2038', '2039']
['2040', '2041', '2042', '2043', '2044']
['2045', '2046', '2047', '2048', '2049']


In [40]:
# df[ ['2025', '2026', '2027', '2028', '2029', 'Y2025_2029'] ]

In [41]:
# Convert every int64 column to int32, halving its storage width.
int64_columns = df.select_dtypes(np.int64).columns
df[int64_columns] = df[int64_columns].astype(np.int32)

In [42]:
df

Unnamed: 0,Lookup_Value,co_benefit_type,scenario,total,Nation,LAD,MSOA,LSOA_DZ_CD,LSOA_DZ_NM,Under_35,...,Gas_flag,Number_cars,Urban_trips,Total_vkm,Urban_vkm,Y2025_2029,Y2030_2034,Y2035_2039,Y2040_2044,Y2045_2049
0,N20000001,Air quality,BNZ,2.375832e-01,NI,N09000001,-,N20000001,Dunsilly_A1,0.75,...,0,2,414,12923,2584,2.929422e-02,4.477071e-02,4.903598e-02,5.169436e-02,5.252027e-02
1,N20000001,Noise,BNZ,2.074626e-01,NI,N09000001,-,N20000001,Dunsilly_A1,0.75,...,0,2,414,12923,2584,1.373900e-02,3.627460e-02,4.907896e-02,5.072070e-02,4.848914e-02
2,N20000001,Congestion,BNZ,1.220294e-01,NI,N09000001,-,N20000001,Dunsilly_A1,0.75,...,0,2,414,12923,2584,9.742812e-03,5.053860e-03,1.022713e-02,2.833174e-02,5.366638e-02
3,N20000001,Road repairs,BNZ,7.920101e-02,NI,N09000001,-,N20000001,Dunsilly_A1,0.75,...,0,2,414,12923,2584,1.605823e-02,1.818745e-02,1.550715e-02,1.404074e-02,1.290418e-02
4,N20000001,Road safety,BNZ,3.657842e-02,NI,N09000001,-,N20000001,Dunsilly_A1,0.75,...,0,2,414,12923,2584,1.870352e-03,-1.044911e-03,7.986065e-04,8.369693e-03,2.071488e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556117,S01013481,Excess cold,BNZ,1.804824e-01,Scotland,S12000040,-,S01013481,Broxburn East - 04,0.45,...,1,2,298,18643,12357,4.330068e-03,1.975439e-02,3.440068e-02,4.724859e-02,6.193027e-02
556118,S01013481,Excess heat,BNZ,9.130131e-07,Scotland,S12000040,-,S01013481,Broxburn East - 04,0.45,...,1,2,298,18643,12357,-7.223389e-08,-1.200843e-07,1.415980e-07,3.804091e-07,4.100572e-07
556119,S01013481,Dampness,BNZ,4.796808e-03,Scotland,S12000040,-,S01013481,Broxburn East - 04,0.45,...,1,2,298,18643,12357,1.625220e-04,6.381780e-04,1.083056e-03,1.257128e-03,1.385116e-03
556120,S01013481,Diet change,BNZ,6.356870e-01,Scotland,S12000040,-,S01013481,Broxburn East - 04,0.45,...,1,2,298,18643,12357,1.784100e-01,1.165155e-01,1.083907e-01,1.087795e-01,1.092549e-01


In [43]:
set(df.EPC)

{1, 2, 3, 4, 5, 6, 7, <NA>}

In [44]:
df.dtypes

Lookup_Value                      object
co_benefit_type                   object
scenario                          object
total                            float32
Nation                    string[python]
LAD                       string[python]
MSOA                      string[python]
LSOA_DZ_CD                string[python]
LSOA_DZ_NM                string[python]
Under_35                         Float64
Over_65                          Float64
EPC                                Int16
Median_Income                      int32
Tenure                             int32
Typology                           int32
Unemployment                     Float64
Rurality                           int32
House_value                        int32
Fuel_Type                          int32
Fuel_consumption_total             int32
Floor_area                         int32
Gas_flag                           Int16
Number_cars                        int32
Urban_trips                        int32
Total_vkm       

In [60]:
df[(df["co_benefit_type"] == "Noise") & (df["Lookup_Value"] == "N20000001")]

Unnamed: 0,Lookup_Value,co_benefit_type,scenario,total,Nation,LAD,MSOA,LSOA_DZ_CD,LSOA_DZ_NM,Under_35,...,Gas_flag,Number_cars,Urban_trips,Total_vkm,Urban_vkm,Y2025_2029,Y2030_2034,Y2035_2039,Y2040_2044,Y2045_2049
1,N20000001,Noise,BNZ,0.207463,NI,N09000001,-,N20000001,Dunsilly_A1,0.75,...,0,2,414,12923,2584,0.013739,0.036275,0.049079,0.050721,0.048489


In [61]:
df

Unnamed: 0,Lookup_Value,co_benefit_type,scenario,total,Nation,LAD,MSOA,LSOA_DZ_CD,LSOA_DZ_NM,Under_35,...,Gas_flag,Number_cars,Urban_trips,Total_vkm,Urban_vkm,Y2025_2029,Y2030_2034,Y2035_2039,Y2040_2044,Y2045_2049
0,N20000001,Air quality,BNZ,2.375832e-01,NI,N09000001,-,N20000001,Dunsilly_A1,0.75,...,0,2,414,12923,2584,2.929422e-02,4.477071e-02,4.903598e-02,5.169436e-02,5.252027e-02
1,N20000001,Noise,BNZ,2.074626e-01,NI,N09000001,-,N20000001,Dunsilly_A1,0.75,...,0,2,414,12923,2584,1.373900e-02,3.627460e-02,4.907896e-02,5.072070e-02,4.848914e-02
2,N20000001,Congestion,BNZ,1.220294e-01,NI,N09000001,-,N20000001,Dunsilly_A1,0.75,...,0,2,414,12923,2584,9.742812e-03,5.053860e-03,1.022713e-02,2.833174e-02,5.366638e-02
3,N20000001,Road repairs,BNZ,7.920101e-02,NI,N09000001,-,N20000001,Dunsilly_A1,0.75,...,0,2,414,12923,2584,1.605823e-02,1.818745e-02,1.550715e-02,1.404074e-02,1.290418e-02
4,N20000001,Road safety,BNZ,3.657842e-02,NI,N09000001,-,N20000001,Dunsilly_A1,0.75,...,0,2,414,12923,2584,1.870352e-03,-1.044911e-03,7.986065e-04,8.369693e-03,2.071488e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556117,S01013481,Excess cold,BNZ,1.804824e-01,Scotland,S12000040,-,S01013481,Broxburn East - 04,0.45,...,1,2,298,18643,12357,4.330068e-03,1.975439e-02,3.440068e-02,4.724859e-02,6.193027e-02
556118,S01013481,Excess heat,BNZ,9.130131e-07,Scotland,S12000040,-,S01013481,Broxburn East - 04,0.45,...,1,2,298,18643,12357,-7.223389e-08,-1.200843e-07,1.415980e-07,3.804091e-07,4.100572e-07
556119,S01013481,Dampness,BNZ,4.796808e-03,Scotland,S12000040,-,S01013481,Broxburn East - 04,0.45,...,1,2,298,18643,12357,1.625220e-04,6.381780e-04,1.083056e-03,1.257128e-03,1.385116e-03
556120,S01013481,Diet change,BNZ,6.356870e-01,Scotland,S12000040,-,S01013481,Broxburn East - 04,0.45,...,1,2,298,18643,12357,1.784100e-01,1.165155e-01,1.083907e-01,1.087795e-01,1.092549e-01


## Add the number of households (we have the values in another old table)

In [72]:
# The household counts (HHs column) are only available in the older BNZ export.
households_csv = data_folder + "simulations_new/BNZ.csv"
df_nb_households = pd.read_csv(households_csv)

In [74]:
len(df_nb_households)

557112

In [79]:
# Build a lookup with one household count per area: keep only the key and HHs columns,
# then keep the first row seen for each Lookup_Value.
df_for_merge = (
    df_nb_households[['Lookup_Value', 'HHs']]
    .drop_duplicates('Lookup_Value')
)

In [84]:
# Attach the household count (HHs) to every row, reusing the de-duplicated lookup
# prepared in df_for_merge instead of rebuilding it inline.
# Inner join: rows whose Lookup_Value has no household count are dropped.
# validate='many_to_one' makes pandas raise if the lookup ever holds duplicate keys,
# which would otherwise silently multiply rows.
df = df.merge(df_for_merge, on='Lookup_Value', how='inner', validate='many_to_one')

In [87]:
len(df)

556122

## Export table as parquet file

In [88]:
# Write the final joined table next to the other static assets. The path is built from
# static_folder (the same setting used to read the inputs) so the location is
# configured in a single place.
df.to_parquet(static_folder + 'database.parquet')

In [197]:
# Not needed anymore, everything is in the same table
# df_socio.to_parquet('static/tableSocio.parquet')

In [89]:
df.columns

Index(['Lookup_Value', 'co_benefit_type', 'scenario', 'total', 'Nation', 'LAD',
       'MSOA', 'LSOA_DZ_CD', 'LSOA_DZ_NM', 'Under_35', 'Over_65', 'EPC',
       'Median_Income', 'Tenure', 'Typology', 'Unemployment', 'Rurality',
       'House_value', 'Fuel_Type', 'Fuel_consumption_total', 'Floor_area',
       'Gas_flag', 'Number_cars', 'Urban_trips', 'Total_vkm', 'Urban_vkm',
       'Y2025_2029', 'Y2030_2034', 'Y2035_2039', 'Y2040_2044', 'Y2045_2049',
       'HHs'],
      dtype='object')

In [91]:
set(df.Nation)

{'Eng/Wales', 'NI', 'Scotland'}

In [133]:
df.co_benefit_type

0          Air quality
1                Noise
2           Congestion
3         Road repairs
4          Road safety
              ...     
556117     Excess cold
556118     Excess heat
556119        Dampness
556120     Diet change
556121           Total
Name: co_benefit_type, Length: 556122, dtype: object

## Export subset for faster development 

In [122]:
df

Unnamed: 0,Lookup_Value,co_benefit_type,2025,2026,2027,2028,2029,2030,2031,2032,...,Rurality,House_value,Fuel_Type,Fuel_consumption_total,Floor_area,Gas_flag,Number_cars,Urban_trips,Total_vkm,Urban_vkm
0,N20000001,Air quality,3.495246e-03,4.545631e-03,5.846987e-03,7.227724e-03,8.178627e-03,8.060945e-03,8.426776e-03,8.942410e-03,...,2,137688,3,37665,180,0,2,414,12923,2584
1,N20000001,Noise,1.345039e-03,2.122763e-03,3.191027e-03,3.107497e-03,3.972673e-03,6.072646e-03,6.876487e-03,6.949075e-03,...,2,137688,3,37665,180,0,2,414,12923,2584
2,N20000001,Congestion,2.332261e-03,2.217988e-03,2.029615e-03,1.955506e-03,1.207442e-03,8.346050e-04,1.329813e-03,1.140636e-03,...,2,137688,3,37665,180,0,2,414,12923,2584
3,N20000001,Road repairs,2.733760e-03,2.985896e-03,3.331585e-03,3.500383e-03,3.506608e-03,3.967531e-03,3.940610e-03,3.570292e-03,...,2,137688,3,37665,180,0,2,414,12923,2584
4,N20000001,Road safety,6.563730e-04,4.837550e-04,3.990420e-04,3.094560e-04,2.172590e-05,-8.859600e-05,-1.572280e-04,-2.813460e-04,...,2,137688,3,37665,180,0,2,414,12923,2584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2780605,S01013481,Excess cold,1.026941e-04,7.528737e-04,5.563542e-04,1.147938e-03,2.770141e-03,1.206036e-03,2.316936e-03,3.127054e-03,...,1,252995,1,24480,96,1,2,298,18643,12357
2780606,S01013481,Excess heat,1.693859e-09,-1.384168e-08,1.343553e-08,-1.013713e-07,-2.880965e-08,-1.337351e-07,1.206930e-08,1.857267e-08,...,1,252995,1,24480,96,1,2,298,18643,12357
2780607,S01013481,Dampness,3.834547e-05,3.763206e-05,1.010562e-05,6.931372e-05,-4.717624e-05,3.616986e-04,7.791065e-06,2.348168e-04,...,1,252995,1,24480,96,1,2,298,18643,12357
2780608,S01013481,Diet change,3.055232e-02,2.467668e-02,6.447578e-02,5.904762e-02,4.999083e-02,6.013978e-02,1.979823e-02,3.186790e-02,...,1,252995,1,24480,96,1,2,298,18643,12357


In [123]:
df.columns

Index(['Lookup_Value', 'co_benefit_type', '2025', '2026', '2027', '2028',
       '2029', '2030', '2031', '2032', '2033', '2034', '2035', '2036', '2037',
       '2038', '2039', '2040', '2041', '2042', '2043', '2044', '2045', '2046',
       '2047', '2048', '2049', '2050', 'scenario', 'total', 'Nation', 'LAD',
       'MSOA', 'LSOA_DZ_CD', 'LSOA_DZ_NM', 'Under_35', 'Over_65', 'EPC',
       'Median_Income', 'Tenure', 'Typology', 'Unemployment', 'Rurality',
       'House_value', 'Fuel_Type', 'Fuel_consumption_total', 'Floor_area',
       'Gas_flag', 'Number_cars', 'Urban_trips', 'Total_vkm', 'Urban_vkm'],
      dtype='object')

In [124]:
# Development subset: keep only the rows whose Nation is "NI".
# Alternative kept for reference (random sample of 10 rows per LAD):
# df_sampled = df.groupby('LAD').apply(lambda x: x.sample(n=10)).reset_index(drop=True)
is_ni = df["Nation"] == "NI"
df_sampled = df[is_ni]

In [125]:
df_sampled

Unnamed: 0,Lookup_Value,co_benefit_type,2025,2026,2027,2028,2029,2030,2031,2032,...,Rurality,House_value,Fuel_Type,Fuel_consumption_total,Floor_area,Gas_flag,Number_cars,Urban_trips,Total_vkm,Urban_vkm
0,N20000001,Air quality,3.495246e-03,4.545631e-03,5.846987e-03,7.227724e-03,8.178627e-03,8.060945e-03,8.426776e-03,8.942410e-03,...,2,137688,3,37665,180,0,2,414,12923,2584
1,N20000001,Noise,1.345039e-03,2.122763e-03,3.191027e-03,3.107497e-03,3.972673e-03,6.072646e-03,6.876487e-03,6.949075e-03,...,2,137688,3,37665,180,0,2,414,12923,2584
2,N20000001,Congestion,2.332261e-03,2.217988e-03,2.029615e-03,1.955506e-03,1.207442e-03,8.346050e-04,1.329813e-03,1.140636e-03,...,2,137688,3,37665,180,0,2,414,12923,2584
3,N20000001,Road repairs,2.733760e-03,2.985896e-03,3.331585e-03,3.500383e-03,3.506608e-03,3.967531e-03,3.940610e-03,3.570292e-03,...,2,137688,3,37665,180,0,2,414,12923,2584
4,N20000001,Road safety,6.563730e-04,4.837550e-04,3.990420e-04,3.094560e-04,2.172590e-05,-8.859600e-05,-1.572280e-04,-2.813460e-04,...,2,137688,3,37665,180,0,2,414,12923,2584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2267857,N20003780,Excess cold,9.926216e-05,2.023556e-03,8.575203e-04,-1.099166e-04,-2.065125e-03,4.147600e-03,-1.987975e-04,1.765109e-03,...,2,161985,3,26394,106,0,2,395,12334,2467
2267858,N20003780,Excess heat,-6.201756e-09,-3.063379e-08,-3.561648e-08,-5.411495e-08,-1.781081e-08,-6.124070e-08,-8.144563e-08,-2.712894e-08,...,2,161985,3,26394,106,0,2,395,12334,2467
2267859,N20003780,Dampness,7.290739e-05,-3.292369e-06,-7.793160e-05,2.476983e-04,2.503835e-04,4.885546e-04,1.499853e-04,4.278001e-04,...,2,161985,3,26394,106,0,2,395,12334,2467
2267860,N20003780,Diet change,3.015957e-03,-9.118543e-03,1.803454e-02,1.526367e-02,-3.046252e-02,2.232430e-02,8.573382e-03,6.869538e-03,...,2,161985,3,26394,106,0,2,395,12334,2467


In [126]:
df_sampled[(df_sampled["Lookup_Value"] == "N20000001") & (df_sampled["co_benefit_type"] == "Dampness")][["2028", "scenario"]]

Unnamed: 0,2028,scenario
9,7.3e-05,BNZ
556131,-0.000132,Engagement
1112253,0.000131,Tailwinds
1668375,0.000234,Headwinds
2224497,1.2e-05,Innovation


In [127]:
# Write the NI-only development subset. The path is built from static_folder so the
# output location is configured in a single place.
df_sampled.to_parquet(static_folder + 'database_onlyIreland.parquet')