In [278]:
import pandas as pd
import numpy as np
import duckdb

In [279]:
import warnings
warnings.filterwarnings('ignore')

In [280]:
data_folder = "staticNotDeployed/"
static_folder = "static/"

In [281]:
# scenario_BNZ_path = "simulations_new/BNZ.csv"
# scenario_holder_path = "simulations_new/BNZ.csv"

# New data march 2025
scenario_BNZ_path = data_folder + "simulations_new/march2025/BNZ_£millions_annualy.csv"
scenario_holder_path = data_folder + "simulations_new/march2025/BNZ_£millions_annualy.csv"

socio_factors_path = "UK_Archetypes_global_measures.csv"

scenario_paths = [scenario_BNZ_path, scenario_holder_path]
scenario_names = ["BNZ", "Engagement", "Tailwinds", "Headwinds", "Innovation"]

## Create pandas tables

In [282]:
df_socio = pd.read_csv(static_folder+socio_factors_path)

In [283]:
# Remove one row on NaN
df_socio = df_socio.dropna()

In [284]:
df_socio = df_socio.convert_dtypes()

In [285]:
df_socio

Unnamed: 0,Nation,LAD,MSOA,LSOA.DZ.CD,LSOA.DZ.NM,Under.35,Over.65,EPC,Median.Income,Tenure,...,Rurality,House.value,Fuel.Type,Fuel.consumption.total,Floor.area,Gas.flag,Number.cars,Urban.trips,Total.vkm,Urban.vkm
0,NI,N09000001,-,N20000001,Dunsilly_A1,0.75,0.25,4,23318,1,...,2,137688,3,37665,180,0,2,414,12923,2584
1,NI,N09000001,-,N20000002,Dunsilly_B1,0.65,0.35,4,23318,1,...,2,149950,3,35381,167,0,2,376,11730,2346
2,NI,N09000001,-,N20000003,Dunsilly_A2,0.66,0.34,4,23318,1,...,2,137688,3,32461,143,0,2,369,11504,2301
3,NI,N09000001,-,N20000004,Dunsilly_A3,0.73,0.27,5,23318,1,...,2,137688,3,30303,126,0,2,387,12075,2415
4,NI,N09000001,-,N20000005,Dunsilly_B2,0.7,0.3,4,23318,1,...,2,134250,3,35166,182,0,2,408,12719,2544
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46547,Scotland,S12000040,-,S01013477,Broxburn South - 06,0.34,0.23,3.0,23951,1,...,1,175000,1,31622,97,1.0,2,392,15827,10491
46548,Scotland,S12000040,-,S01013478,Broxburn East - 01,0.42,0.14,3.0,34382,1,...,1,130000,1,12784,68,1.0,2,699,15224,10091
46549,Scotland,S12000040,-,S01013479,Broxburn East - 02,0.34,0.26,3.0,21767,1,...,1,72500,1,15801,69,1.0,2,461,14259,9452
46550,Scotland,S12000040,-,S01013480,Broxburn East - 03,0.4,0.23,3.0,23972,1,...,1,72500,1,17025,75,1.0,2,360,13406,8886


In [286]:
df_socio.rename(columns=lambda x: x.replace('.', '_'), inplace=True)

In [287]:
df_socio.columns

Index(['Nation', 'LAD', 'MSOA', 'LSOA_DZ_CD', 'LSOA_DZ_NM', 'Under_35',
       'Over_65', 'EPC', 'Median_Income', 'Tenure', 'Typology', 'Unemployment',
       'Rurality', 'House_value', 'Fuel_Type', 'Fuel_consumption_total',
       'Floor_area', 'Gas_flag', 'Number_cars', 'Urban_trips', 'Total_vkm',
       'Urban_vkm'],
      dtype='object')

In [288]:
# EPC and gas_flag have weird values
df_socio.dtypes

Nation                    string[python]
LAD                       string[python]
MSOA                      string[python]
LSOA_DZ_CD                string[python]
LSOA_DZ_NM                string[python]
Under_35                         Float64
Over_65                          Float64
EPC                               object
Median_Income                      Int64
Tenure                             Int64
Typology                           Int64
Unemployment                     Float64
Rurality                           Int64
House_value                        Int64
Fuel_Type                          Int64
Fuel_consumption_total             Int64
Floor_area                         Int64
Gas_flag                          object
Number_cars                        Int64
Urban_trips                        Int64
Total_vkm                          Int64
Urban_vkm                          Int64
dtype: object

In [289]:
set(df_socio.EPC)

{1.0, '2', 2.0, '3', 3.0, '4', 4.0, '5', 5.0, '6', 6.0, '7', 7.0, 'd'}

In [290]:
set(df_socio.Gas_flag)

{'0', 0.0, '1', 1.0, 'Y'}

In [291]:
# Replace the string 'unknown' with pd.NA
df_socio['Gas_flag'] = df_socio['Gas_flag'].replace('Y', pd.NA)

# Step 2: Convert to numeric (converts '1.0' to 1.0 as float)
df_socio['Gas_flag'] = pd.to_numeric(df_socio['Gas_flag'], errors='coerce')

df_socio['Gas_flag'] = df_socio['Gas_flag'].astype('Int16')

In [292]:
# Replace the string 'unknown' with pd.NA
df_socio['EPC'] = df_socio['EPC'].replace('d', pd.NA)

# Step 2: Convert to numeric (converts '1.0' to 1.0 as float)
df_socio['EPC'] = pd.to_numeric(df_socio['EPC'], errors='coerce')

df_socio['EPC'] = df_socio['EPC'].astype('Int16')

### Cobenefit table

In [293]:
dfs = []
years = list(range( 2025, 2051 ))
years_col = [str(year) for year in years]

COBENEFS = ["Air quality", "Noise", "Excess cold", "Excess heat", "Dampness", "Congestion", "Hassle costs", "Road repairs", "Road safety", "Physical activity", "Diet change"]


# Function to add noise to each column based on its mean and std
def add_noise(column, year):

    # print(column)

    # Apply noise on a year, given ONE cobenef at a time
    for cobenef in COBENEFS:
        values = column[column["Coben"] == cobenef][year]
        # print(values)
        mean = values.mean()
        std = values.std()
        # print(mean, std, np.random.normal(mean, std, size=values.shape))

        # print(2222)
        # print(column[column["Coben"] == cobenef][year])
        
        noise = np.random.normal(mean, std, size=values.shape)
        # column[column["Coben"] == cobenef][year] = column[column["Coben"] == cobenef][year] + noise
        # column[column["Coben"] == cobenef][year] = column[column["Coben"] == cobenef][year]
        column.loc[column["Coben"] == cobenef, year] += noise
        
        # print(column[column["Coben"] == cobenef][year] + noise)
        # print(column[column["Coben"] == cobenef][year])
        
    
    # mean = column.mean()
    # std = column.std()
    # noise = np.random.normal(mean, std, size=column.shape)
    # new_col = column + noise
    # print(23, new_col)
    # return new_col

    return column
    


for i, scenario in enumerate(scenario_names):
    # if i == 2:
        # break

    # GENERATE FAKE DATA FOR NOW FOR ALL SCENATIOS EXCEPT BNZ
    if (scenario == "BNZ"):
        df_one_scenario = pd.read_csv(scenario_paths[i])
    else:
        df_one_scenario = pd.read_csv(scenario_paths[0])

        # df_one_scenario[[str(y) for y in years]] = df_one_scenario[[str(y) for y in years]]

        # Remove weird values
        df_one_scenario = df_one_scenario[~df_one_scenario.isin(['#DIV/0!']).any(axis=1)]
        
        # Convert columns with names from 2025 to 2050 to float
        df_one_scenario[years_col] = df_one_scenario[years_col].astype(np.float32)
        
        
        # Apply the function to each specified column
        for year in years:
            # df[str(year)] = df[str(year)](lambda col: add_noise(col), axis=0)
            # df[str(year)] = df.apply(lambda x: add_noise(x[str(year)]), axis=0)
            df_one_scenario[[str(year), "Coben"]] = add_noise(df_one_scenario[[str(year), "Coben"]], str(year))

            # print(1)
            # print(df_one_scenario[[str(year), "Coben"]])
            # print(add_noise(df_one_scenario[[str(year), "Coben"]], str(year)))
            

    df_one_scenario["scenario"] = scenario_names[i]
    dfs.append(df_one_scenario)
    
df = pd.concat(dfs, axis=0)
del dfs 

In [334]:
# df
# df[ [str(y) for y in years] ]
# df[(df["Lookup Value"] == "N20000001") & (df["Coben"] == "Noise")]
# df[(df["Lookup_Value"] == "N20000001") & (df["co_benefit_type"] == "Dampness")]
# df[(df["Lookup_Value"] == "N20000001") & (df["co_benefit_type"] == "Dampness")][["2028", "scenario"]]


# df.columns


In [295]:
# Step: Drop rows where any column contains the value '#DIV/0!'
# There are rows with these weird values in the new dataset
df = df[~df.isin(['#DIV/0!']).any(axis=1)]

In [297]:
df.dtypes

Lookup Value    object
Coben           object
2025            object
2026            object
2027            object
2028            object
2029            object
2030            object
2031            object
2032            object
2033            object
2034            object
2035            object
2036            object
2037            object
2038            object
2039            object
2040            object
2041            object
2042            object
2043            object
2044            object
2045            object
2046            object
2047            object
2048            object
2049            object
2050            object
scenario        object
dtype: object

In [298]:
# Convert columns with names from 2025 to 2050 to float
year_columns = [str(year) for year in range(2025, 2051)]
df[year_columns] = df[year_columns].astype(np.float32)

In [299]:
df.dtypes

Lookup Value     object
Coben            object
2025            float32
2026            float32
2027            float32
2028            float32
2029            float32
2030            float32
2031            float32
2032            float32
2033            float32
2034            float32
2035            float32
2036            float32
2037            float32
2038            float32
2039            float32
2040            float32
2041            float32
2042            float32
2043            float32
2044            float32
2045            float32
2046            float32
2047            float32
2048            float32
2049            float32
2050            float32
scenario         object
dtype: object

In [300]:
# Create total column
# df["total (£m)"] = df[ [ f'{i} (£m)' for i in range(2025, 2051)]].sum(axis=1)
df["total (£m)"] = df[ [ f'{i}' for i in range(2025, 2051)]].sum(axis=1)

In [301]:
# Delete all the columns per hh, only keep total value columns
# Not needed with new data
# df = df.drop(columns=[ f'{i} (£/hh)' for i in range(2025, 2051)])

In [302]:
# Rename columns so it does not contain special characters (not needed anymore I think)
df.columns = df.columns.str.replace(' (£m)', '')

In [303]:
# Rename columns: replace spaces with underscores
df.columns = df.columns.str.replace(' ', '_')

In [304]:
df.head()

Unnamed: 0,Lookup_Value,Coben,2025,2026,2027,2028,2029,2030,2031,2032,...,2043,2044,2045,2046,2047,2048,2049,2050,scenario,total
0,N20000001,Air quality,0.003495,0.004546,0.005847,0.007228,0.008179,0.008061,0.008427,0.008942,...,0.010453,0.010551,0.010555,0.010573,0.010527,0.010476,0.010389,0.010268,BNZ,0.237583
1,N20000001,Noise,0.001345,0.002123,0.003191,0.003107,0.003973,0.006073,0.006876,0.006949,...,0.0104,0.010248,0.010086,0.009848,0.009746,0.009516,0.009293,0.00916,BNZ,0.207463
2,N20000001,Congestion,0.002332,0.002218,0.00203,0.001956,0.001207,0.000835,0.00133,0.001141,...,0.006464,0.007158,0.009144,0.009932,0.010757,0.011624,0.012209,0.015007,BNZ,0.122029
3,N20000001,Road repairs,0.002734,0.002986,0.003332,0.0035,0.003507,0.003968,0.003941,0.00357,...,0.002681,0.00253,0.002736,0.002624,0.002646,0.002455,0.002444,0.002503,BNZ,0.079201
4,N20000001,Road safety,0.000656,0.000484,0.000399,0.000309,2.2e-05,-8.9e-05,-0.000157,-0.000281,...,0.002021,0.002387,0.003246,0.003709,0.004182,0.004577,0.005,0.00587,BNZ,0.036578


In [259]:
# df.astype({'total': 'float'})

In [305]:
df.total

0         0.237583
1         0.207463
2         0.122029
3         0.079201
4         0.036578
            ...   
557107    0.393005
557108    0.000002
557109    0.017044
557110    1.497330
557111    3.583661
Name: total, Length: 2773205, dtype: float32

In [306]:
np.min(df.total)

-8.866961479187012

In [307]:
np.max(df.total)

149.6426239013672

In [308]:
df.dtypes

Lookup_Value     object
Coben            object
2025            float32
2026            float32
2027            float32
2028            float32
2029            float32
2030            float32
2031            float32
2032            float32
2033            float32
2034            float32
2035            float32
2036            float32
2037            float32
2038            float32
2039            float32
2040            float32
2041            float32
2042            float32
2043            float32
2044            float32
2045            float32
2046            float32
2047            float32
2048            float32
2049            float32
2050            float32
scenario         object
total           float32
dtype: object

## Join SE factors and cobenefs

In [309]:
df = pd.merge(df, df_socio, left_on='Lookup_Value', right_on='LSOA_DZ_CD', how='inner')

In [310]:
df.head()

Unnamed: 0,Lookup_Value,Coben,2025,2026,2027,2028,2029,2030,2031,2032,...,Rurality,House_value,Fuel_Type,Fuel_consumption_total,Floor_area,Gas_flag,Number_cars,Urban_trips,Total_vkm,Urban_vkm
0,N20000001,Air quality,0.003495,0.004546,0.005847,0.007228,0.008179,0.008061,0.008427,0.008942,...,2,137688,3,37665,180,0,2,414,12923,2584
1,N20000001,Noise,0.001345,0.002123,0.003191,0.003107,0.003973,0.006073,0.006876,0.006949,...,2,137688,3,37665,180,0,2,414,12923,2584
2,N20000001,Congestion,0.002332,0.002218,0.00203,0.001956,0.001207,0.000835,0.00133,0.001141,...,2,137688,3,37665,180,0,2,414,12923,2584
3,N20000001,Road repairs,0.002734,0.002986,0.003332,0.0035,0.003507,0.003968,0.003941,0.00357,...,2,137688,3,37665,180,0,2,414,12923,2584
4,N20000001,Road safety,0.000656,0.000484,0.000399,0.000309,2.2e-05,-8.9e-05,-0.000157,-0.000281,...,2,137688,3,37665,180,0,2,414,12923,2584


In [311]:
# Rename cobenef to always keep the same name (changed depending the version)
df.rename(columns={'Coben': 'co_benefit_type'}, inplace=True)

## Aggregating the time

In [312]:
# Number of years merging
time_step = 5

In [313]:
years = list(range( 2025, 2051 ))
#years

In [314]:
df[["2025", "2026"]]

Unnamed: 0,2025,2026
0,3.495246e-03,4.545631e-03
1,1.345039e-03,2.122763e-03
2,2.332261e-03,2.217988e-03
3,2.733760e-03,2.985896e-03
4,6.563730e-04,4.837550e-04
...,...,...
2780605,8.426204e-05,6.440031e-03
2780606,-2.008345e-08,-5.038381e-08
2780607,-6.312062e-05,3.131020e-05
2780608,4.313012e-02,9.336156e-02


In [335]:
#  AGGREGATE TIME: DISABLED FOR NOW
if False:
    for i in range(0, len(years) - ( time_step - 1), time_step):
        window_years = [str(year) for year in years[i:i+5]]
        print(window_years)
        window_sum = df[window_years].sum(axis=1)
        df[f'Y{window_years[0]}_{window_years[-1]}'] = window_sum

In [271]:
df[ ['2025', '2026', '2027', '2028', '2029', 'Y2025_2029'] ]

KeyError: "['Y2025_2029'] not in index"

In [316]:
# Delete single values columns for space
# df = df.drop(columns=[str(year) for year in years])

In [336]:
# CONVERT INT64 into int32
df[df.select_dtypes(np.int64).columns] = df.select_dtypes(np.int64).astype(np.int32)

In [337]:
df

Unnamed: 0,Lookup_Value,co_benefit_type,2025,2026,2027,2028,2029,2030,2031,2032,...,Rurality,House_value,Fuel_Type,Fuel_consumption_total,Floor_area,Gas_flag,Number_cars,Urban_trips,Total_vkm,Urban_vkm
0,N20000001,Air quality,3.495246e-03,4.545631e-03,5.846987e-03,7.227724e-03,8.178627e-03,8.060945e-03,8.426776e-03,8.942410e-03,...,2,137688,3,37665,180,0,2,414,12923,2584
1,N20000001,Noise,1.345039e-03,2.122763e-03,3.191027e-03,3.107497e-03,3.972673e-03,6.072646e-03,6.876487e-03,6.949075e-03,...,2,137688,3,37665,180,0,2,414,12923,2584
2,N20000001,Congestion,2.332261e-03,2.217988e-03,2.029615e-03,1.955506e-03,1.207442e-03,8.346050e-04,1.329813e-03,1.140636e-03,...,2,137688,3,37665,180,0,2,414,12923,2584
3,N20000001,Road repairs,2.733760e-03,2.985896e-03,3.331585e-03,3.500383e-03,3.506608e-03,3.967531e-03,3.940610e-03,3.570292e-03,...,2,137688,3,37665,180,0,2,414,12923,2584
4,N20000001,Road safety,6.563730e-04,4.837550e-04,3.990420e-04,3.094560e-04,2.172590e-05,-8.859600e-05,-1.572280e-04,-2.813460e-04,...,2,137688,3,37665,180,0,2,414,12923,2584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2780605,S01013481,Excess cold,8.426204e-05,6.440031e-03,2.224516e-03,2.444278e-03,3.323161e-03,9.805587e-03,5.789449e-03,7.165653e-03,...,1,252995,1,24480,96,1,2,298,18643,12357
2780606,S01013481,Excess heat,-2.008345e-08,-5.038381e-08,-4.684858e-08,-8.240556e-09,-7.777948e-08,8.717500e-09,-1.295683e-07,3.247725e-08,...,1,252995,1,24480,96,1,2,298,18643,12357
2780607,S01013481,Dampness,-6.312062e-05,3.131020e-05,1.240558e-04,3.297547e-04,2.025886e-04,3.275296e-04,5.738928e-04,8.006484e-04,...,1,252995,1,24480,96,1,2,298,18643,12357
2780608,S01013481,Diet change,4.313012e-02,9.336156e-02,1.289293e-01,1.649396e-01,6.781635e-02,4.496336e-02,7.265048e-02,5.293809e-02,...,1,252995,1,24480,96,1,2,298,18643,12357


In [338]:
set(df.EPC)

{1, 2, 3, 4, 5, 6, 7, <NA>}

In [340]:
df.dtypes

Lookup_Value                      object
co_benefit_type                   object
2025                             float32
2026                             float32
2027                             float32
2028                             float32
2029                             float32
2030                             float32
2031                             float32
2032                             float32
2033                             float32
2034                             float32
2035                             float32
2036                             float32
2037                             float32
2038                             float32
2039                             float32
2040                             float32
2041                             float32
2042                             float32
2043                             float32
2044                             float32
2045                             float32
2046                             float32
2047            

In [346]:
df[(df["co_benefit_type"] == "Noise") & (df["Lookup_Value"] == "N20000001")]

Unnamed: 0,Lookup_Value,co_benefit_type,2025,2026,2027,2028,2029,2030,2031,2032,...,Rurality,House_value,Fuel_Type,Fuel_consumption_total,Floor_area,Gas_flag,Number_cars,Urban_trips,Total_vkm,Urban_vkm
1,N20000001,Noise,0.001345,0.002123,0.003191,0.003107,0.003973,0.006073,0.006876,0.006949,...,2,137688,3,37665,180,0,2,414,12923,2584
556123,N20000001,Noise,0.006942,0.006957,0.009987,0.00821,0.014228,0.029493,0.005735,0.000748,...,2,137688,3,37665,180,0,2,414,12923,2584
1112245,N20000001,Noise,0.002032,-0.002073,0.008016,0.010109,0.016213,0.01006,0.010815,0.028415,...,2,137688,3,37665,180,0,2,414,12923,2584
1668367,N20000001,Noise,0.003154,0.013643,0.010363,0.014001,0.009243,0.02242,0.008503,0.021903,...,2,137688,3,37665,180,0,2,414,12923,2584
2224489,N20000001,Noise,-0.003386,0.004788,-0.000688,-0.00965,0.009628,0.025152,0.016175,0.026367,...,2,137688,3,37665,180,0,2,414,12923,2584


## Export table as parquet file

In [353]:
df.to_parquet('static/database.parquet')

In [322]:
# Not needed anymore, everything is in the same table
# df_socio.to_parquet('static/tableSocio.parquet')

## Export subset for faster development 

In [348]:
df

Unnamed: 0,Lookup_Value,co_benefit_type,2025,2026,2027,2028,2029,2030,2031,2032,...,Rurality,House_value,Fuel_Type,Fuel_consumption_total,Floor_area,Gas_flag,Number_cars,Urban_trips,Total_vkm,Urban_vkm
0,N20000001,Air quality,3.495246e-03,4.545631e-03,5.846987e-03,7.227724e-03,8.178627e-03,8.060945e-03,8.426776e-03,8.942410e-03,...,2,137688,3,37665,180,0,2,414,12923,2584
1,N20000001,Noise,1.345039e-03,2.122763e-03,3.191027e-03,3.107497e-03,3.972673e-03,6.072646e-03,6.876487e-03,6.949075e-03,...,2,137688,3,37665,180,0,2,414,12923,2584
2,N20000001,Congestion,2.332261e-03,2.217988e-03,2.029615e-03,1.955506e-03,1.207442e-03,8.346050e-04,1.329813e-03,1.140636e-03,...,2,137688,3,37665,180,0,2,414,12923,2584
3,N20000001,Road repairs,2.733760e-03,2.985896e-03,3.331585e-03,3.500383e-03,3.506608e-03,3.967531e-03,3.940610e-03,3.570292e-03,...,2,137688,3,37665,180,0,2,414,12923,2584
4,N20000001,Road safety,6.563730e-04,4.837550e-04,3.990420e-04,3.094560e-04,2.172590e-05,-8.859600e-05,-1.572280e-04,-2.813460e-04,...,2,137688,3,37665,180,0,2,414,12923,2584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2780605,S01013481,Excess cold,8.426204e-05,6.440031e-03,2.224516e-03,2.444278e-03,3.323161e-03,9.805587e-03,5.789449e-03,7.165653e-03,...,1,252995,1,24480,96,1,2,298,18643,12357
2780606,S01013481,Excess heat,-2.008345e-08,-5.038381e-08,-4.684858e-08,-8.240556e-09,-7.777948e-08,8.717500e-09,-1.295683e-07,3.247725e-08,...,1,252995,1,24480,96,1,2,298,18643,12357
2780607,S01013481,Dampness,-6.312062e-05,3.131020e-05,1.240558e-04,3.297547e-04,2.025886e-04,3.275296e-04,5.738928e-04,8.006484e-04,...,1,252995,1,24480,96,1,2,298,18643,12357
2780608,S01013481,Diet change,4.313012e-02,9.336156e-02,1.289293e-01,1.649396e-01,6.781635e-02,4.496336e-02,7.265048e-02,5.293809e-02,...,1,252995,1,24480,96,1,2,298,18643,12357


In [349]:
df.columns

Index(['Lookup_Value', 'co_benefit_type', '2025', '2026', '2027', '2028',
       '2029', '2030', '2031', '2032', '2033', '2034', '2035', '2036', '2037',
       '2038', '2039', '2040', '2041', '2042', '2043', '2044', '2045', '2046',
       '2047', '2048', '2049', '2050', 'scenario', 'total', 'Nation', 'LAD',
       'MSOA', 'LSOA_DZ_CD', 'LSOA_DZ_NM', 'Under_35', 'Over_65', 'EPC',
       'Median_Income', 'Tenure', 'Typology', 'Unemployment', 'Rurality',
       'House_value', 'Fuel_Type', 'Fuel_consumption_total', 'Floor_area',
       'Gas_flag', 'Number_cars', 'Urban_trips', 'Total_vkm', 'Urban_vkm'],
      dtype='object')

In [355]:
# Assuming df is your DataFrame and 'column_name' is the column you're interested in
# df_sampled = df.groupby('LAD').apply(lambda x: x.sample(n=10)).reset_index(drop=True)
df_sampled = df[df.Nation == "NI"]

In [None]:
df_sampled

In [356]:
df_sampled[(df_sampled["Lookup_Value"] == "N20000001") & (df_sampled["co_benefit_type"] == "Dampness")][["2028", "scenario"]]

Unnamed: 0,2028,scenario
9,7.3e-05,BNZ
556131,0.000243,Engagement
1112253,0.000121,Tailwinds
1668375,0.000215,Headwinds
2224497,0.000352,Innovation


In [357]:
df_sampled.to_parquet('static/database_onlyIreland.parquet')

## Create Duckdb instance

The parquet file is currently used in the frontend as the .duckdb file was causing error. This part is therefore not useful for now

In [42]:
DB_FILE_PATH = 'static/database.duckdb'
TABLE_NAME = "cobenefits"

In [43]:
con = duckdb.connect(DB_FILE_PATH)

In [46]:
# Create table and insert data
con.execute(f"DROP TABLE {TABLE_NAME}")

# Create table and insert data
con.execute(f"CREATE TABLE {TABLE_NAME} AS SELECT * FROM df")

# Verify data
result = con.execute(f"SELECT * FROM {TABLE_NAME} LIMIT 5").fetchall()
print("Sample data:")
print(result)

# Get and print schema
schema = con.execute(f"DESCRIBE {TABLE_NAME}").fetchall()
print("\nTable schema:")
for column in schema:
    print(f"{column[0]}: {column[1]}")

print(f"\nDatabase created and saved to: {DB_FILE_PATH}")

Sample data:
[('N20002754', 'Air quality', 138, 0.001385182, 0.001690344, 0.002178994, 0.00265043, 0.002930518, 0.002957105, 0.003004358, 0.003154786, 0.003284162, 0.003392733, 0.003309062, 0.003258978, 0.003316478, 0.003358668, 0.00338563, 0.003360311, 0.003329033, 0.00331858, 0.003300776, 0.003273695, 0.003256345, 0.003240189, 0.00319424, 0.003143862, 0.003091112, 0.00303487, 'BNZ', 0.078800441), ('N20002754', 'Noise', 138, 0.000150076, 0.000145001, 0.000140098, 0.00013536, 0.000774951, 0.001122295, 0.001103271, 0.001084611, 0.001066307, 0.001048349, 0.001038151, 0.001020742, 0.001003659, 0.000986897, 0.000970447, 0.001206611, 0.001182306, 0.001158578, 0.001135413, 0.001112795, 0.001176065, 0.001151699, 0.00112793, 0.00110474, 0.001082113, 0.00106603, 'BNZ', 0.024294494999999996), ('N20002754', 'Congestion', 138, 0.001159192, 0.001048197, 0.000942343, 0.000892502, 0.000496322, 0.000302559, 0.000561585, 0.000513852, 0.000433189, 0.000542365, 0.001034313, 0.001105454, 0.001294352, 0.00