In [2]:
import pandas as pd
import numpy as np
import duckdb

In [3]:
# Folder prefix for the input CSVs. It is joined to each scenario path by plain
# string concatenation, so the trailing slash is required.
data_folder = "static/"

In [4]:
# Scenario CSV locations, relative to the data folder.
scenario_BNZ_path = "simulations_new/BNZ.csv"
# NOTE(review): this placeholder points at the same file as the BNZ scenario,
# so the "test" scenario would duplicate the BNZ rows -- confirm this is intended.
scenario_holder_path = "simulations_new/BNZ.csv"

# Parallel lists: scenario_names[k] is the label attached to the rows of all_scenarios[k].
scenario_names = ["BNZ", "test"]
all_scenarios = [scenario_BNZ_path, scenario_holder_path]

## Create pandas tables

In [5]:
# Read every scenario CSV, tag its rows with the scenario name, and stack the
# results into one long DataFrame.
if len(all_scenarios) != len(scenario_names):
    # The two lists are paired by position; a mismatch would mislabel or drop scenarios.
    raise ValueError("all_scenarios and scenario_names must have the same length")

dfs = []
for scenario, scenario_name in zip(all_scenarios, scenario_names):
    df_one_scenario = pd.read_csv(data_folder + scenario)
    df_one_scenario["scenario"] = scenario_name
    dfs.append(df_one_scenario)
# Each CSV is read with its own 0..n-1 row labels; ignore_index=True renumbers the
# combined rows so the result does not carry duplicate index labels.
df = pd.concat(dfs, axis=0, ignore_index=True)
del dfs  # the per-scenario frames are no longer needed once combined

In [6]:
# Add a per-row total: the sum of the yearly "(£m)" columns for 2025 through 2050.
df["total (£m)"] = df.loc[
    :, [f"{year} (£m)" for year in range(2025, 2051)]
].sum(axis="columns")

In [7]:
# Remove the yearly per-household "(£/hh)" columns (2025 through 2050);
# only the total-value columns are kept.
df = df.drop(
    labels=[f"{year} (£/hh)" for year in range(2025, 2051)],
    axis="columns",
)

In [8]:
# Downcast every float64 column to float32; all other columns keep their dtype.
df = df.astype(
    {column: np.float32 for column in df.select_dtypes(include=np.float64).columns}
)

In [9]:
# Rename columns so they no longer contain special characters: strip the " (£m)" suffix.
# regex=False forces a literal match. pandas releases before 2.0 defaulted to regex
# matching, where the parentheses form a group and the suffix would never be found.
df.columns = df.columns.str.replace(' (£m)', '', regex=False)

In [10]:
# Rename columns: replace spaces with underscores (currently disabled)
# df.columns = df.columns.str.replace(' ', '_')

In [11]:
# Preview the first rows of the combined table.
df.head()

Unnamed: 0,Lookup_Value,co_benefit_type,HHs,2025,2026,2027,2028,2029,2030,2031,...,2043,2044,2045,2046,2047,2048,2049,2050,scenario,total
0,N20002754,Air quality,138,0.001385,0.00169,0.002179,0.00265,0.002931,0.002957,0.003004,...,0.003301,0.003274,0.003256,0.00324,0.003194,0.003144,0.003091,0.003035,BNZ,0.0788
1,N20002754,Noise,138,0.00015,0.000145,0.00014,0.000135,0.000775,0.001122,0.001103,...,0.001135,0.001113,0.001176,0.001152,0.001128,0.001105,0.001082,0.001066,BNZ,0.024294
2,N20002754,Congestion,138,0.001159,0.001048,0.000942,0.000893,0.000496,0.000303,0.000562,...,0.004094,0.004525,0.005711,0.006177,0.006654,0.007142,0.007495,0.009113,BNZ,0.073969
3,N20002754,Road repairs,138,0.001225,0.001336,0.00149,0.001566,0.001567,0.001773,0.001761,...,0.001218,0.001152,0.001249,0.001201,0.001212,0.001128,0.001125,0.001155,BNZ,0.035724
4,N20002754,Road safety,138,0.000333,0.000236,0.000191,0.000145,-6e-06,-6.5e-05,-8.2e-05,...,0.001388,0.001611,0.002129,0.002397,0.00267,0.002907,0.003154,0.003674,BNZ,0.024474


In [13]:
# Smallest value in the "total" column.
df["total"].min()

-8.116110801696777

In [14]:
# Largest value in the "total" column.
df["total"].max()

36.776371002197266

In [81]:
# Inspect the column dtypes of the table.
df.dtypes

Lookup_Value        object
co_benefit_type     object
HHs                  int64
2025               float32
2026               float32
2027               float32
2028               float32
2029               float32
2030               float32
2031               float32
2032               float32
2033               float32
2034               float32
2035               float32
2036               float32
2037               float32
2038               float32
2039               float32
2040               float32
2041               float32
2042               float32
2043               float32
2044               float32
2045               float32
2046               float32
2047               float32
2048               float32
2049               float32
2050               float32
scenario            object
total              float32
dtype: object

## Export table as parquet file

In [82]:
# Write the combined table to a Parquet file.
df.to_parquet(path='static/database.parquet')

  if _pandas_api.is_sparse(col):


## Create DuckDB instance

In [42]:
# Name of the table that will hold the data inside the DuckDB database.
TABLE_NAME = "cobenefits"
# Path of the DuckDB database file.
DB_FILE_PATH = 'static/database.duckdb'

In [43]:
# Open a connection to the DuckDB database at DB_FILE_PATH.
con = duckdb.connect(DB_FILE_PATH)

In [46]:
# (Re)create the table from the DataFrame in a single statement. CREATE OR REPLACE
# also succeeds when the table does not exist yet (e.g. a fresh database file),
# whereas an unconditional DROP TABLE would raise in that case.
# `df` in the query refers to the pandas DataFrame built earlier in this notebook.
con.execute(f"CREATE OR REPLACE TABLE {TABLE_NAME} AS SELECT * FROM df")

# Verify data: fetch a handful of rows back from the new table.
result = con.execute(f"SELECT * FROM {TABLE_NAME} LIMIT 5").fetchall()
print("Sample data:")
print(result)

# Get and print schema
schema = con.execute(f"DESCRIBE {TABLE_NAME}").fetchall()
print("\nTable schema:")
for column in schema:
    # Print the first two fields of each DESCRIBE row (presumably column name and type).
    print(f"{column[0]}: {column[1]}")

print(f"\nDatabase created and saved to: {DB_FILE_PATH}")

Sample data:
[('N20002754', 'Air quality', 138, 0.001385182, 0.001690344, 0.002178994, 0.00265043, 0.002930518, 0.002957105, 0.003004358, 0.003154786, 0.003284162, 0.003392733, 0.003309062, 0.003258978, 0.003316478, 0.003358668, 0.00338563, 0.003360311, 0.003329033, 0.00331858, 0.003300776, 0.003273695, 0.003256345, 0.003240189, 0.00319424, 0.003143862, 0.003091112, 0.00303487, 'BNZ', 0.078800441), ('N20002754', 'Noise', 138, 0.000150076, 0.000145001, 0.000140098, 0.00013536, 0.000774951, 0.001122295, 0.001103271, 0.001084611, 0.001066307, 0.001048349, 0.001038151, 0.001020742, 0.001003659, 0.000986897, 0.000970447, 0.001206611, 0.001182306, 0.001158578, 0.001135413, 0.001112795, 0.001176065, 0.001151699, 0.00112793, 0.00110474, 0.001082113, 0.00106603, 'BNZ', 0.024294494999999996), ('N20002754', 'Congestion', 138, 0.001159192, 0.001048197, 0.000942343, 0.000892502, 0.000496322, 0.000302559, 0.000561585, 0.000513852, 0.000433189, 0.000542365, 0.001034313, 0.001105454, 0.001294352, 0.00