In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import warnings

# Ignore FutureWarning
warnings.simplefilter(action='ignore', category=FutureWarning)


pd.options.mode.chained_assignment = None

import sqlite3
from dotenv import load_dotenv

load_dotenv()
import os

DB_PATH = os.getenv("DB_PATH")
DATA_PATH = "data"

conn = sqlite3.connect(DB_PATH)

In [26]:
df_unseen_model = pd.read_csv('../unseen_species_model/results/estimations.csv')
df_unseen_model = df_unseen_model[df_unseen_model['decade']<1870]


df_score_region = pd.read_csv('../immaterial_index/results/df_region_score.csv', index_col=[0])
df_score_region = df_score_region.rename(columns = {'region_name':'region'})

new_df = pd.merge(df_unseen_model, df_score_region, on = ['decade', 'region'])
new_df = new_df.rename(columns = {'region':'region_name', 'decade':'year'})


### Load GDP and Population Data

In [27]:
#df_region_code = pd.read_sql_query("SELECT * FROM region_code", conn)
df_region_code = pd.read_csv("../environnement_data/region_code.csv", index_col = [0])

#df_gdp = pd.read_sql_query("SELECT * FROM gdp", conn)
df_gdp = pd.read_csv("../environnement_data/gdp.csv", index_col=[0])

df_gdp = df_gdp.drop(['country_code_maddison'], axis=1)
df_gdp = df_gdp[df_gdp['year']<=1880]
df_gdp = pd.merge(df_gdp, df_region_code, on = 'region_code')
df_gdp.sample(5)

#df_gdp_clean =  pd.read_sql_query("SELECT * FROM gdp_clean", conn)
df_gdp_clean = pd.read_csv("../environnement_data/gdp_clean.csv", index_col=[0])
df_gdp_clean = pd.merge(df_gdp_clean, df_region_code, on = 'region_code')
df_gdp_clean.to_csv('../environnement_data/gdp_clean_region_name.csv')


In [28]:
#df_population = pd.read_sql_query("SELECT * FROM population", conn)
df_population = pd.read_csv("../environnement_data/population.csv", index_col=[0])

df_population = pd.merge(df_population, df_region_code, on = 'region_code')
df_population = df_population.drop('region_code', axis=1)

In [29]:
def interpolate_function(df, value="value", category="region_code", year="year", size_interpolation = 10):
    df_interpolated_list = []
    for reg in set(df[category]):
        res = df[df[category] == reg]

        min_date = min(res["year"])
        max_date = max(res["year"])

        year_range = np.arange(round(min_date), round(max_date), size_interpolation)
        df_year = pd.DataFrame(year_range, columns=[year]).reset_index(drop=True)

        res = pd.merge(res, df_year, on=year, how="outer")
        res[category] = reg
        res = res.sort_values(year, ascending=True)
        res[f"{value}_interpolated"] = res[value].interpolate(method="linear")
        res = res.reset_index(drop=True)
        res = res[~res[f"{value}_interpolated"].isna()]

        df_interpolated_list.append(res)

    df_interpolated = pd.concat([x for x in df_interpolated_list])
    df_interpolated = df_interpolated.reset_index(drop=True)
    df_interpolated = df_interpolated.drop(value, axis=1)

    return df_interpolated

df_population_interpolation = interpolate_function(df_population, 
                                                   value = 'population', 
                                                   year = 'year', 
                                                   category="region_name",
                                                   size_interpolation = 10)

In [30]:
# Linear interpolation on the proxies in order to merge with the GDP data excatly (that are at the year level)

final = pd.merge(df_population_interpolation, new_df, on = ['region_name', 'year'])
final['score_cap'] = final['score']/final['population_interpolated']
final['lower_cap'] = final['lower']/final['population_interpolated']
final['N_est_cap'] = final['N_est']/final['population_interpolated']
final['upper_cap'] = final['upper']/final['population_interpolated']

data_stats = pd.merge(final, df_gdp, on = ['region_name', 'year'])
data_stats.to_csv(f'results/data_stats.csv')

df_stats_clean = pd.merge(final, df_gdp_clean, on = ['region_name', 'year'])
df_stats_clean.to_csv(f'results/data_stats_clean_gdp.csv')

Unnamed: 0,year,region_name,population_interpolated,N_est,lower,upper,score,century,score_cap,lower_cap,N_est_cap,upper_cap
0,-260.0,Italy,4.796265,9.437108,5.098282,14.245468,1,-2,0.208496,1.062969,1.967595,2.970117
1,-220.0,Italy,5.071080,26.870585,15.002979,40.112783,3,-2,0.591590,2.958537,5.298789,7.910106
2,-210.0,Italy,5.143362,8.856873,5.012380,13.154811,2,-2,0.388851,0.974534,1.722001,2.557629
3,-180.0,Italy,5.367963,17.193206,10.197968,24.992405,3,-1,0.558871,1.899784,3.202929,4.655845
4,-160.0,Italy,5.523511,118.317400,72.671700,168.999975,9,-1,1.629398,13.156794,21.420686,30.596475
...,...,...,...,...,...,...,...,...,...,...,...,...
1854,1820.0,Portugal,2.526287,458.720845,351.104504,577.792302,30,19,11.875136,138.980463,181.579088,228.712081
1855,1830.0,Portugal,2.561148,209.662435,160.900212,262.862827,17,19,6.637647,62.823461,81.862663,102.634748
1856,1840.0,Portugal,2.596040,759.086829,584.045295,947.724407,54,19,20.800915,224.975488,292.401858,365.065454
1857,1850.0,Portugal,2.630958,508.880531,390.083931,635.304758,37,19,14.063316,148.266856,193.420212,241.472749


In [31]:
years = np.arange(-1000, 1881, 1)

print(len(final))

concat_list = []

for region in final.region_name.unique():
    df_years = pd.DataFrame(years, columns = ['year'])
    df_years['region_name'] = region
    
    final_filtered = final[final['region_name']==region]
    mix = pd.merge(df_years, final_filtered, on = ['year', 'region_name'], how = 'outer')
    concat_list.append(mix)
    
new_final = pd.concat(concat_list)
new_final = new_final.sort_values(['region_name', 'year'])

new_final = new_final.set_index('year')
grouped = new_final.groupby('region_name')

interpolated_data = []
for name, group in grouped:
    interpolated_group = group.interpolate(method='linear')
    interpolated_data.append(interpolated_group)

# Concatenate the interpolated data for all regions
interpolated_df = pd.concat(interpolated_data)

# Reset the index to restore the 'year' column
interpolated_df = interpolated_df.reset_index()
interpolated_df = interpolated_df.drop_duplicates().dropna().reset_index(drop=True)

1859


In [32]:
data_stats_interpolated = pd.merge(interpolated_df, df_gdp, on = ['region_name', 'year'])
data_stats_interpolated = data_stats_interpolated.reset_index(drop=True)
data_stats_interpolated.to_csv(f'results/data_stats_interpolated.csv')

data_stats_interpolated_clean_gdp = pd.merge(interpolated_df, df_gdp_clean, on = ['region_name', 'year'])
data_stats_interpolated_clean_gdp = data_stats_interpolated_clean_gdp.reset_index(drop=True)
data_stats_interpolated_clean_gdp.to_csv(f'results/data_stats_interpolated_clean_gdp.csv')