In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import warnings

# Ignore FutureWarning: install a filter that silences every FutureWarning
# raised after this point, including deprecation notices from pandas.
warnings.simplefilter(action='ignore', category=FutureWarning)


# Disable pandas' chained-assignment check so that assigning into a frame that
# was obtained by slicing another frame no longer emits SettingWithCopyWarning.
pd.options.mode.chained_assignment = None

import sqlite3
from dotenv import load_dotenv

load_dotenv()
import os

# Path of the SQLite database, read from the environment (load_dotenv() has
# already been called above, so a local .env file can provide it).
DB_PATH = os.getenv("DB_PATH")
DATA_PATH = "data"

# Open the connection only when DB_PATH is configured. sqlite3.connect(None)
# raises TypeError, which would stop the notebook at its first cell even though
# the tables used below are read from CSV files and `conn` only appears in
# commented-out SQL queries.
conn = sqlite3.connect(DB_PATH) if DB_PATH is not None else None

In [2]:
# Scores by region and decade; the first CSV column is used as the index.
df_score_region = pd.read_csv('../immaterial_index/results/df_region_score.csv', index_col = [0])
# Seeded preview so the displayed rows are the same after Restart & Run All.
df_score_region.sample(5, random_state=0)


Unnamed: 0,region_name,decade,score
223,Arabic world,1380,7
347,Balkans,-580,1
3902,South East Asia,1180,1
5466,mediterranean World,1440,5
4911,United Kingdom,1300,3


In [3]:
# Unseen-species estimates, restricted to decades up to and including 1880.
df_unseen_model = (
    pd.read_csv('../unseen_species_model/results/estimations.csv')
    .loc[lambda estimates: estimates['decade'] <= 1880]
)

# Region scores, with the region column renamed so that it matches the
# join key used by the estimates table.
df_score_region = (
    pd.read_csv('../immaterial_index/results/df_region_score.csv', index_col=[0])
    .rename(columns={'region_name': 'region'})
)

# Inner join on (decade, region), then switch to the column names used by the
# rest of the notebook.
new_df = (
    df_unseen_model
    .merge(df_score_region, on=['decade', 'region'])
    .rename(columns={'region': 'region_name', 'decade': 'year'})
)

# Disabled experiments, kept for reference:
#new_df['year'] = new_df['year'].astype(int)
#new_df['year'] = new_df['year'].apply(lambda x: round(x/10)*10)
#new_df = new_df.groupby(['region_name', 'year']).mean().reset_index()
#new_df = new_df[new_df['year']<=1840]
#new_df = new_df[~new_df['N_est'].isna()]
# When there is no individuals for the 'score' index, replace by 0
#new_df = new_df.fillna(0)


### Load GDP and Population Data

In [4]:
# Region lookup table, joined below on 'region_code'. The commented
# read_sql_query lines are the SQLite equivalents of each CSV load.
#df_region_code = pd.read_sql_query("SELECT * FROM region_code", conn)
df_region_code = pd.read_csv("../environnement_data/region_code.csv", index_col = [0])

#df_gdp = pd.read_sql_query("SELECT * FROM gdp", conn)
df_gdp = pd.read_csv("../environnement_data/gdp.csv", index_col=[0])

# Drop the Maddison country code, keep years up to and including 1880, and
# attach the columns of the region lookup table.
df_gdp = df_gdp.drop(['country_code_maddison'], axis=1)
df_gdp = df_gdp[df_gdp['year']<=1880]
df_gdp = pd.merge(df_gdp, df_region_code, on = 'region_code')
# (A bare `df_gdp.sample(5)` used to sit here; its result was discarded because
# it was not the last expression of the cell, so it has been removed.)

# The cleaned GDP table is joined with the lookup table as-is: no column is
# dropped and no year filter is applied here.
#df_gdp_clean =  pd.read_sql_query("SELECT * FROM gdp_clean", conn)
df_gdp_clean = pd.read_csv("../environnement_data/gdp_clean.csv", index_col=[0])
df_gdp_clean = pd.merge(df_gdp_clean, df_region_code, on = 'region_code')


In [5]:
# Population table; the commented line is the SQLite equivalent of the CSV load.
#df_population = pd.read_sql_query("SELECT * FROM population", conn)
df_population = pd.read_csv("../environnement_data/population.csv", index_col=[0])

# Attach the region lookup columns, then drop the join key.
df_population = pd.merge(df_population, df_region_code, on = 'region_code')
df_population = df_population.drop('region_code', axis=1)
# Seeded preview so the displayed rows are the same after Restart & Run All.
df_population.sample(5, random_state=0)


Unnamed: 0,year,population,region_name
1441,-190,1.42,Central Europe
4685,-110,0.08625,Ireland
4462,370,46.1,Indian world
8063,1140,12.03,Southwestern Europe
4771,750,0.255,Ireland


In [6]:
def interpolate_function(df, value="value", category="region_code", year="year", size_interpolation = 10):
    """Linearly interpolate `value` over a regular year grid, one category at a time.

    For every distinct non-null entry of `df[category]`, a grid running from the
    category's smallest year (inclusive) to its largest year (exclusive) in
    steps of `size_interpolation` is outer-merged with the category's rows, the
    rows are sorted by year, and the gaps in `value` are filled by linear
    interpolation. Rows that are still missing afterwards are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named by `value`, `category` and `year`.
    value : str
        Column to interpolate.
    category : str
        Column identifying the independent series.
    year : str
        Column holding the time axis.
    size_interpolation : int
        Step of the generated year grid.

    Returns
    -------
    pandas.DataFrame
        The columns of `df` without `value`, plus `"<value>_interpolated"`.
        Categories appear in order of first appearance in `df`, each sorted by
        year. Columns other than these three are NaN on generated grid rows.
        An empty frame with those columns is returned when there is nothing
        to interpolate.

    Notes
    -----
    `method="linear"` treats consecutive rows as equally spaced, so
    observations whose year does not fall on the generated grid are not
    weighted by their real distance in years.
    """
    interpolated_column = f"{value}_interpolated"
    frames = []

    # unique() keeps first-appearance order, unlike set(), whose order for
    # strings can differ between runs. NaN categories are skipped because they
    # never compare equal to themselves and would yield an empty slice.
    for reg in df[category].dropna().unique():
        res = df[df[category] == reg]

        # Use the caller-supplied year column (it was hard-coded to "year").
        min_date = res[year].min()
        max_date = res[year].max()

        year_range = np.arange(round(min_date), round(max_date), size_interpolation)
        df_year = pd.DataFrame(year_range, columns=[year])

        # The outer merge keeps the original observations and adds grid years.
        res = pd.merge(res, df_year, on=year, how="outer")
        res[category] = reg
        res = res.sort_values(year, ascending=True)
        res[interpolated_column] = res[value].interpolate(method="linear")
        res = res.reset_index(drop=True)
        res = res[~res[interpolated_column].isna()]

        frames.append(res)

    if not frames:
        # pd.concat([]) raises ValueError; return the expected schema instead.
        kept_columns = [column for column in df.columns if column != value]
        return pd.DataFrame(columns=kept_columns + [interpolated_column])

    df_interpolated = pd.concat(frames).reset_index(drop=True)
    df_interpolated = df_interpolated.drop(value, axis=1)

    return df_interpolated

# Interpolate each region's population series in 10-year steps.
df_population_interpolation = interpolate_function(
    df_population,
    value="population",
    category="region_name",
    year="year",
    size_interpolation=10,
)

In [7]:
# Latest year present in the filtered GDP table.
max(df_gdp['year'])

1880

In [8]:
# Join the interpolated population onto the score/estimate table and express
# each measure per unit of population. The results are then merged with both
# GDP tables on (region_name, year) and written to CSV.
final = pd.merge(df_population_interpolation, new_df, on = ['region_name', 'year'])

# Same column order as before: score_cap, lower_cap, N_est_cap, upper_cap.
# A population of 0 would give inf (or NaN for 0/0) here.
for measure in ['score', 'lower', 'N_est', 'upper']:
    final[f'{measure}_cap'] = final[measure]/final['population_interpolated']

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('results', exist_ok=True)

data_stats = pd.merge(final, df_gdp, on = ['region_name', 'year'])
data_stats.to_csv('results/data_stats.csv')

df_stats_clean = pd.merge(final, df_gdp_clean, on = ['region_name', 'year'])
df_stats_clean.to_csv('results/data_stats_clean_gdp.csv')

In [9]:
# Seeded preview so the displayed rows are the same after Restart & Run All.
df_stats_clean.sample(5, random_state=0)

Unnamed: 0,year,region_name,population_interpolated,N_est,lower,upper,score,score_cap,lower_cap,N_est_cap,upper_cap,region_code,country_code_maddison,gdp_per_capita
18,1850,Eastern Europe,40.5,34513.510491,32257.234815,36855.239122,1640,40.493827,796.474934,852.185444,910.005904,re_eastern_europe,POL,985
3,1840,Nordic countries,4.762,6776.092828,6244.213315,7319.573693,600,125.99748,1311.258571,1422.951035,1537.079734,re_nordic_countries,SWE,1568
17,1840,Eastern Europe,38.25,33499.613297,31177.546347,35901.428537,1588,41.51634,815.099251,875.806884,938.599439,re_eastern_europe,POL,907
6,1870,Nordic countries,6.071,9468.356219,8598.983097,10394.339009,879,144.786691,1416.40308,1559.604055,1712.129634,re_nordic_countries,SWE,2144
5,1860,Nordic countries,5.568,9880.151116,9066.952173,10711.894204,896,160.91954,1628.403767,1774.452428,1923.831574,re_nordic_countries,SWE,1941


In [10]:
# Yearly grid from -1000 to 1880 inclusive, onto which every region of `final`
# is expanded before interpolating between its observed rows.
years = np.arange(-1000, 1881, 1)

print(len(final))

concat_list = []

for region in final.region_name.unique():
    df_years = pd.DataFrame({'year': years, 'region_name': region})

    final_filtered = final[final['region_name']==region]
    # The outer merge keeps the observed rows and adds the missing grid years.
    mix = pd.merge(df_years, final_filtered, on = ['year', 'region_name'], how = 'outer')
    concat_list.append(mix)

new_final = pd.concat(concat_list)
new_final = new_final.sort_values(['region_name', 'year'])

new_final = new_final.set_index('year')
grouped = new_final.groupby('region_name')

interpolated_data = []
for name, group in grouped:
    # Interpolate numeric columns only: calling interpolate on a frame that
    # holds an object column (region_name) is deprecated in pandas.
    numeric_columns = group.select_dtypes(include='number').columns
    interpolated_group = group.copy()
    # limit_area='inside' fills gaps between observations only. The default
    # would also repeat the last observed value for every later year, and
    # those rows would survive the dropna() below.
    interpolated_group[numeric_columns] = group[numeric_columns].interpolate(
        method='linear', limit_area='inside'
    )
    interpolated_data.append(interpolated_group)

# Concatenate the interpolated data for all regions
interpolated_df = pd.concat(interpolated_data)

# Reset the index to restore the 'year' column, then drop duplicated rows and
# rows that still contain a missing value.
interpolated_df = interpolated_df.reset_index()
interpolated_df = interpolated_df.drop_duplicates().dropna().reset_index(drop=True)

1979


In [11]:
# Yearly-interpolated table joined with the GDP table, written to CSV.
data_stats_interpolated = (
    interpolated_df
    .merge(df_gdp, on=['region_name', 'year'])
    .reset_index(drop=True)
)
data_stats_interpolated.to_csv('results/data_stats_interpolated.csv')

# Same join and export against the cleaned GDP table.
data_stats_interpolated_clean_gdp = (
    interpolated_df
    .merge(df_gdp_clean, on=['region_name', 'year'])
    .reset_index(drop=True)
)
data_stats_interpolated_clean_gdp.to_csv('results/data_stats_interpolated_clean_gdp.csv')

In [12]:
# Latest year present in the interpolated table after the GDP join.
max(data_stats_interpolated['year'])

1880