# In [None]:
# Persist the per-individual scores before deriving the occupation tables.
df_final.to_csv('results/df_individuals_score.csv')


# Load every individual/occupation row and keep only those whose category
# mentions 'science'.  na=False makes a missing category count as "no match";
# without it a NaN in the mask makes the boolean indexing below raise.
df_occupations = pd.read_sql_query("SELECT * FROM individual_occupations", conn)
df_occupations = df_occupations[
    df_occupations['occupations_category'].str.contains('science', na=False)
]


# Attach the science occupations to each scored individual.  pd.merge defaults
# to an inner join, so individuals with no matching occupation row are dropped.
df_complexity = pd.merge(df_final, df_occupations, on='individual_wikidata_id')

# Distinct occupations per (region, decade), exposed under the column 'score'.
df_complexity_group = df_complexity[['occupations_wikidata_id', 'region_name', 'decade']].copy()
df_complexity_group = (
    df_complexity_group
    .groupby(['region_name', 'decade'])['occupations_wikidata_id']
    .apply(lambda x: len(set(x)))
    .reset_index()
)
df_complexity_group = df_complexity_group.rename(columns={'occupations_wikidata_id': 'score'})

# Explicit copy: a column is assigned on the next statement, and writing into
# a slice of df_complexity would trigger pandas' SettingWithCopyWarning.
df_test = df_complexity[['individual_wikidata_id', 'occupations_wikidata_id', 'decade', 'region_name']].copy()
# Coarsen 'decade' to the nearest multiple of 100.
# NOTE(review): round() rounds halves to even, so 150 and 250 both become 200
# while 50 becomes 0 -- confirm this binning is intended.
df_test['decade'] = df_test['decade'].apply(lambda x: round(x / 100) * 100)



# Distinct occupations per coarsened period and region, then the subset that
# is plotted: the 'Chinese world' region from 900 onwards.
df_complexity_total = (
    df_test
    .groupby(['decade', 'region_name'])['occupations_wikidata_id']
    .apply(lambda x: len(set(x)))
    .reset_index()
)
test = df_complexity_total[df_complexity_total['region_name'] == 'Chinese world']
test = test[test['decade'] >= 900]

import matplotlib.pyplot as plt

# Line chart of the distinct-occupation count per period for the rows in `test`.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(test['decade'], test['occupations_wikidata_id'], label='Unique Occupations', color='blue')
ax.set_xlabel('')
ax.set_ylabel('Unique Occupations')
ax.set_title('Chinese world')
ax.legend()
ax.grid(True)
plt.show()

import pandas as pd
import numpy as np
from sklearn.utils import resample
from scipy.stats import sem, t
import random


# Count how many different occupations appear in a sample
def calculate_unique_occupations(sample_data):
    """Return the number of distinct 'occupations_wikidata_id' values in *sample_data*."""
    distinct_ids = sample_data['occupations_wikidata_id'].unique()
    return len(distinct_ids)

# Resampling settings: number of random draws per group, and rows per draw.
num_samples = 50
n_ind = 5

# For every (region, decade) group, repeatedly draw n_ind rows and count the
# distinct occupations in each draw; keep the median count and a 95% Student-t
# interval centred on that median.
# Rows of df_test are sampled, not individuals.  Drawing n_ind distinct
# individuals and keeping all of their rows would be the alternative design.
grouped = df_test.groupby(['region_name', 'decade'])
results = []
for (region, decade), group in grouped:
    if len(group) < n_ind:
        # Too few rows to subsample: every draw would be the whole group, so
        # count once instead of repeating the same computation num_samples times.
        sample_results = [calculate_unique_occupations(group)] * num_samples
    else:
        sample_results = [
            calculate_unique_occupations(group.sample(n_ind))
            for _ in range(num_samples)
        ]

    median_unique_occupations = np.median(sample_results)
    spread = sem(sample_results)
    if spread > 0:
        ci = t.interval(0.95, len(sample_results) - 1,
                        loc=median_unique_occupations, scale=spread)
    else:
        # No variation between draws: t.interval would return NaN bounds for a
        # zero scale, so collapse the interval onto the median explicitly.
        ci = (median_unique_occupations, median_unique_occupations)

    results.append({'region_name': region, 'decade': decade, 'median_unique_occupations': median_unique_occupations,
                    'lower_bound': ci[0], 'upper_bound': ci[1]})
    


# Create a DataFrame for the results
result_df = pd.DataFrame(results)

# A missing bound falls back to the median itself.  Assign the filled column
# back instead of calling fillna(inplace=True) on a column selection: that
# chained in-place form is deprecated in pandas and does not update the frame
# once Copy-on-Write is enabled.
for bound in ('lower_bound', 'upper_bound'):
    result_df[bound] = result_df[bound].fillna(result_df['median_unique_occupations'])

# Normalise dtypes before export: float statistics, integer period label.
for column in ('median_unique_occupations', 'lower_bound', 'upper_bound'):
    result_df[column] = result_df[column].astype(float)

result_df['decade'] = result_df['decade'].astype(int)
result_df.to_csv('results/df_region_score_complexity.csv')

# Subset to plot: the 'Chinese world' region from 900 onwards.
test = result_df[result_df['region_name'] == 'Chinese world']
test = test[test['decade'] >= 900]

import matplotlib.pyplot as plt

# Median curve with the interval drawn as a shaded band around it.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(test['decade'], test['median_unique_occupations'], label='Median Unique Occupations', color='blue')
ax.fill_between(test['decade'], test['lower_bound'], test['upper_bound'], color='skyblue', alpha=0.4, label='Confidence Interval (95%)')
ax.set_xlabel('')
ax.set_ylabel('Unique Occupations')
ax.set_title('Chinese world')
ax.legend()
ax.grid(True)
plt.show()


