In [8]:
import sqlite3
import pandas as pd
import numpy as np
import os
import polars as pl

import random
random.seed(42)

import sys

sys.path.append('../')
from src.feat_network import get_edge_node_table, filter_edge_table
from src.feat_visualization import sygma_graph
from src.datamodel import OptimumParameter
from tqdm import tqdm
from sklearn.metrics import adjusted_rand_score


In [9]:
from dotenv import load_dotenv

# Load environment variables from a .env file into the process environment.
# NOTE(review): presumably this is what supplies FULL_DB_PATH, which is read
# with os.getenv() when the databases are opened — confirm.
load_dotenv()

True

In [10]:
# Open the two SQLite databases used by this notebook.
# FULL_DB_PATH must be set in the environment: fail early with an explicit
# message instead of the opaque TypeError raised by sqlite3.connect(None).
full_db_path = os.getenv("FULL_DB_PATH")
if not full_db_path:
    raise RuntimeError(
        "Environment variable FULL_DB_PATH is not set; "
        "it must point to the full SQLite database."
    )
conn_full_db = sqlite3.connect(full_db_path)
conn = sqlite3.connect("../database.db")

# Individuals with their regions. The id column is renamed to 'wikidata_id'
# so that the table can be merged with the occupation table on that key.
df_ind_regions = pd.read_sql_query(
    "SELECT * FROM individuals_regions", conn_full_db
).rename(columns={"individual_wikidata_id": "wikidata_id"})

In [11]:
# Occupations per individual, as stored in the local database.
df_occupation = pd.read_sql("SELECT * FROM individual_id_cleaned_occupations", conn)

# Distinct (individual, region) pairs, restricted to individuals that have an
# occupation (inner join on 'wikidata_id').
df_indi = (
    df_ind_regions
    .merge(df_occupation, on='wikidata_id')
    .loc[:, ['wikidata_id', 'region_code']]
    .drop_duplicates()
)

# Number of individuals per region.
df_region_count = (
    df_indi
    .groupby('region_code')['wikidata_id']
    .count()
    .rename('count_individuals')
    .reset_index()
)

In [12]:
# Baseline partition; its 'community' column is renamed so that it can sit
# next to the per-region community columns after merging.
df_baseline = (
    pd.read_sql("SELECT * FROM optimal_partition", conn)
    .rename(columns={'community': 'community_baseline'})
)

# Partitions computed separately for each region.
df_region_parition = pd.read_sql("SELECT * FROM region_optimized_partition", conn)




In [70]:
# Reshape the regional partitions to one row per node and one column per region.
# NOTE: pivot_table aggregates with the mean by default, so a (node, region_code)
# pair present more than once would end up with an averaged community label.
pivot_df = df_region_parition.pivot_table(index='node', columns='region_code', values='community', fill_value=np.nan)

# Only these columns are compared with the baseline. Iterating over every column
# of the merged frame would also score 'community_baseline' against itself and
# add a meaningless row to the result.
region_columns = list(pivot_df.columns)

pivot_df = pivot_df.reset_index()
df = pd.merge(df_baseline, pivot_df, on = 'node')
df = df.set_index('node')

baseline_community = list(df['community_baseline'])

# Label given to nodes that are missing from a regional partition: they are
# treated as one extra community. A negative value is used so that it cannot
# collide with a real community label (assumes labels are non-negative —
# TODO confirm). The adjusted Rand index does not depend on label values.
MISSING_COMMUNITY = -1

fina_list = []
for col in region_columns:
    new_df = df[['community_baseline', col]].fillna(MISSING_COMMUNITY)
    ari = adjusted_rand_score(list(new_df[col]), list(new_df['community_baseline']))
    fina_list.append({'region': col, 'ari_baseline': ari})

# Regions ranked by agreement with the baseline partition.
df_region_ari = pd.DataFrame(fina_list, columns=['region', 'ari_baseline'])
df_region_ari = df_region_ari.sort_values('ari_baseline', ascending=False)
df_region_ari = df_region_ari.reset_index(drop=True)

In [71]:
# Silence pandas' chained-assignment warning for the rest of the notebook.
pd.set_option("mode.chained_assignment", None)

In [72]:

# Lookup table: region_code -> number of individuals in that region.
regions = list(df_region_count.region_code)
count_individuals = list(df_region_count.count_individuals)

dict_regions_count = dict(zip(regions, count_individuals))

In [73]:
all_regions = list(df_region_count.region_code)
# Sorted on purpose: the iteration order of a set of strings changes between
# interpreter sessions (hash randomisation), so list(set(...)) would make
# random.sample() draw different individuals for the same seed after a kernel
# restart. Assumes the ids are mutually comparable (e.g. all strings).
all_individuals = sorted(set(df_occupation.wikidata_id))

In [74]:

# Optimisation runs ordered from the highest to the lowest "mean" value.
optimal_parameters = (
    pd.read_sql("SELECT * FROM optimization", conn)
    .sort_values("mean", ascending=False)
)

# The first row after sorting provides the parameters used below.
best_row = optimal_parameters.iloc[0]
dict_op = OptimumParameter(**best_row.to_dict())

In [75]:
# Make sure the cache folder exists. exist_ok=True makes the call a no-op when
# the folder is already there, without a separate check-then-create step.
directory = "../cache"
os.makedirs(directory, exist_ok=True)

In [79]:
# For every region, cluster several random samples of individuals that have the
# same size as the region, and keep the resulting partitions side by side
# (one 'community_<seed>' column per sample).
N_RANDOM_SAMPLES = 5  # number of random samples drawn per region

final_clustering = []
all_regions = list(df_region_count.region_code)
for region_code in tqdm(all_regions):

    # take a sample of individuals as big as the region
    sample_len = dict_regions_count.get(region_code)

    random_clustering = []
    # range() yields plain Python ints: random.seed() raises TypeError for the
    # numpy integers produced by np.arange on Python >= 3.11.
    for seed in range(N_RANDOM_SAMPLES):
        random.seed(seed)
        try:
            sample_ids = random.sample(all_individuals, sample_len)

            # Build a new frame instead of mutating a slice of df_occupation
            # (which is what triggers pandas' chained-assignment warning).
            df_sample = (
                df_occupation[df_occupation['wikidata_id'].isin(sample_ids)]
                .set_axis(["source", "target"], axis=1)
                .assign(weight=1)
            )

            # Draw the graph
            df_sample = pl.from_pandas(df_sample)

            df_edge, df_nodes = get_edge_node_table(df_sample)

            df_edge_filter = filter_edge_table(
                df_edge,
                edge_rule=dict_op.edge_rule,
                top_directed_neighbours=dict_op.n_neighbours,
                normalize_on_top=False,
                min_count_link=0,
            )

            df_partition = sygma_graph(
                df_edge_filter,
                df_nodes,
                edge_bins=10,
                node_bins=10,
                resolution=dict_op.resolution,
                filepath="../cache/cache_graph.html",
            )

            df_partition = df_partition.rename(columns = {'community':f'community_{seed}'})
            random_clustering.append(df_partition)

        except Exception as exc:
            # Best effort: a failing sample must not abort the whole run, but it
            # is reported instead of being silently dropped. Unlike a bare
            # `except:`, this lets KeyboardInterrupt through.
            tqdm.write(f"[{region_code}] sample with seed {seed} skipped: {exc!r}")

    if not random_clustering:
        # Every sample failed for this region: there is nothing to merge.
        tqdm.write(f"[{region_code}] no clustering produced, region skipped")
        continue

    # Outer-join the partitions of the different samples on the node name, so a
    # node missing from one sample gets NaN in that sample's column.
    merged_df = random_clustering[0]
    for table in random_clustering[1:]:
        merged_df = pd.merge(merged_df, table, on=["node"], how = 'outer')
    merged_df = merged_df.set_index("node")
    merged_df['region_code'] = region_code
    final_clustering.append(merged_df)


100%|██████████████████████████████████████████████████████████████████████████| 53/53 [01:44<00:00,  1.97s/it]


In [80]:
# Stack the per-region tables of sample partitions into a single frame.
df_final_clustering = pd.concat(final_clustering)

In [85]:
# Inspect the sample partitions obtained for one region.
df_final_clustering.loc[df_final_clustering['region_code'] == 're_france']

Unnamed: 0_level_0,community_0,community_1,community_2,community_3,community_4,region_code
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
zoologist,0.0,0.0,0.0,1.0,0.0,re_france
pharmacologist,0.0,0.0,0.0,1.0,0.0,re_france
meteorologist,0.0,1.0,2.0,1.0,0.0,re_france
taxonomist,0.0,,,1.0,0.0,re_france
epidemiologist,0.0,,2.0,0.0,,re_france
physicist,0.0,1.0,2.0,0.0,0.0,re_france
chemist,0.0,0.0,0.0,1.0,0.0,re_france
ecologist,0.0,0.0,,1.0,0.0,re_france
botanist,0.0,0.0,0.0,1.0,0.0,re_france
geologist,0.0,0.0,0.0,1.0,0.0,re_france


In [82]:
# Merge the clusterings obtained from the different random samples

In [266]:
# NOTE(review): `df_res` is not defined in any of the cells shown here; it is
# expected to hold one 'community' value per (node, region_code) — confirm where
# it is built before relying on a fresh "Restart & Run All".
# One row per node, one column per region_code.
pivot_sample = (
    df_res
    .pivot_table(index='node', columns='region_code', values='community', fill_value=np.nan)
    .reset_index()
)

# Put the baseline community next to the sample communities of each node.
df_sample_ari = df_baseline.merge(pivot_sample, on='node')

In [267]:
baseline_community = list(df_sample_ari['community_baseline'])

# Only the region columns coming from the pivot are compared with the baseline.
# Looping over every column of df_sample_ari would also score the 'node' and
# 'community_baseline' columns, adding meaningless rows to the result.
region_columns = [col for col in pivot_sample.columns if col != 'node']

# Label given to nodes that are missing from a region's sample partition: they
# are treated as one extra community. A negative value is used so that it cannot
# collide with a real community label (assumes labels are non-negative —
# TODO confirm). The adjusted Rand index does not depend on label values.
MISSING_COMMUNITY = -1

final_list = []
for col in region_columns:
    new_df = df_sample_ari[['community_baseline', col]].fillna(MISSING_COMMUNITY)
    ari = adjusted_rand_score(list(new_df[col]), list(new_df['community_baseline']))
    final_list.append({'region': col, 'ari_baseline_random': ari})

# Regions ranked by agreement between their random-sample partition and the baseline.
df_region_ari_sample = pd.DataFrame(final_list, columns=['region', 'ari_baseline_random'])
df_region_ari_sample = df_region_ari_sample.sort_values('ari_baseline_random', ascending=False)
df_region_ari_sample = df_region_ari_sample.reset_index(drop=True)

In [277]:
# Per region: ARI of the regional partition, ARI of the random-sample partition,
# and the gap between their absolute values, largest gap first.
final = (
    df_region_ari
    .merge(df_region_ari_sample, on='region')
    .assign(
        diff=lambda merged: merged['ari_baseline'].abs() - merged['ari_baseline_random'].abs()
    )
    .sort_values('diff', ascending=False)
)

In [280]:
import pandas as pd
import scipy.stats as stats

# Hard-coded ARI values for five regions: the ARI of the regional partition and
# the ARI of ten random replicates (one inner list per replicate, values in the
# same order as the regions).
random_runs = [
    [0.283342, 0.061242, 0.334777, 0.551266, 0.584625],
    [0.285211, 0.064823, 0.339426, 0.552378, 0.589475],
    [0.286567, 0.067751, 0.341977, 0.553731, 0.594327],
    [0.281912, 0.060123, 0.332045, 0.548859, 0.581312],
    [0.275824, 0.058891, 0.324100, 0.542982, 0.577685],
    [0.276913, 0.059792, 0.326660, 0.546105, 0.581919],
    [0.280104, 0.062382, 0.330170, 0.550226, 0.587156],
    [0.273156, 0.057910, 0.321339, 0.537025, 0.573972],
    [0.269899, 0.054901, 0.317730, 0.530938, 0.569556],
    [0.272436, 0.057354, 0.320753, 0.534206, 0.573189],
]

data = {
    'region': ['re_netherlands', 're_greece', 're_balkans', 're_low_countries', 're_italy'],
    'ari_baseline': [0.597622, 0.318832, 0.583738, 0.783382, 0.783382],
}
# Replicate columns are numbered from 1: ari_baseline_random1 ... ari_baseline_random10.
data.update(
    {f'ari_baseline_random{run_no}': scores for run_no, scores in enumerate(random_runs, start=1)}
)

# One row per region, one column per score.
df = pd.DataFrame(data)

In [None]:
# One-sample t-test per region: do the ARI scores of the random replicates
# differ from the ARI observed for the region itself?
# A paired test (stats.ttest_rel) does not fit this design: each region has a
# single observed value against several replicates, so the two samples do not
# have the same length.
random_cols = [col for col in df.columns if col.startswith('ari_baseline_random')]

ttest_records = []
for _, row in df.iterrows():
    # A row mixes strings and floats, hence the explicit conversion.
    random_scores = row[random_cols].to_numpy(dtype=float)
    t_statistic, p_value = stats.ttest_1samp(random_scores, row['ari_baseline'])
    ttest_records.append(
        {'region': row['region'], 't_statistic': t_statistic, 'p_value': p_value}
    )

# Keep the result of every region instead of overwriting it at each iteration.
df_region_ttest = pd.DataFrame(ttest_records, columns=['region', 't_statistic', 'p_value'])
df_region_ttest

In [291]:
import numpy as np
import scipy.stats as stats

# Differences between ari_baseline and the random ARI, one value per region
# (hard-coded; they equal ari_baseline - ari_baseline_random1 of the table above).
differences = np.array([0.314280, 0.257590, 0.248961, 0.232116, 0.198757])

# One-sample t-test of the differences against a mean of zero.
ttest_result = stats.ttest_1samp(differences, 0)
t_statistic, p_value = ttest_result

# Report the p-value.
print("p-value:", p_value)


p-value: 0.00018729795354910595


In [18]:
# Region | ARI of the region vs baseline | ARI of a same-size random sample vs baseline


In [11]:
from sklearn.metrics import adjusted_rand_score
