In [50]:
import sys

sys.path.append("../")
sys.path.append("../networks")

import sqlite3

import pandas as pd
import polars as pl

from functions.datamodel import OptimumParameter
from functions.env import DB_SCIENCE_PATH, FULL_DB_PATH, GRAPH_RESULTS
from functions.feat_network import filter_edge_table, get_edge_node_table
from functions.feat_visualization import sygma_graph, sygma_graph_2

# Open the two SQLite databases that the rest of the notebook reads from.
conn_full_db = sqlite3.connect(FULL_DB_PATH)
conn = sqlite3.connect(DB_SCIENCE_PATH)

# Load the stored optimisation runs and rank them so that the row with the
# highest "mean" comes first.
# Alternative source table: optimization_100
optimal_parameters = pd.read_sql(
    "SELECT * FROM optimization_europe", conn
).sort_values("mean", ascending=False)

# Turn the top-ranked row into an OptimumParameter instance.
# NOTE(review): dict_op is reassigned from `optimal_clustering` a few lines
# further down in this cell, so this value is only transient.
dict_op = OptimumParameter(**optimal_parameters.iloc[0].to_dict())


from optimal_clustering import optimal_clustering

# Build the parameter object directly from the hand-picked settings in
# optimal_clustering (this replaces whatever dict_op held before).
dict_op = OptimumParameter(**optimal_clustering)


from region_filters import columns_eu

In [51]:
from region_filters import columns_non_eu

# Individuals and their region codes; the id column is renamed to
# "wikidata_id", then only rows whose region_code is in columns_non_eu are kept.
df_ind_regions = (
    pd.read_sql_query("SELECT * FROM individuals_regions", conn_full_db)
    .rename(columns={"individual_wikidata_id": "wikidata_id"})
    .loc[lambda frame: frame["region_code"].isin(columns_non_eu)]
)

# Individual -> occupation pairs, relabelled as an edge list with unit weight.
df_occupation = pd.read_sql("SELECT * FROM individual_id_cleaned_occupations", conn)
df_occupation.columns = ["source", "target"]
df_occupation = df_occupation.assign(weight=1)

# Keep only the edges whose source individual survived the region filter.
wiki_ids = list(set(df_ind_regions["wikidata_id"]))
df = df_occupation.loc[df_occupation["source"].isin(wiki_ids)].drop_duplicates()
print(len(df))

1099


In [52]:
# Reload the raw individual -> occupation table (original column names).
df_occupation = pd.read_sql("SELECT * FROM individual_id_cleaned_occupations", conn)

# Temporal data restricted to the region codes in columns_eu, reduced to the
# id and birth year, then limited to birth years up to and including 1700.
df_temporal = pd.read_sql("SELECT * FROM temporal_data", conn)
df_temporal = df_temporal.loc[
    df_temporal["region_code"].isin(columns_eu), ["wikidata_id", "birthyear"]
]
df_temporal = df_temporal.loc[df_temporal["birthyear"] <= 1700]
print(len(set(df_temporal["wikidata_id"])))

# Occupations of the selected individuals, de-duplicated after dropping the
# birth year that was only needed for the filter.
df = (
    pd.merge(df_occupation, df_temporal, on="wikidata_id")
    .drop(columns="birthyear")
    .drop_duplicates()
)

# Relabel as an edge list with unit weight.
df.columns = ["source", "target"]
df["weight"] = 1


3881


In [53]:
# Convert the pandas edge list to polars before handing it to
# get_edge_node_table. Note that `df` is rebound to the polars frame here.
df = pl.from_pandas(df)

# sample is set to the full row count of df.
df_edge, df_nodes = get_edge_node_table(df, sample=len(df))

In [54]:
# Override the neighbour count on the loaded parameter set; this value is
# passed to filter_edge_table as top_directed_neighbours further down.
dict_op.n_neighbours = 3

In [63]:
#df_edge[df_edge['source']=='epidemiologist']

In [None]:
# Edge-weight threshold expressed as a percentage of the heaviest edge.
# The threshold is computed on df_edge (the edge table, whose weights vary)
# rather than on df: by this point df has been rebound to a polars frame in
# which every row carries weight 1, so a percentage-of-max threshold on it
# would keep everything, and pandas-style boolean-mask indexing does not apply.
threshold_percentage = 10
max_weight = df_edge["weight"].max()
threshold_value = (threshold_percentage / 100) * max_weight

# Keep only the edges whose weight exceeds the threshold.
# NOTE(review): filtered_edges is not consumed below; filter_edge_table still
# receives the full df_edge. Confirm whether it should be passed instead.
filtered_edges = df_edge[df_edge["weight"] > threshold_value]

# Prune the edge table using the neighbour count and minimum link count held
# in dict_op.
df_edge_filter = filter_edge_table(
    df_edge,
    edge_rule="count",
    top_directed_neighbours=dict_op.n_neighbours,
    normalize_on_top=False,
    min_count_link=dict_op.min_count_link,
)

In [None]:
# Largest edge weight in the edge table (the original line was missing its
# closing parenthesis and could not be parsed).
df_edge["weight"].max()

In [58]:
# Show source/target/weight for every edge whose weight is greater than 1.
df_edge.loc[df_edge["weight"] > 1, ["source", "target", "weight"]]

Unnamed: 0,source,target,weight
0,zoologist,botanist,53
1,zoologist,historian,23
2,zoologist,mathematician,22
3,zoologist,philosopher,20
4,zoologist,physicist,18
...,...,...,...
369,anatomist,archeologist,6
370,anatomist,geologist,6
371,anatomist,geographer,4
372,anatomist,musicologist,2


In [None]:
# For example, you might decide to keep only the edges whose weight is above 10% of the maximum weight.

In [7]:
import community.community_louvain as community
import networkx as nx
import numpy as np
import pandas as pd
from ipysigma import Sigma
from sklearn.preprocessing import MinMaxScaler


In [8]:
# Number of quantile bins used to discretise edge weights and node sizes.
edge_bins = 10
node_bins = 10

# Resolution for the (currently commented-out) Louvain partition call.
resolution = 1

# Destination of the rendered HTML graph.
# NOTE(review): the file is named "non_europe" while one of the data-loading
# cells filters on columns_eu — confirm the name matches the data actually used.
filepath = GRAPH_RESULTS + "/non_europe.html"

In [9]:
# Work on a copy so the filtered edge table itself is left untouched.
df_edges = df_edge_filter.copy()

# Replace the raw weights by their quantile bin (1..edge_bins). Ranking with
# method="first" gives tied weights distinct ranks before binning.
df_edges["weight"] = pd.qcut(
    df_edges["weight"].rank(method="first"),
    q=edge_bins,
    labels=np.arange(1, edge_bins + 1),
).astype(int)
df_edges = df_edges.loc[:, ["source", "target", "weight"]].reset_index(drop=True)

# Build the networkx graph, carrying the binned weight as an edge attribute.
g = nx.from_pandas_edgelist(
    df_edges,
    source="source",
    target="target",
    edge_attr="weight",
)

In [14]:
from igraph import Graph

In [15]:
import leidenalg as la

In [48]:
# Convert the networkx graph to igraph. from_networkx is a classmethod that
# builds a new graph, so no throwaway Graph instance is needed.
g_ig = Graph.from_networkx(g)

# Leiden community detection on modularity. The seed makes the partition
# reproducible across kernel restarts (42 matches the random_state of the
# commented-out Louvain call elsewhere in this notebook).
# NOTE(review): no `weights=` argument is passed, so edge weights are not
# used by the optimiser — confirm this is intended.
part = la.find_partition(
    g_ig,
    la.ModularityVertexPartition,
    n_iterations=100,
    seed=42,
)

In [32]:
# Lookup table from igraph vertex index to the node label stored in the
# "_nx_name" vertex attribute.
node_names = [vertex["_nx_name"] for vertex in g_ig.vs]
indexes = [vertex.index for vertex in g_ig.vs]

df_elements = pd.DataFrame({"element_id": indexes, "node": node_names})

In [33]:
# Collect the vertex indices of each community in the partition as a set.
# A comprehension is used so that no loop variable named `community` is left
# behind: that name is the alias of the imported Louvain module and must not
# be rebound.
communities = [set(member_indices) for member_indices in part]

In [38]:
# Materialise the partition as a list of communities (each a list of vertex
# indices).
clusters_list = list(part)

# One row per vertex with its community id, then swap the vertex index for
# the node label via df_elements.
df_partition = (
    pd.DataFrame(
        [
            (element_id, cluster_id)
            for cluster_id, elements in enumerate(clusters_list)
            for element_id in elements
        ],
        columns=["element_id", "community"],
    )
    .merge(df_elements, on="element_id")
    .drop(columns="element_id")
    .reset_index(drop=True)
)

In [42]:
# Parallel lists of node labels and their community ids, in df_partition row
# order. Note that `communities` is rebound here.
nodes = list(df_partition["node"])
communities = list(df_partition["community"])

In [43]:
#optimiser = la.Optimiser()
#diff = optimiser.optimise_partition(part)

In [46]:
#get the clusters and add it to the graph object
#partition = community.best_partition(g, resolution=resolution, random_state=42)

# Randomy change the colors
# from functions.feat_utils import shuffle_numbers_dict

# shuffled_dict = shuffle_numbers_dict(list(set(partition.values())))
# partition = {key: shuffled_dict[value] for key, value in partition.items()}


# Tag every graph node with its community id (taken from the `nodes` /
# `communities` lists derived from df_partition).
for node, community_id in zip(nodes, communities):
    g.nodes[node]["community"] = community_id

# Node size: quantile bin (1..node_bins) of sum_weight. Ranking with
# method="first" gives tied values distinct ranks before binning.
df_nodes["size"] = pd.qcut(
    df_nodes["sum_weight"].rank(method="first"),
    node_bins,
    np.arange(1, node_bins + 1),
).astype(int)

# Rescale the bins onto the range [1, node_bins].
scaler = MinMaxScaler(feature_range=(1, node_bins))
df_nodes["size"] = scaler.fit_transform(df_nodes[["size"]])

# Attach the size to the nodes that are actually present in the graph.
# `node_id in g` is a constant-time membership test; the previous
# `in list(g.nodes)` rebuilt the full node list for every row.
for node_id, node_size in zip(df_nodes["node"], df_nodes["size"]):
    if node_id in g:
        g.nodes[node_id]["node_size"] = int(node_size)
# g = nx.DiGraph(g)

# Node appearance: size from the "node_size" attribute, colour and label
# colour from the "community" attribute.
# NOTE(review): both raw_node_size and node_size are supplied — confirm which
# one ipysigma honours.
# Disabled options kept for reference: node_size=g.degree,
# default_node_halo_color / node_halo_color / raw_node_halo_color = 'white',
# node_label_size=g.degree, node_label_size="node_size".
node_options = dict(
    raw_node_size="node_size",
    node_size="node_size",
    node_size_range=(3, 20),
    node_color="community",
    node_color_palette="Dark2",
    node_border_color_from="node",
    node_label_color="community",
    node_label_color_palette="Dark2",
    default_node_label_size=25,
    node_label_size_range=(7, 20),
    show_all_labels=True,
    max_categorical_colors=len(set(df_partition["community"])),
)

# Edge appearance: thickness from the binned "weight" attribute, drawn curved.
# Disabled options kept for reference: edge_color_palette="Dark2",
# edge_color="community", edge_color_from="source".
edge_options = dict(
    edge_size="weight",
    edge_size_range=(1, 5),
    default_edge_type="curve",
)

# Render the graph to a standalone HTML file at `filepath`.
Sigma.write_html(
    g,
    filepath,
    fullscreen=True,
    start_layout=True,
    **node_options,
    **edge_options,
)