In [1]:
import os
import sqlite3
import sys

sys.path.append("../networks")
sys.path.append("../")

import pandas as pd
import polars as pl
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize

from functions.datamodel import OptimumParameter
from functions.env import DB_SCIENCE_PATH, GRAPH_RESULTS, DB_SCIENCE_PATH_NEW
from functions.feat_network import get_edge_node_table
from functions.feat_visualization import sygma_graph_leiden

# Turn off pandas' chained-assignment warning for the rest of the notebook.
pd.options.mode.chained_assignment = None

# SQLite connection to the database file at DB_SCIENCE_PATH_NEW.
conn = sqlite3.connect(DB_SCIENCE_PATH_NEW)

from optimal_clustering import optimal_clustering

# Unpack the imported `optimal_clustering` mapping as keyword arguments so that
# `dict_op` ends up holding an OptimumParameter instance.
dict_op = OptimumParameter(**optimal_clustering)

from sklearn.metrics.pairwise import euclidean_distances


def cosine_similarity(vector1, vector2):
    """Return the cosine of the angle between two vectors.

    Computed as ``dot(vector1, vector2) / (||vector1|| * ||vector2||)``.

    Parameters
    ----------
    vector1, vector2 : array-like
        Inputs accepted by ``np.dot`` and ``np.linalg.norm``.

    Returns
    -------
    float
        The cosine similarity, or ``0.0`` when either input has zero norm
        (the ratio is undefined there; without the guard the division
        yields ``nan`` plus a RuntimeWarning).
    """
    # Evaluate the dot product first so incompatible shapes still raise,
    # even when one of the inputs is all zeros.
    dot_product = np.dot(vector1, vector2)
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if norm_product == 0:
        return 0.0
    return dot_product / norm_product


In [5]:
## Build one person–occupation network per corpus
#
# For every name in `paths`: read ../networks/data/<name>.csv, reshape it into
# (wikidata_id, occupation) pairs, pass those to get_edge_node_table, keep the
# edges whose rank_count is at most MAX_EDGE_RANK, save them as CSV and call
# sygma_graph_leiden on the filtered edges.

paths = [
    'global_before_1600',
    'global_1600_1700',
    'chinese_world',
    'arabic_world',
    'indian_world',
    'japan_world',
    'latin_world',
    'greek_world',
]

# Output folders, relative to the notebook's working directory.
EDGE_DIR = 'edges_list_filtered'
GRAPH_DIR = 'final_graph'

# Edges with a rank_count above this value are dropped.
# NOTE(review): rank_count is produced by get_edge_node_table — confirm its
# exact meaning there.
MAX_EDGE_RANK = 3

# DataFrame.to_csv does not create missing parent folders, so create them up
# front; otherwise a fresh checkout fails on the first iteration.
# NOTE(review): sygma_graph_leiden presumably writes to `filepath` — confirm.
for out_dir in (EDGE_DIR, GRAPH_DIR):
    os.makedirs(out_dir, exist_ok=True)

for path in paths:
    print(path)

    df = pd.read_csv(f"../networks/data/{path}.csv", index_col=[0])

    # meta_occupation is a " | "-separated string: split it and explode so
    # each (wikidata_id, occupation) pair gets its own row.
    df["meta_occupation"] = df["meta_occupation"].apply(lambda x: x.split(" | "))
    df = (
        df.explode("meta_occupation")[["wikidata_id", "meta_occupation"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    df.columns = ["source", "target"]
    df["weight"] = 1

    ## get co-occurrence table
    df = pl.from_pandas(df)
    df_edge, df_nodes = get_edge_node_table(df)

    df_edge_filter = df_edge[df_edge["rank_count"] <= MAX_EDGE_RANK]

    df_edge_filter.to_csv(f'{EDGE_DIR}/{path}.csv')

    # visualization
    df_partition, g = sygma_graph_leiden(
        df_edge_filter,
        df_nodes,
        edge_bins=10,
        node_bins=10,
        filepath=f'{GRAPH_DIR}/{path}.html',
    )

global_before_1600
global_1600_1700
chinese_world
arabic_world
indian_world
japan_world
latin_world
greek_world
