In [1]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.cluster.hierarchy import fcluster
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import oat_python as oat
import os

  from pandas.core import (


In [7]:
# config
# Rows whose 'relevance_mean' falls below this value are dropped by dataprocess.
MIN_RELEVANCE = 0.7
# Earliest year kept by dataprocess; also the origin of the norm_year scale.
MIN_YEAR = 1920
MAX_YEAR = 2021  # when the data is from
# Inclusive bounds on a concept's frequency (rows for the concept divided by
# the number of distinct articles) applied by dataprocess.
MIN_CONCEPT_FREQ = 0.0001
MAX_CONCEPT_FREQ = 0.001
# Evaluation grids handed to mainfunc / runcrocker.
YEARS_GRID = np.linspace(0, 1, 100)
INV_COUNTS_GRID = np.linspace(0, 1, 40)
# Highest homology dimension requested in runcrocker (read there as a global).
max_dim = 1
# Lower-case names are still read as module globals by processbetticurve.
# They are derived from the constants above instead of being rebuilt
# independently, so the two spellings can never drift apart.
years_grid = YEARS_GRID
inv_counts_grid = INV_COUNTS_GRID

def dataprocess(df, min_relevance=None, min_year=None,
                min_concept_freq=None, max_concept_freq=None):
    """Filter the raw concept table down to the rows used to build the graph.

    A row is kept when its ``relevance_mean`` is at least ``min_relevance``,
    its ``year`` is at least ``min_year`` and the frequency of its concept
    lies in ``[min_concept_freq, max_concept_freq]``.  The frequency of a
    concept is the number of rows carrying it (after the relevance/year
    filter) divided by the number of distinct ``article_id`` values that
    survive the same filter.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``article_id``, ``concept``, ``year`` and
        ``relevance_mean``.  The frame is not modified.
    min_relevance, min_year, min_concept_freq, max_concept_freq : optional
        Thresholds; each one falls back to the module constant
        ``MIN_RELEVANCE`` / ``MIN_YEAR`` / ``MIN_CONCEPT_FREQ`` /
        ``MAX_CONCEPT_FREQ`` when left as ``None``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, restricted to ``article_id``, ``concept``, ``year``.
    """
    if min_relevance is None:
        min_relevance = MIN_RELEVANCE
    if min_year is None:
        min_year = MIN_YEAR
    if min_concept_freq is None:
        min_concept_freq = MIN_CONCEPT_FREQ
    if max_concept_freq is None:
        max_concept_freq = MAX_CONCEPT_FREQ

    kept = df[(df['relevance_mean'] >= min_relevance) & (df['year'] >= min_year)]

    num_articles = kept['article_id'].nunique()
    # Select one column before transforming so the result is unambiguously a
    # Series aligned with `kept`, rather than relying on how a whole-frame
    # transform('size') is broadcast.
    rows_per_concept = kept.groupby('concept')['article_id'].transform('size')
    concept_freq = rows_per_concept / num_articles

    in_band = (concept_freq >= min_concept_freq) & (concept_freq <= max_concept_freq)
    return kept.loc[in_band, ['article_id', 'concept', 'year']]

def conceptprocess(df, min_year=None, max_year=None):
    """Build the per-concept (node) table.

    For every concept the table holds the earliest ``year`` it occurs in and
    the number of distinct ``article_id`` values it occurs with, plus two
    derived columns:

    * ``norm_year`` = (year - min_year) / (max_year - min_year)
    * ``inv_count`` = 1 / count

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``article_id``, ``concept`` and ``year``.
    min_year, max_year : optional
        Normalisation bounds; default to the module constants ``MIN_YEAR``
        and ``MAX_YEAR`` when ``None``.

    Returns
    -------
    pandas.DataFrame
        Columns ``concept``, ``year``, ``count``, ``norm_year``, ``inv_count``,
        one row per concept, ordered by concept (groupby default).

    Raises
    ------
    ValueError
        If ``max_year`` is not greater than ``min_year`` (the normalisation
        would divide by zero or flip sign).
    """
    if min_year is None:
        min_year = MIN_YEAR
    if max_year is None:
        max_year = MAX_YEAR
    year_span = max_year - min_year
    if year_span <= 0:
        raise ValueError(
            f"max_year ({max_year}) must be greater than min_year ({min_year})"
        )

    # No pre-sort by year: 'min' and 'nunique' do not depend on row order and
    # groupby already orders the result by concept.
    concepts = (
        df.groupby('concept')
        .agg(
            year=('year', 'min'),
            count=('article_id', 'nunique'),
        )
        .reset_index()
    )

    concepts['norm_year'] = (concepts['year'] - min_year) / year_span
    concepts['inv_count'] = 1 / concepts['count']
    return concepts

def edgeprocess(df, min_year=None, max_year=None):
    """Build the concept co-occurrence (edge) table.

    Two concepts are linked when they appear on the same ``article_id`` with
    the same ``year``.  Each unordered pair is reported once, with
    ``concept_source`` < ``concept_target``.  Per pair the table holds the
    earliest ``year`` of co-occurrence, the number of distinct articles
    (``count``) and the derived columns

    * ``norm_year`` = (year - min_year) / (max_year - min_year)
    * ``inv_count`` = 1 / count

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``article_id``, ``concept`` and ``year``; any other
        columns are ignored.
    min_year, max_year : optional
        Normalisation bounds; default to the module constants ``MIN_YEAR``
        and ``MAX_YEAR`` when ``None``.

    Returns
    -------
    pandas.DataFrame
        Columns ``concept_source``, ``concept_target``, ``year``, ``count``,
        ``norm_year``, ``inv_count``.

    Raises
    ------
    ValueError
        If ``max_year`` is not greater than ``min_year``.
    """
    if min_year is None:
        min_year = MIN_YEAR
    if max_year is None:
        max_year = MAX_YEAR
    year_span = max_year - min_year
    if year_span <= 0:
        raise ValueError(
            f"max_year ({max_year}) must be greater than min_year ({min_year})"
        )

    # The self-join below is quadratic in the rows per article, so strip
    # everything it does not need first.  Repeated (article, concept, year)
    # rows cannot change 'min' or 'nunique', so dropping them is lossless.
    pairs = df[['article_id', 'concept', 'year']].drop_duplicates()

    edges = pairs.merge(pairs, on=['article_id', 'year'], suffixes=('_source', '_target'))
    # Keep one orientation of each pair and discard self-pairs.
    edges = edges[edges['concept_source'] < edges['concept_target']]
    edges = (
        edges.groupby(['concept_source', 'concept_target'])
        .agg(
            year=('year', 'min'),
            count=('article_id', 'nunique'),
        )
        .reset_index()
    )

    edges['norm_year'] = (edges['year'] - min_year) / year_span
    edges['inv_count'] = 1 / edges['count']
    return edges

def graphprocess(concepts, edges):
    """Assemble an undirected graph from the concept and edge tables.

    Every row of ``concepts`` becomes a node keyed by its ``concept`` value,
    and every row of ``edges`` becomes an edge between ``concept_source`` and
    ``concept_target``.  Nodes and edges both carry the attributes
    ``norm_year`` and ``inv_count`` copied from their row.

    Returns the populated ``networkx.Graph``.
    """
    graph = nx.Graph()

    # nodes, in table order
    node_rows = zip(concepts['concept'], concepts['norm_year'], concepts['inv_count'])
    for name, node_year, node_inv in node_rows:
        graph.add_node(name, norm_year=node_year, inv_count=node_inv)

    # edges, in table order
    edge_rows = zip(
        edges['concept_source'],
        edges['concept_target'],
        edges['norm_year'],
        edges['inv_count'],
    )
    for src, dst, edge_year, edge_inv in edge_rows:
        graph.add_edge(src, dst, norm_year=edge_year, inv_count=edge_inv)

    return graph

def processbetticurve(G):
    """Compute the Betti curves of ``G`` on the module-level grids.

    Earlier this function built the adjacency matrices and then returned an
    ``np.empty`` array that was never filled, i.e. arbitrary memory contents.
    It now delegates to ``runcrocker`` with the module globals ``years_grid``
    and ``inv_counts_grid`` so the returned array (same shape as before:
    ``(len(years_grid), len(inv_counts_grid), max_dim + 1)``) holds real
    values.

    Parameters
    ----------
    G : networkx.Graph
        Graph whose nodes and edges carry ``norm_year`` and ``inv_count``
        attributes.
    """
    return runcrocker(G, years_grid, inv_counts_grid)

def runcrocker(G, years_grid, inv_counts_grid):
    """Evaluate Betti numbers of ``G`` over a (year, inverse-count) grid.

    For each threshold ``c`` in ``inv_counts_grid`` the ``norm_year``
    adjacency matrix is copied, every stored entry whose ``inv_count`` exceeds
    ``c`` is removed, the node ``norm_year`` values are written back onto the
    diagonal, and the matrix is handed to
    ``oat.rust.FactoredBoundaryMatrixVr``.  From the resulting homology table,
    entry ``[t, i, d]`` of the output counts the rows of dimension ``d`` with
    ``birth <= years_grid[t] < death`` at threshold ``inv_counts_grid[i]``.

    Parameters
    ----------
    G : networkx.Graph
        Nodes and edges must carry ``norm_year`` and ``inv_count`` attributes.
    years_grid : array-like
        1-D grid of year values to evaluate at.
    inv_counts_grid : iterable
        Thresholds on ``inv_count``.

    Returns
    -------
    numpy.ndarray
        Shape ``(len(years_grid), len(inv_counts_grid), max_dim + 1)``, where
        ``max_dim`` is the module-level setting.  Slices for which the matrix
        is empty or OAT raises are all zero; an empty graph gives an all-zero
        array.
    """
    years = np.asarray(years_grid)
    # Start from zeros so the "nothing to compute" paths need no extra fill.
    betti_curves = np.zeros((len(years), len(inv_counts_grid), max_dim + 1))

    # nx.adjacency_matrix raises on a graph without nodes; there is nothing to
    # count in that case, so return the zero array directly.
    if G.number_of_nodes() == 0:
        return betti_curves

    # Node attributes do not depend on the threshold: collect them once.
    node_years = [attrs['norm_year'] for _, attrs in G.nodes(data=True)]
    node_inv_counts = [attrs['inv_count'] for _, attrs in G.nodes(data=True)]

    adj_year = nx.adjacency_matrix(G, weight='norm_year')
    adj_year.setdiag(node_years)
    adj_inv_count = nx.adjacency_matrix(G, weight='inv_count')
    adj_inv_count.setdiag(node_inv_counts)
    adj_year = adj_year.sorted_indices()

    for i, c in enumerate(inv_counts_grid):
        # zero out things not included
        c_adj = adj_year.copy()
        c_adj[adj_inv_count > c] = 0
        c_adj.eliminate_zeros()
        # NOTE(review): eliminate_zeros() removes every stored zero, not only
        # the entries masked above, so an edge whose norm_year is exactly 0
        # would be dropped here as well - TODO confirm whether that is intended.

        # The diagonal is rewritten for every node after the masking step, so
        # all nodes are present at every threshold.
        c_adj.setdiag(node_years)
        c_adj = c_adj.sorted_indices()

        if c_adj.nnz == 0 or c_adj.shape[0] == 0:
            continue  # slice stays zero

        try:
            factored = oat.rust.FactoredBoundaryMatrixVr(c_adj, max_dim)
            # Meaning of the two boolean flags is not visible here - see the OAT docs.
            homology = factored.homology(False, False)

            for d in range(max_dim + 1):
                dim_homology = homology[homology['dimension'] == d]
                births = dim_homology['birth'].values
                deaths = dim_homology['death'].values
                # Row t of the comparison marks the features alive at years[t].
                alive = (births <= years[:, None]) & (deaths > years[:, None])
                betti_curves[:, i, d] = alive.sum(axis=1)
        except Exception as e:
            # Best effort: report the failure and zero the whole slice,
            # including any dimension that was filled before the error.
            print(f"OAT error at inv_count={c:.3f}: {e}")
            betti_curves[:, i, :] = 0

    return betti_curves

def mainfunc(df, years_grid, inv_counts_grid):
    """Run the whole pipeline: filter rows, build the graph, compute Betti curves.

    ``df`` is passed through ``dataprocess``; the filtered rows feed both
    ``conceptprocess`` (nodes) and ``edgeprocess`` (edges); ``graphprocess``
    joins them into a graph, which ``runcrocker`` evaluates on the two grids.
    Returns whatever ``runcrocker`` returns.
    """
    filtered = dataprocess(df)
    graph = graphprocess(conceptprocess(filtered), edgeprocess(filtered))
    return runcrocker(graph, years_grid, inv_counts_grid)

In [9]:
# Gzip-compressed concept table (concepts_Zoology_608.csv.gz) hosted on Dropbox.
concepts_url = 'https://www.dropbox.com/scl/fi/a1t16rtialcw03n50ffkc/concepts_Zoology_608.csv.gz?rlkey=vjv60sfbhofbgvzfzdkrlurl1&st=ciu77f72&dl=1'
df = pd.read_csv(concepts_url, compression='gzip')

In [10]:
# Full pipeline on the loaded table; the bare name below displays the array.
betti = mainfunc(
    df,
    years_grid=YEARS_GRID,
    inv_counts_grid=INV_COUNTS_GRID,
)
betti

array([[[1.000e+00, 0.000e+00],
        [1.000e+00, 0.000e+00],
        [1.000e+00, 0.000e+00],
        ...,
        [1.000e+00, 0.000e+00],
        [1.000e+00, 0.000e+00],
        [1.000e+00, 0.000e+00]],

       [[2.000e+00, 0.000e+00],
        [2.000e+00, 0.000e+00],
        [2.000e+00, 0.000e+00],
        ...,
        [2.000e+00, 0.000e+00],
        [2.000e+00, 0.000e+00],
        [2.000e+00, 0.000e+00]],

       [[4.000e+00, 0.000e+00],
        [4.000e+00, 0.000e+00],
        [4.000e+00, 0.000e+00],
        ...,
        [4.000e+00, 0.000e+00],
        [4.000e+00, 0.000e+00],
        [4.000e+00, 0.000e+00]],

       ...,

       [[4.139e+03, 0.000e+00],
        [4.139e+03, 0.000e+00],
        [4.138e+03, 0.000e+00],
        ...,
        [2.565e+03, 8.600e+01],
        [2.565e+03, 8.600e+01],
        [3.130e+02, 3.536e+03]],

       [[4.143e+03, 0.000e+00],
        [4.143e+03, 0.000e+00],
        [4.142e+03, 0.000e+00],
        ...,
        [2.538e+03, 9.300e+01],
        [2.538e+03