## My First Cycle Rep
Getting an early cycle rep!!!

The parameters are mostly arbitrary, and this isn't final in any way, but it's cool.

#### Preliminaries
The network threshold is 10x what Jingyi suggests in her paper.

In [1]:
# load some packages
import Gavin.utils.make_network_v1 as mn
import plotly.graph_objects as go
from time import time
import networkx as nx
import pandas as pd
import oatpy as oat
import numpy as np

# config
DATA_PATH = 'datasets/concept_network/'
# Alternative concept files (uncomment one to analyse a different field):
# CONCEPT_FILE = 'articles_category_for_2l_abstracts_concepts_processed_v1_EX_102.csv.gz' # Applied Mathematics
# CONCEPT_FILE = 'concepts_Applied Economics_1402.csv.gz' # Applied Econ
CONCEPT_FILE = 'concepts_Zoology_608.csv' # Zoology

# Load the concept file through the project's filter helper (use a filtered data file).
# NOTE(review): the frequency cutoffs look like fractions of articles; if so,
# 0.0006 is 0.06% and 0.005 is 0.5% -- confirm against filter_article_concept_file.
concept_file_path = f'{DATA_PATH}{CONCEPT_FILE}'
article_concept_df = mn.filter_article_concept_file(
    concept_file_path,
    relevance_cutoff=0.7,
    min_article_freq=0.0006, # 0.06% if expressed as a fraction
    max_article_freq=0.005, # 0.5% if expressed as a fraction
    normalize_year=True,
    year_min=1920,
)

#### Problem Setup
Take the concept-article dataframe and turn it into a network, then a distance matrix.

In [2]:
# Build the concept graph from the filtered article/concept table.
G = mn.gen_concept_network(article_concept_df)

# Adjacency matrix whose entries are the edges' 'norm_year' attribute.
adj = nx.adjacency_matrix(G, weight='norm_year')

# Node origin years ('norm_year' per node) go on the diagonal.
# Original author's note: these break the cycle reps (reason unknown).
norm_year_by_node = nx.get_node_attributes(G, 'norm_year')
node_births = np.array(list(norm_year_by_node.values()))
adj.setdiag(node_births)

# Re-sort the sparse indices after editing the diagonal.
adj = adj.sorted_indices()

#### Homology Calculation
We set up and compute the homology, then do some basic visualizations.

In [3]:
start = time()

# Set up the problem: factor the boundary matrix built from the dissimilarity
# matrix. (Original note: two functions do this; the other one is unexplored.)
factored = oat.rust.FactoredBoundaryMatrixVr(
    dissimilarity_matrix=adj,
    homology_dimension_max=2,
)

# Solve homology. Both flags need to be True to be able to make a barcode;
# per the original note this makes the problem take ~30% longer (1:30ish).
homology = factored.homology(
    return_cycle_representatives=True,
    return_bounding_chains=True,
)

elapsed = time() - start
f'Homology calculation took {elapsed} secs'

'Homology calculation took 1.9817090034484863 secs'

In [4]:
# persistence diagram
fig = oat.plot.pd(homology)
fig.update_layout(width=600, height=500, margin={'l': 20, 'r': 20, 't': 20, 'b': 20})
fig.show()

In [5]:
# barcode diagram
fig = oat.plot.barcode(homology)
fig.update_layout(width=1000, height=500, margin={'l': 20, 'r': 20, 't': 20, 'b': 20})
fig.show()

#### Cycle Rep
Find a cycle rep!

In [6]:
# Show only the rows of the homology table whose 'dimension' column equals 1.
is_dim_one = homology['dimension'] == 1
homology[is_dim_one]

Unnamed: 0_level_0,dimension,birth,death,birth simplex,death simplex,cycle representative,cycle nnz,bounding chain,bounding nnz
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
226,1,1.000000,inf,"[244, 280]",,simplex filtration coefficient 0 [24...,11,,
227,1,1.000000,inf,"[220, 264]",,"simplex filtration coefficient 0 [220,...",9,,
228,1,1.000000,inf,"[98, 191]",,"simplex filtration coefficient 0 [98,...",6,,
229,1,1.000000,inf,"[86, 290]",,simplex filtration coefficient 0 [8...,23,,
230,1,1.000000,inf,"[50, 116]",,simplex filtration coefficient 0 [5...,21,,
...,...,...,...,...,...,...,...,...,...
662,1,0.742574,0.910891,"[123, 124]","[123, 227, 250]","simplex filtration coefficient 0 [123,...",4,simplex filtration coefficient 0 ...,18.0
663,1,0.742574,inf,"[13, 77]",,simplex filtration coefficient 0 [13...,9,,
664,1,0.722772,0.762376,"[13, 164]","[59, 148, 164]","simplex filtration coefficient 0 [13,...",6,simplex filtration coefficient 0 [...,4.0
665,1,0.683168,0.811881,"[59, 164]","[210, 212, 215]",simplex filtration coefficient 0 [5...,12,simplex filtration coefficient 0 ...,10.0


In [7]:
## Representative Cycle
index_in_homology = 666 # i think this is a cool one

# optimization problem
start = time()
optimal = factored.optimize_cycle(
        birth_simplex=homology.loc[index_in_homology, 'birth simplex'],
        problem_type='preserve PH basis'
    )
print(f'Optimizaiton took {time() - start} secs')

# Drop every simplex whose coefficient rounds to zero.
dirty_optimal = optimal.loc['optimal cycle', 'chain']
clean_optimal = dirty_optimal[dirty_optimal['coefficient'].astype(float).round() != 0]
print(f'Removing {len(dirty_optimal)-len(clean_optimal)}/{len(dirty_optimal)} degenerate simplexes, {len(clean_optimal)} simplicies left')

# rep_cycle_nodes = list(np.array(G.nodes)[pd.to_numeric(clean_optimal['simplex'].explode().drop_duplicates()).tolist()])


def order_cycle_vertices(edges):
    """Walk a set of edges end-to-end and return the vertices in visiting order.

    Parameters
    ----------
    edges : iterable of 2-element sequences
        Unordered edges, each given as a pair of vertex indices.

    Returns
    -------
    list
        One vertex per edge, ordered so that consecutive vertices share an
        edge. The walk starts at the second vertex of the last edge. An empty
        input yields an empty list.

    Raises
    ------
    ValueError
        If, at some step, no remaining edge touches the current vertex (the
        edges do not form a single connected walk).
    """
    # Copy each edge so the simplex lists stored in the chain table are never mutated.
    remaining = [list(edge) for edge in edges]
    if not remaining:
        return []

    ordered = [remaining.pop()[1]]
    while remaining:
        tail = ordered[-1]
        for position, edge in enumerate(remaining):
            if tail in edge:
                break
        else:
            # Without this guard the walk would silently continue with an unrelated edge.
            raise ValueError(
                f'No remaining edge contains vertex {tail}; the edges do not form a single connected cycle'
            )
        first, second = remaining.pop(position)
        # Step to whichever endpoint is not the vertex we arrived from.
        ordered.append(first if second == tail else second)
    return ordered


# Translate the ordered vertex indices into node names.
rep_cycle_nodes = list(np.array(G.nodes)[order_cycle_vertices(clean_optimal['simplex'])])

rep_cycle_nodes


Finished construcing L1 optimization program.
Constraint matrix has 61 nonzero entries.
Passing program to solver.
Optimizaiton took 0.01190805435180664 secs
Removing 0/5 degenerate simplexes, 5 simplicies left

Done solving.
MINILP solution: Solution { direction: Minimize, num_vars: 68, num_constraints: 96, objective: 2.435643564356436 }


['pest population',
 'pest control strategy',
 'pest management',
 'insect pest management',
 'crop production']

In [8]:
# Representative bounding chain
start = time()
optimal = factored.optimize_bounding_chain(
        birth_simplex=homology.loc[index_in_homology, 'birth simplex'],
    )
print(f'Optimizaiton took {time() - start} secs')

# Default to "no bounding chain". Both names are always defined so that the
# visualizer further down never hits a NameError (or a stale value from an
# earlier run) when the optimizer returns None.
bounding_chain_tris = []
bounding_chain_nodes = []

if optimal is not None:
    # Drop every simplex whose coefficient rounds to zero.
    dirty_optimal = optimal.loc['optimal bounding chain', 'chain']
    clean_optimal = dirty_optimal[dirty_optimal['coefficient'].astype(float).round() != 0]
    print(f'Removing {len(dirty_optimal)-len(clean_optimal)}/{len(dirty_optimal)} degenerate simplexes, {len(clean_optimal)} simplicies left')

    node_names = np.array(G.nodes)
    # Each simplex (triangle) as a list of node names.
    bounding_chain_tris = [list(node_names[tri]) for tri in clean_optimal['simplex']]
    # Every distinct node that appears in any triangle, in first-seen order.
    bounding_chain_nodes = list(node_names[pd.to_numeric(clean_optimal['simplex'].explode().drop_duplicates()).tolist()])
bounding_chain_nodes


Finished construcing L1 optimization program.
Constraint matrix has 166 nonzero entries.
Passing program to solver.
Optimizaiton took 0.0983128547668457 secs
Removing 0/5 degenerate simplexes, 5 simplicies left

Done solving.
MINILP solution: Solution { direction: Minimize, num_vars: 140, num_constraints: 206, objective: 3.336633663366337 }
max difference in boundaries: None


['pest control strategy',
 'pest management',
 'pest population',
 'crop production',
 'insect pest management',
 'insect population',
 'integrate pest management']

#### Visualizer
Create a visualization of the hole and how it closes using the cycle rep and bounding chain.

In [9]:
# params
np.random.seed(10)
MIN_YEAR = 1920
MAX_YEAR = 2021


def denorm_year(ny):
    """Map a normalized year back to a calendar year, capped at MAX_YEAR + 1.

    Parameters
    ----------
    ny : float
        Normalized year; 0 maps to MIN_YEAR and 1 maps to MAX_YEAR.

    Returns
    -------
    numpy.int64
        ``ny * (MAX_YEAR - MIN_YEAR) + MIN_YEAR`` truncated to an integer and
        never larger than ``MAX_YEAR + 1``. An infinite ``ny`` (a feature that
        never dies) therefore yields ``MAX_YEAR + 1``. NaN compares false
        against the cap and so also yields ``MAX_YEAR + 1``.
    """
    year = ny * (MAX_YEAR - MIN_YEAR) + MIN_YEAR
    # Clamp BEFORE converting to an integer: infinity cannot be converted,
    # so converting first would fail for features with death == inf.
    return np.int64(min(MAX_YEAR + 1, year))

# Nodes not in the cycle that help close it. Sorted because set iteration
# order over strings changes between kernel sessions (hash randomization);
# without a fixed order the seeded random start positions below would be
# handed to different nodes on every run.
diff_nodes = sorted(set(bounding_chain_nodes) - set(rep_cycle_nodes))
cycle_G = G.subgraph(set(bounding_chain_nodes + rep_cycle_nodes)) # graph of all nodes in the cycle

# birth = denorm_year(homology.loc[i, 'birth']) # animate between birth and death
# Start the animation one year before the earliest node in the subgraph.
birth = min(nx.get_node_attributes(cycle_G, 'year').values())-1
death = denorm_year(homology.loc[index_in_homology, 'death'])

# Place the cycle nodes evenly on the unit circle, in cycle order.
theta = np.linspace(0, 2*np.pi, len(rep_cycle_nodes)+1)[:-1]
x = np.cos(theta)
y = np.sin(theta)

pos = dict()
for i, n in enumerate(rep_cycle_nodes):
    pos[n] = np.array([x[i], y[i]])
# The remaining nodes start near the origin with a small random jitter.
for n in diff_nodes:
    pos[n] = np.random.normal(0, 0.01, size=2)
if len(diff_nodes) > 0:
    # Let only the extra nodes move; the cycle nodes stay fixed on the circle.
    pos = nx.spring_layout(cycle_G, k=2/np.sqrt(len(bounding_chain_nodes)), pos=pos, fixed=rep_cycle_nodes)

def viz_graph(G, yr):
    """Build the plotly traces showing the state of graph ``G`` at year ``yr``.

    Only triangles, edges and nodes whose 'year' attribute is <= ``yr`` are
    drawn. Node coordinates are read from the module-level ``pos`` mapping and
    triangles from the module-level ``bounding_chain_tris`` list.

    Parameters
    ----------
    G : networkx graph
        Graph whose nodes and edges carry a 'year' attribute.
    yr : int
        Year to render.

    Returns
    -------
    tuple
        ``(tri_trace, edge_trace, node_trace)``, three ``go.Scatter`` traces.
    """
    # triangle locations
    t_x = [] # triangle x
    t_y = [] # triangle y
    for t in bounding_chain_tris:
        # Offset the vertex list by one and pair it with itself to get the
        # triangle's three edges; the triangle exists once its latest edge does.
        t_yr = max([G.edges[e]['year'] for e in zip(t, t[1:]+[t[0]])]) # year triangle exists
        if t_yr <= yr:
            a, b, c = t # triangle nodes
            a_x, a_y = pos[a] # a position
            b_x, b_y = pos[b] # b position
            c_x, c_y = pos[c] # c position
            # Close the outline back at a; None separates one polygon from the next.
            t_x += [a_x, b_x, c_x, a_x, None]
            t_y += [a_y, b_y, c_y, a_y, None]

    tri_trace = go.Scatter(
            x=t_x, y=t_y,
            hoverinfo='none',
            mode='none',
            fill='toself',
            fillcolor='gray',
            opacity=0.25
        )

    # edge locations
    e_x = [] # edge x
    e_y = [] # edge y
    for e in G.edges:
        # Look attributes up on the graph that was passed in, not on a global.
        if G.edges[e]['year'] <= yr:
            u, v = e # edge goes from u to v
            u_x, u_y = pos[u] # u position
            v_x, v_y = pos[v] # v position
            e_x += [u_x, v_x, None]
            e_y += [u_y, v_y, None]

    edge_trace = go.Scatter(
            x=e_x, y=e_y,
            hoverinfo='none',
            mode='lines',
            line=dict(width=5, color='#888')
        )

    # node locations
    n_x = [] # node x
    n_y = [] # node y
    n_t = [] # node text
    for n in G.nodes:
        if G.nodes[n]['year'] <= yr:
            node_x, node_y = pos[n]
            n_x.append(node_x)
            n_y.append(node_y)
            n_t.append(n.title())

    node_trace = go.Scatter(
            x=n_x, y=n_y,
            hoverinfo='none',
            mode='markers+text',
            text=n_t,
            marker=dict(
                    size=25,
                    line_width=2
                )
        )

    return tri_trace, edge_trace, node_trace

# Build one animation frame per year, from birth through death (inclusive).
frames = []
for frame_no, yr in enumerate(range(birth, death+1)):
    tri_trace, edge_trace, node_trace = viz_graph(cycle_G, yr) # viz objects

    frame = go.Frame(data=[tri_trace, edge_trace, node_trace], name=yr)
    frames.append(frame)

    if frame_no == 0:
        # The first year's traces double as the figure's initial data.
        tri_0_trace, edge_0_trace, node_0_trace = tri_trace, edge_trace, node_trace

# create figure
fig = go.Figure(
    data=[tri_0_trace, edge_0_trace, node_0_trace],
    frames=frames,
)

## the rest is copied from the plotly documentation example on mri volume slices
def frame_args(duration):
    """Return the animation options dict for an 'animate' call.

    ``duration`` is used as the per-frame duration; the mode is 'immediate',
    'fromcurrent' is True, and the transition has zero duration with linear
    easing.
    """
    frame_options = dict(duration=duration)
    transition_options = dict(duration=0, easing='linear')
    return dict(
        frame=frame_options,
        mode='immediate',
        fromcurrent=True,
        transition=transition_options,
    )
# Layout: hide the axes, pad the ranges around the node positions, and add the
# play / stop buttons plus a per-frame slider.
fig.update_layout(
        showlegend=False,
        width=500,
        height=550,
        margin=dict(l=20, r=20, t=20, b=20),
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[1.2*min([p[0] for p in pos.values()]), 1.2*max([p[0] for p in pos.values()])]),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[1.2*min([p[1] for p in pos.values()]), 1.2*max([p[1] for p in pos.values()])]),
        updatemenus = [dict(
                buttons=[
                        dict(
                                # None as the first argument animates through all frames.
                                args=[None, frame_args(200)],
                                label='&#9654;', # play symbol
                                method='animate'
                            ),
                        dict(
                                # [None] (a list holding None) halts the running animation;
                                # a bare None here would replay every frame instead.
                                args=[[None], frame_args(0)],
                                label='&#9724;', # stop symbol
                                method='animate'
                            )
                    ],
                direction='left',
                pad=dict(l=0, r=0, t=10, b=10),
                type='buttons',
                x=0.1,
                y=0
            )],
        sliders=[
                dict(
                        pad=dict(l=15, r=0, t=10, b=10),
                        len=0.9,
                        x=0.1,
                        y=0,
                        # One slider step per frame, jumping straight to that frame.
                        steps=[dict(
                                args=[[f.name], frame_args(0)],
                                label=f.name,
                                method='animate'
                            ) for f in fig.frames],
                    )
            ]
)
fig.show()