# Load networks

In [None]:
%reload_ext autoreload
%autoreload 2
import os,sys
sys.path.insert(1, os.path.join(sys.path[0], '..', 'module'))
import wiki
import numpy as np
import pandas as pd
import networkx as nx

In [None]:
path_networks = '/Users/harangju/Developer/data/wiki/graphs/'

In [None]:
topics = [
    'anatomy', 'biochemistry', 'cognitive science', 'evolutionary biology',
    'genetics', 'immunology', 'molecular biology', 'chemistry', 'biophysics',
    'energy', 'optics', 'earth science', 'geology', 'meteorology',
    'philosophy of language', 'philosophy of law', 'philosophy of mind',
    'philosophy of science', 'economics', 'accounting', 'education',
    'linguistics', 'law', 'psychology', 'sociology', 'electronics',
    'software engineering', 'robotics',
    'calculus', 'geometry', 'abstract algebra',
    'Boolean algebra', 'commutative algebra', 'group theory', 'linear algebra',
    'number theory', 'dynamical systems and differential equations'
]

In [None]:
# Load every dated topic network together with its barcode file.
networks = {}
dir_dated = os.path.join(path_networks, 'dated')
for topic in topics:
    print(topic, end=' ')
    networks[topic] = wiki.Net(
        path_graph=os.path.join(dir_dated, f"{topic}.pickle"),
        path_barcodes=os.path.join(dir_dated, f"{topic}.barcode"),
    )

In [None]:
# Physics is loaded with load_graph only (no barcode path is passed here).
# Guard the append so that re-running this cell does not add 'physics' to
# `topics` a second time (later loops iterate over `topics`).
if 'physics' not in topics:
    topics += ['physics']
networks['physics'] = wiki.Net()
networks['physics'].load_graph(os.path.join(path_networks, 'dated', 'physics.pickle'))

In [None]:
# Load `num_nulls` "null-target" replicate networks for every topic.
num_nulls = 10
null_target = {}
dir_null_target = os.path.join(path_networks, 'null-target')
for topic in topics:
    print(topic, end=' ')
    null_target[topic] = []
    for i in range(num_nulls):
        network = wiki.Net()
        network.load_graph(os.path.join(dir_null_target, f"{topic}-null-{i}.pickle"))
        null_target[topic].append(network)

In [None]:
# Load `num_nulls` "null-year" replicate networks for every topic.
num_nulls = 10
null_year = {}
dir_null_year = os.path.join(path_networks, 'null-year')
for topic in topics:
    print(topic, end=' ')
    null_year[topic] = []
    for i in range(num_nulls):
        network = wiki.Net()
        network.load_graph(os.path.join(dir_null_year, f"{topic}-null-{i}.pickle"))
        null_year[topic].append(network)

In [None]:
# Load the year-jittered replicate network(s) for every topic.
num_nulls = 1
null_jitter = {}
dir_null_jitter = os.path.join(path_networks, 'null-jitter [-1,1]')
for topic in topics:
    print(topic, end=' ')
    null_jitter[topic] = []
    for i in range(num_nulls):
        network = wiki.Net()
        network.load_graph(os.path.join(dir_null_jitter, f"{topic}-null-{i}.pickle"))
        null_jitter[topic].append(network)

# Plot

In [None]:
import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.subplots as ps
from IPython.display import display
plotly.offline.init_notebook_mode(connected=True)

In [None]:
# path_fig = '/Users/harangju/Box Sync/Research/my papers/wikipedia/results/'
path_fig = '/Users/harangju/Library/Mobile Documents/com~apple~CloudDocs/Documents/' +\
    'research/wikipedia/results'
save_fig = False

# Growing networks

In [None]:
# Build one row per (topic, node) with the node's year, community and core
# labels.  Rows are collected in a plain list and turned into a DataFrame once;
# concatenating a one-row frame per node is very slow for large networks.
records = []
for topic, network in networks.items():
    print(topic, end=' ')
    for node, attrs in network.graph.nodes(data=True):
        records.append((
            topic, node,
            attrs['year'], attrs['community'],
            attrs['core_be'], attrs['core_rb'],
            1,  # every node counts once; summed / cumulated below
        ))
comm_t = pd.DataFrame(
    records,
    columns=['topic','node','year','comm','core_be','core_rb','count']
)
# Attach the final size of each community / core_be group, then order by time.
comm_t = comm_t.merge(comm_t.groupby(['topic','comm'])['count'].sum(),
                      on=['topic','comm'],
                      suffixes=('','_topic_comm'))\
               .merge(comm_t.groupby(['topic','core_be'])['count'].sum(),
                      on=['topic','core_be'],
                      suffixes=('','_topic_core_be'))\
               .sort_values(by=['topic','year'])\
               .reset_index(drop=True)
# Running number of nodes in each group as years advance ...
comm_t['comm_count'] = comm_t.groupby(['topic','comm'])['count'].cumsum()
comm_t['core_be_count'] = comm_t.groupby(['topic','core_be'])['count'].cumsum()
# ... and the same as a fraction of the group's final size.
comm_t['comm_frac'] = comm_t['comm_count']/comm_t['count_topic_comm']
comm_t['core_be_frac'] = comm_t['core_be_count']/comm_t['count_topic_core_be']
comm_t = comm_t.drop(['count','count_topic_comm','count_topic_core_be'], axis=1)

In [None]:
comm_t

In [None]:
community = 1
network = networks['anatomy']
[node for node in network.graph.nodes if network.graph.nodes[node]['community']==community][:3]

In [None]:
communities = set([network.graph.nodes[node]['community'] for node in network.graph.nodes])

In [None]:
import operator

In [None]:
# For every topic, map each community label to its hub: the member node with
# the highest degree inside the subgraph induced by that community.
hubs = {}
for topic in topics:
    network = networks[topic]
    node_attrs = network.graph.nodes
    communities = {node_attrs[node]['community'] for node in node_attrs}
    hubs[topic] = {}
    for community in communities:
        members = [
            node for node in node_attrs
            if node_attrs[node]['community'] == community
        ]
        degree_by_node = dict(network.graph.subgraph(members).degree)
        # max over the dict keeps the first node among equal-degree ties
        hubs[topic][community] = max(degree_by_node, key=degree_by_node.get)

# Modularity

In [None]:
import os

path_plot = '5 modules'

# makedirs also creates missing parent directories and, with exist_ok=True,
# is a no-op when the directory is already there.
os.makedirs(os.path.join(path_fig, path_plot), exist_ok=True)

## Count

In [None]:
import os

# Create the output folder (and any missing parents) for the count figures.
os.makedirs(os.path.join(path_fig, path_plot, 'count'), exist_ok=True)

In [None]:
# Cumulative number of nodes per community over time, one line per community.
max_communities = 16  # only community labels 0..15 are drawn
for topic in ['physics']: #networks.keys():
    fig = go.Figure()
    data = comm_t[comm_t.topic==topic]
    for i in range(max_communities): #sorted(pd.unique(data.comm)):
        # A topic may have fewer communities than max_communities; skip the
        # missing labels instead of failing on hubs[topic][i].
        if i not in hubs[topic]:
            continue
        members = data[data.comm==i]
        fig.add_trace(
            go.Scatter(
                x=members['year'],
                y=members['comm_count'],
                mode='lines', name=hubs[topic][i],
            )
        )
    fig.update_layout(template='plotly_white',
                      title_text=topic,
                      xaxis={'range': [0,2100],
                             'title': 'year'},
                      # The axis is linear, so the range must be in data units.
                      # (A log10 upper bound only makes sense with type='log'.)
                      yaxis={'title': '# nodes',
                             'range': [1,int(np.max(data.comm_count))],
                             'type': 'linear'})
    fig.show()
    fig.write_image(f"{path_fig}/{path_plot}/count/{topic}.pdf")

## Growth

In [None]:
import os

# Create the output folder (and any missing parents) for the growth figures.
os.makedirs(os.path.join(path_fig, path_plot, 'growth'), exist_ok=True)

In [None]:
# One row of markers per community: the years in which its nodes appear.
max_communities = 16  # only community labels 0..15 are drawn
for topic in ['physics']: #networks.keys():
    fig = go.Figure()
    data = comm_t[comm_t.topic==topic]
    # Labels that actually exist for this topic (avoids KeyError on hubs).
    shown = [i for i in range(max_communities) if i in hubs[topic]]
    for i in shown:
        member_years = data[data.comm==i]['year']
        fig.add_trace(
            go.Scatter(
                x=member_years,
                y=i*np.ones(len(member_years)),
                mode='markers', name=hubs[topic][i],
            )
        )
    fig.update_layout(template='plotly_white',
                      title_text=topic,
                      showlegend=False,
                      xaxis={'range': [0,2100],
                             'title': 'year'},
                      # explicit bound instead of the loop variable left over
                      # from the loop above
                      yaxis={'title': 'communities (by hubs)',
                             'range': [-1,max_communities]})
    fig.update_yaxes(
        tickvals=shown,
        ticktext=[hubs[topic][i] for i in shown]
    )
    fig.show()
    fig.write_image(f"{path_fig}/{path_plot}/growth/{topic}.pdf")

In [None]:
years

In [None]:
# Number of nodes added per year, one line per community.
max_communities = 10  # only community labels 0..9 are drawn
for topic in networks.keys():
    fig = go.Figure()
    data = comm_t[comm_t.topic==topic]
    for i in range(max_communities): #sorted(pd.unique(data.comm)):
        years = data[data.comm==i].year
        # Skip labels with no nodes (min/max of an empty series would fail).
        if years.empty or i not in hubs[topic]:
            continue
        first_year, last_year = int(years.min()), int(years.max())
        # x[k] is the calendar year counted in y[k]; previously x started one
        # year early, so every point was drawn one year to the left.
        x = list(range(first_year, last_year+1))
        y = np.zeros(len(x))
        for year in years:
            y[year-first_year] += 1
        fig.add_trace(
            go.Scatter(x=x, y=y, mode='lines', name=hubs[topic][i])
        )
    fig.update_layout(template='plotly_white',
                      title_text=topic,
                      xaxis={'range': [0,2100],
                             'title': 'year'},
                      yaxis={'title': 'growth'})
    fig.show()
    # NOTE(review): this writes to the same growth/{topic}.pdf path as the
    # scatter cell above, so for shared topics the earlier figure is replaced.
    fig.write_image(f"{path_fig}/{path_plot}/growth/{topic}.pdf")

# Change in community

Alternative conceptions to scientific revolution [link](https://plato.stanford.edu/entries/scientific-revolutions/#SomAltConSciRev)

> a prototype for revolutionary reorientation in the sciences. Just because it did not involve the introduction of additional objects or concepts, the transition from Newtonian to Einsteinian mechanics illustrates with particular clarity the scientific revolution as a displacement of the conceptual network through which scientists view the world. (Kuhn, 1970, 102)

Perhaps we can frame paradigm shifts as changes in how knowledge is organized into communities.

In [None]:
import os

path_plot = '5 modules'

# Creates missing parents too; no error if the directory already exists.
os.makedirs(os.path.join(path_fig, path_plot), exist_ok=True)

In [None]:
import os

# Create the output folder (and any missing parents) for community figures.
os.makedirs(os.path.join(path_fig, path_plot, 'community'), exist_ok=True)

In [None]:
import plotly.express as px

px.colors.qualitative.Plotly[:3]

In [None]:
from networkx.algorithms.community import greedy_modularity_communities

## Multilayer

In [None]:
def compute_multinet(g, interlayer_weight=0.0001):
    """Stack yearly snapshots of `g` into one multilayer directed graph.

    For every distinct node 'year' value Y there is one layer holding the
    subgraph of nodes with year <= Y.  Layer copies of a node are named
    "<node>_<Y>" and keep the node's own 'year' attribute.  Each node in a
    layer is linked to its copy in the next layer with weight
    `interlayer_weight`.

    Years are de-duplicated: iterating over one entry per node rebuilt the
    same layer repeatedly and, whenever two nodes shared a year, added
    "<node>_<Y>" -> "<node>_<Y>" self-loops as inter-layer edges.

    Parameters
    ----------
    g : networkx graph whose nodes all carry a 'year' attribute.
    interlayer_weight : 'weight' attribute of the edges between layers.

    Returns
    -------
    networkx.DiGraph
    """
    multinet = nx.DiGraph()
    years = sorted(set(nx.get_node_attributes(g, 'year').values()))
    prev_year, prev_nodes = None, []
    for year in years:
        nodes = [node for node in g.nodes if g.nodes[node]['year']<=year]
        subgraph = nx.subgraph(g, nodes)
        multinet.add_nodes_from(
            (f"{n}_{year}", {'year': g.nodes[n]['year']}) for n in subgraph.nodes
        )
        multinet.add_edges_from(
            (f"{s}_{year}", f"{t}_{year}") for s,t in subgraph.edges
        )
        if prev_year is not None:
            # couple each node of the previous layer to itself in this layer
            multinet.add_edges_from(
                ((f"{n}_{prev_year}", f"{n}_{year}") for n in prev_nodes),
                weight=interlayer_weight
            )
        prev_year, prev_nodes = year, nodes
    return multinet

In [None]:
topics = ['cognitive science']

In [None]:
# Build the multilayer network of each topic from that topic's own graph
# (previously this passed whatever `graph` happened to be left in the kernel).
multinets = {}
for topic in topics:
    multinets[topic] = compute_multinet(networks[topic].graph)

In [None]:
multicomms = {}
for topic in topics:
    multicomms[topic] = greedy_modularity_communities(nx.Graph(multinets[topic]))

In [None]:
# Scatter of multilayer communities: x = layer (year rank), y = node (ordered
# by year), colour = community found on the multilayer graph.
for topic in topics:
    fig = go.Figure()
    graph = networks[topic].graph
    years = sorted(set(nx.get_node_attributes(graph, 'year').values()))
    nodes = sorted(graph.nodes, key=lambda n: graph.nodes[n]['year'])
    # O(1) position lookups instead of list.index inside the inner loop
    year_position = {year: i for i, year in enumerate(years)}
    node_position = {node: i for i, node in enumerate(nodes)}
    for i, c in enumerate(multicomms[topic]):
        x = []
        y = []
        for node in c:
            # Layer nodes are named "<node>_<year>"; split on the LAST
            # underscore so node names containing '_' are kept intact.
            name, year = node.rsplit('_', 1)
            x.append(year_position[int(year)])
            y.append(node_position[name])
        fig.add_trace(
            go.Scatter(
                x=x, y=y,
                mode='markers',
                marker={'color': px.colors.qualitative.Plotly[i%10]},
                name=i
            )
        )
    fig.update_yaxes(ticktext=[], tickvals=[])
    fig.update_layout(
        template='plotly_white',
        title_text=topic,
        xaxis={'title': 'time'},
        yaxis={'title': 'nodes'}
    )
    fig.show()
# fig.write_image(f"{path_fig}/{path_plot}/community/{topic}.pdf")

## By layer

In [None]:
def compute_comm_by_layer(graph):
    """Detect communities independently on each cumulative snapshot of `graph`.

    `years` holds one entry per node that has a 'year' attribute (sorted, not
    de-duplicated), and one snapshot is analysed per entry: the subgraph of
    nodes with year <= that entry.  Snapshots with fewer than three nodes are
    returned as a single community holding all their nodes.

    Returns
    -------
    (comms, years) where comms[i] is the list of communities of snapshot i.
    """
    years = sorted(nx.get_node_attributes(graph, 'year').values())
    comms = []
    for year in years:
        nodes = [n for n in graph.nodes if graph.nodes[n]['year'] <= year]
        if len(nodes) >= 3:
            snapshot = nx.Graph(nx.subgraph(graph, nodes))
            layer = greedy_modularity_communities(snapshot)
        else:
            layer = [nodes]
        comms.append(layer)
    return comms, years

In [None]:
# Scatter of per-snapshot communities: x = snapshot index, y = node position.
for topic in ['cognitive science']: #topics:
    fig = go.Figure()
    graph = networks[topic].graph
    comms, years = compute_comm_by_layer(graph)
    nodes = [n for y in years for n in graph.nodes if graph.nodes[n]['year']==y]
    # First position of every node in `nodes` (what nodes.index returned),
    # looked up in O(1) instead of scanning the list for every marker.
    node_position = {}
    for position, n in enumerate(nodes):
        node_position.setdefault(n, position)
    for i, layer in enumerate(comms):
        for j, c in enumerate(layer):
            fig.add_trace(
                go.Scatter(
                    x=i*np.ones(len(c)), y=[node_position[n] for n in c],
                    mode='markers', name=j
                )
            )
    fig.update_yaxes(ticktext=[], tickvals=[])
    fig.update_layout(
        template='plotly_white',
        title_text=topic,
        xaxis={'title': 'time'},
        yaxis={'title': 'nodes'}
    )
    fig.show()
# fig.write_image(f"{path_fig}/{path_plot}/community/{topic}.pdf")

## Leiden

[link](http://netwiki.amath.unc.edu/GenLouvain/GenLouvain)
[leidenalg](https://leidenalg.readthedocs.io/en/latest/intro.html)

In [None]:
import scipy as sp
import leidenalg as la
import igraph as ig
import pickle

### iGraph

In [None]:
def networkx_to_igraph(nx_graph, vertex_id=None):
    """Convert a networkx graph into an igraph.Graph.

    Vertices are created in the iteration order of `nx_graph.nodes` and get
    the attributes 'name' (the networkx node) and 'year'.  Every edge must
    carry a 'weight' attribute, which is copied to the igraph edge.

    Parameters
    ----------
    nx_graph : networkx graph with a 'year' on every node and a 'weight' on
        every edge.
    vertex_id : optional sequence stored as the vertex attribute 'id'; it has
        to be ordered like `list(nx_graph.nodes)`.

    Returns
    -------
    igraph.Graph (built with `ig.Graph()`, i.e. with igraph's defaults).
    """
    nodes = list(nx_graph.nodes)
    # node -> vertex index; avoids an O(n) nodes.index() per edge endpoint
    index_of = {node: i for i, node in enumerate(nodes)}
    edges = list(nx_graph.edges)
    ig_graph = ig.Graph()
    ig_graph.add_vertices(list(range(len(nodes))))
    ig_graph.vs['name'] = nodes
    ig_graph.vs['year'] = [nx_graph.nodes[n]['year'] for n in nodes]
    ig_graph.add_edges([(index_of[s], index_of[t]) for s,t in edges])
    ig_graph.es['weight'] = [nx_graph.edges[s,t]['weight'] for s,t in edges]
    if vertex_id:
        ig_graph.vs['id'] = vertex_id
    return ig_graph

In [None]:
g = networkx_to_igraph(networks['earth science'].graph)
partition = la.find_partition(g, la.ModularityVertexPartition)
layout = g.layout('circle')
ig.plot(
    partition, layout=layout, bbox=(500, 500), margin=50,
    vertex_size=5, vertex_label_size=10,
    edge_width=0.1, edge_curved=True
)

### Temporal partition

In [None]:
# Temporal Leiden partition of one topic: one slice per distinct year, each
# slice holding all nodes with year <= that year.
graph = networks['earth science'].graph
nodes = list(graph.nodes)
node_index = {n: i for i, n in enumerate(nodes)}
# one slice per distinct year (one entry per node would repeat identical slices)
years = sorted(set(nx.get_node_attributes(graph, 'year').values()))
slices = [
    nx.subgraph(graph, [n for n in nodes if graph.nodes[n]['year']<=year])
    for year in years
]
# A subgraph view is not guaranteed to iterate its nodes in the order of the
# list it was built from.  networkx_to_igraph numbers vertices in the view's
# order, so both the vertex ids and nodes_by_year are taken from the view
# itself to keep membership[i][j] aligned with nodes_by_year[i][j].
nodes_by_year = [list(s.nodes) for s in slices]
membership, improvement = la.find_partition_temporal(
    [
        networkx_to_igraph(s, [node_index[n] for n in ns])
        for s, ns in zip(slices, nodes_by_year)
    ],
    la.ModularityVertexPartition,
    interslice_weight=1,
)
membership[:3], improvement

### Function

In [None]:
def calc_num_changes(nodes, nodes_by_year, membership):
    """Count, per time slice, how many nodes switched community.

    Parameters
    ----------
    nodes : sequence of nodes to track.
    nodes_by_year : list of node lists, one per slice.
    membership : membership[y][j] is the community of nodes_by_year[y][j].

    Returns
    -------
    numpy array with one entry per slice: the number of tracked nodes whose
    community differs from the one they had in the previous slice in which
    they appear.  A node's first appearance never counts as a change.

    Notes
    -----
    The number of slices is taken from `nodes_by_year`; the function used to
    read a global `years` instead.  A node's changes are written into the
    trailing columns, which assumes that a node, once present, is present in
    every later slice.  Nodes that appear in no slice contribute nothing.
    """
    num_slices = len(nodes_by_year)
    # First position of each node inside each slice (same answer as
    # list.index, without rescanning the slice for every node).
    positions = []
    for ns in nodes_by_year:
        lookup = {}
        for j, n in enumerate(ns):
            lookup.setdefault(n, j)
        positions.append(lookup)
    matrix = np.zeros((len(nodes), num_slices))
    for i, node in enumerate(nodes):
        labels = [
            membership[y][lookup[node]]
            for y, lookup in enumerate(positions)
            if node in lookup
        ]
        if not labels:
            continue
        # 1 wherever the label differs from the previous slice's label
        diffs = np.diff(labels, prepend=labels[0]).astype(bool).astype(int)
        matrix[i, num_slices-diffs.size:] = diffs
    return matrix.sum(axis=0)

### Plot example

In [None]:
import os

# `Cjrs` (the inter-slice coupling weight used as the folder name) was only
# assigned in a later cell, so this cell failed on a fresh kernel.  Set it
# here to the same value that later cell uses.
Cjrs = 0.001
os.makedirs(os.path.join(path_fig, path_plot, 'community', f"{Cjrs}"), exist_ok=True)

In [None]:
len(networks['geology'].graph.nodes)

In [None]:
Cjrs = 0.001

In [None]:
# graph = networks['cognitive science'].graph
graph = networks['Boolean algebra'].graph
nodes = [n for n in graph.nodes]
# nodes.remove('Descriptive linguistics')
# nodes.remove('Alan Turing')
lesioned_graph = nx.subgraph(graph, nodes)

In [None]:
# Temporal Leiden partition of one graph, plotted as node-by-year membership
# with the number of membership changes per year overlaid.
memberships = {}
improvements = {}
for topic in ['cognitive science']:
    fig = go.Figure()
#     graph = networks[topic].graph
    # NOTE(review): `lesioned_graph` is whatever the cell above built; the
    # title and pickle name below still use `topic` -- confirm they match.
    graph = lesioned_graph
    nodes = list(graph.nodes)
    node_index = {n: i for i, n in enumerate(nodes)}
    sorted_nodes = sorted(
        nodes,
        key=lambda node: graph.nodes[node]['year']
    )
    sorted_position = {n: i for i, n in enumerate(sorted_nodes)}
    years = sorted(set(nx.get_node_attributes(graph, 'year').values()))
    slices = [
        nx.subgraph(graph, [n for n in nodes if graph.nodes[n]['year']<=year])
        for year in years
    ]
    # A subgraph view is not guaranteed to iterate nodes in the order of the
    # list it was built from.  networkx_to_igraph numbers vertices in the
    # view's order, so vertex ids and nodes_by_year are both taken from the
    # view to keep memberships[topic][i][j] aligned with nodes_by_year[i][j].
    nodes_by_year = [list(s.nodes) for s in slices]
    memberships[topic], improvements[topic] = la.find_partition_temporal(
        [
            networkx_to_igraph(s, [node_index[n] for n in ns])
            for s, ns in zip(slices, nodes_by_year)
        ],
        la.ModularityVertexPartition,
        interslice_weight=Cjrs,
        n_iterations=-1,
    )
    # NOTE(review): this stores the whole dicts, not only this topic's entry.
    with open(
        os.path.join(path_fig, path_plot, 'community', f"{topic}.pickle"),
        'wb'
    ) as f:
        pickle.dump((memberships, improvements), f)
    partitions = sorted(list(set([j for i in memberships[topic] for j in i])))
    for part in partitions:
        xy = [
            (i, sorted_position[n])
            for i, ns in enumerate(nodes_by_year)
            for j, n in enumerate(ns)
            if memberships[topic][i][j]==part
        ]
        fig.add_trace(
            go.Scatter(
                x=[i[0] for i in xy],
                y=[i[1] for i in xy],
                mode='markers',
                marker={'size': 3},
                name=part,
                hovertext=[sorted_nodes[i[1]] for i in xy]
            )
        )
    num_changes = calc_num_changes(sorted_nodes, nodes_by_year, memberships[topic])
    fig.add_trace(
        go.Scatter(
            x=np.arange(len(years)),
            y=num_changes,
            name='number of changes',
            marker_color='black',
            opacity=0.5
        )
    )
    fig.update_yaxes(ticktext=[], tickvals=[])
    # about ten ticks; at least 1 so fewer than ten years does not give a
    # zero slice step
    tick_step = max(1, len(years)//10)
    fig.update_xaxes(
        tickvals=list(range(len(years)))[0::tick_step],
        ticktext=years[0::tick_step]
    )
    fig.update_layout(
        template='plotly_white',
        title_text=topic,
        xaxis={'title': 'years'},
        yaxis={'title': 'nodes'},
        legend={'x': 0, 'y': 1}
    )
    fig.show()
#     fig.write_image(
#         os.path.join(path_fig, path_plot, 'community', f"{Cjrs}", f"{topic}.pdf")
#     )

In [None]:
from scipy.stats import expon

# Histogram of the (scaled) number of membership changes per year with a
# fitted exponential density drawn on top.
for topic in ['cognitive science']:
#     graph = networks[topic].graph
    graph = lesioned_graph
    nodes = list(graph.nodes)
    years = sorted(set(nx.get_node_attributes(graph, 'year').values()))
    nodes_by_year = [
        [n for n in nodes if graph.nodes[n]['year']<=year]
        for year in years
    ]
#     norm_changes = changes / np.array([len(ns) for ns in nodes_by_year])
    # scale each year's count by the fraction of all nodes present that year
    norm_changes = num_changes * np.divide(
        np.array([len(ns) for ns in nodes_by_year]),
        np.ones(len(nodes_by_year)) * len(nodes)
    )
    hist, bin_edges = np.histogram(norm_changes, density=True, bins=20)
    bins = 0.5 * (bin_edges[:-1] + bin_edges[1:])
    fig = px.bar(x=bins, y=hist)
    x = np.arange(.2, np.max(bin_edges), .1)
    # expon.fit returns (loc, scale) with scale = 1/rate; evaluate the fitted
    # density through expon.pdf rather than using scale as if it were the rate.
    loc, scale = expon.fit(norm_changes)
    fig.add_trace(
        go.Scatter(x=x, y=expon.pdf(x, loc=loc, scale=scale), mode='lines', opacity=0.5)
    )
    fig.update_layout(
        width=600, height=300,
        template='plotly_white',
        title_text=topic, showlegend=False,
        xaxis={'title': 'number of changes'},
        yaxis={'title': 'density'}
    )
    fig.show()
#     fig.write_image(
#         os.path.join(path_fig, path_plot, 'community', f"{Cjrs}", f"distribution_{topic}.jpg")
#     )

In [None]:
# Sanity check: all three should have one entry per distinct year.
# (`changes` is not defined anywhere in this notebook, so it is not included.)
len(num_changes), len(years), len(nodes_by_year)

**Stability**

The field is unstable early on and becomes more stable later.

In [None]:
year_max_delta = np.argmax(num_changes)+1
years[year_max_delta]

In [None]:
[n for n in graph.nodes if graph.nodes[n]['year']==years[year_max_delta]]

In [None]:
set(nodes_by_year[year_max_delta])-set(nodes_by_year[year_max_delta-1])

### Δmembership

Paradigm shift measured by how much the addition of a node changes the existing community structure.

In [None]:
len(memberships[topic]), len(nodes_by_year), len(nodes)

In [None]:
len(calc_num_changes(nodes, nodes_by_year, memberships[topic]))

In [None]:
membership_by_node = [
    [
        memberships[topic][y][ns.index(node)]
        for y, ns in enumerate(nodes_by_year)
        if node in ns
    ]
    for node in sorted_nodes
]

In [None]:
np.concatenate(
    (
        np.array([membership_by_node[0]]),
        np.array([np.diff(
            membership_by_node[0],
            prepend=membership_by_node[0][0]
        ).astype(bool).astype(int)])
    ),
    axis=0
)

In [None]:
memdiff = [np.diff(x, prepend=x[0]).astype(bool).astype(int) for x in membership_by_node]
memdiff[0:-1:30]

In [None]:
matrix = np.zeros((len(nodes), len(years)))
for i, diffs in enumerate(memdiff):
    matrix[i, len(years)-diffs.size:] = diffs
matrix

In [None]:
matrix.sum(axis=0)

In [None]:
len(matrix.sum(axis=0)), len(years), len(matrix.sum(axis=1)), len(nodes)

### Change point detection

In [None]:
from rpy2 import robjects
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector

In [None]:
cpt = importr('changepoint')
cpt

In [None]:
num_changes

In [None]:
FloatVector(num_changes)

In [None]:
pts = cpt.cpts(cpt.cpt_mean(FloatVector(num_changes), method='BinSeg'))
pts

In [None]:
pts = cpt.cpts(cpt.cpt_mean(FloatVector(num_changes), method='BinSeg', Q=2))
pts

In [None]:
pts = cpt.cpts(cpt.cpt_meanvar(FloatVector(num_changes), method='BinSeg', test_stat='Poisson', Q=2))
pts

In [None]:
pts = cpt.cpts(
    cpt.cpt_var(
        FloatVector(num_changes),
        method='BinSeg',
    )
)
pts

In [None]:
pts[0]

In [None]:
# Same membership plot as above, with the detected change points (`pts`)
# drawn as dashed vertical lines.
for topic in ['cognitive science']:
    fig = go.Figure()
#     graph = networks[topic].graph
    graph = lesioned_graph
    nodes = list(graph.nodes)
    sorted_nodes = sorted(
        nodes,
        key=lambda node: graph.nodes[node]['year']
    )
    sorted_position = {n: i for i, n in enumerate(sorted_nodes)}
    years = sorted(set(nx.get_node_attributes(graph, 'year').values()))
    # Node order inside each slice is taken from the subgraph view, because
    # that is the order networkx_to_igraph used when the partition stored in
    # memberships[topic] was computed from these subgraphs.
    nodes_by_year = [
        list(nx.subgraph(
            graph, [n for n in nodes if graph.nodes[n]['year']<=year]
        ).nodes)
        for year in years
    ]
    partitions = sorted(list(set([j for i in memberships[topic] for j in i])))
    for part in partitions:
        xy = [
            (i, sorted_position[n])
            for i, ns in enumerate(nodes_by_year)
            for j, n in enumerate(ns)
            if memberships[topic][i][j]==part
        ]
        fig.add_trace(
            go.Scatter(
                x=[i[0] for i in xy],
                y=[i[1] for i in xy],
                mode='markers',
                marker={'size': 3},
                name=part,
                hovertext=[sorted_nodes[i[1]] for i in xy]
            )
        )
    num_changes = calc_num_changes(sorted_nodes, nodes_by_year, memberships[topic])
    fig.add_trace(
        go.Scatter(
            x=np.arange(len(years)),
            y=num_changes,
            name='number of changes',
            marker_color='black',
            opacity=0.5
        )
    )
    # One vertical segment per change point, separated by None so plotly does
    # not join them; the segments span the node axis rather than a fixed 100.
    line_height = max(len(nodes)-1, 0)
    line_x, line_y = [], []
    for pt in pts:
        line_x += [pt]*line_height + [None]
        line_y += list(range(line_height)) + [None]
    fig.add_trace(
        go.Scatter(
            x=line_x,
            y=line_y,
            mode='lines',
            line_dash='dash',
            name=f"change point",
            marker_color='rgba(0,0,0,1)'
        )
    )
    fig.update_yaxes(ticktext=[], tickvals=[])
    # about ten ticks; at least 1 so fewer than ten years does not give a
    # zero slice step
    tick_step = max(1, len(years)//10)
    fig.update_xaxes(
        tickvals=list(range(len(years)))[0::tick_step],
        ticktext=years[0::tick_step]
    )
    fig.update_layout(
        template='plotly_white',
        title_text=topic,
        xaxis={'title': 'years'},
        yaxis={'title': 'nodes'},
        legend={'x': 0, 'y': 1}
    )
    fig.show()
#     fig.write_image(
#         os.path.join(path_fig, path_plot, 'community', f"{Cjrs}", f"{topic}.pdf")
#     )

## Leiden - cluster

### Runs

| Run |       ID      | Notes |
|:---:|:--------------|:------|
|  1  | 20200708_1221 | `C_jrs=0.01` |
|  2  | 20200709_1854 | Round year to 10s |
|  3  | 20200710_1348 | 1 & 2 |
|  4  | 20200715_2321 | `C_jrs=0.01`, null-target, networks in 20200708_1221 |
|  5  | 20200717_1405 | `C_jrs=0.01`, `n_iterations=4`, null-year, networks in 20200708_1221 |
|  6  | 20200718_1204 | `C_jrs=0.01`, `n_iterations=4`, null-year, networks not in 20200708_1221 |
|  7  | 20200718_1418 | `C_jrs=0.01`, `n_iterations=4`, null-target, 37 networks |
|  8  | 20200917_0008 | `C_jrs=0.02`, `n_iterations=-1`, real networks, cancelled, takes too long for some networks |
|  9  | 20200921_1106 | `C_jrs=0.02`, `n_iterations=2`, real networks |
|  10 | 20200921_1203 | same as `20200921_1106` for `Boolean algebra` |
|  11 | 20200921_1340 | same as `20200921_1106` for `linear algebra` |
|  12 | 20200921_1446 | same as `20200921_1106` for `law`, error `ValueError: std::exception` in `Optimiser.py` |
|  13 | 20200921_1557 | `C_jrs=0.01`, `n_iterations=2`, real networks, error for `law` |
|  14 | 20200921_1648 | `C_jrs=0.005`, `n_iterations=2`, real networks |
|  15 | 20200921_1902 | `C_jrs=0.001`, `n_iterations=10`, real networks |
|  16 | 20200921_2131 | `C_jrs=0.01`, `n_iterations=10`, real networks |
|  17 | 20200921_2348 | `C_jrs=0.02`, `n_iterations=10`, real networks |
|  18 | 20200922_1323 | `C_jrs=0.01`, `n_iterations=10`, jittered networks |

### Import data

In [None]:
import pickle
path_cluster = os.path.join(
    '/', 'Users', 'harangju', 'Developer', 'data', 'wiki', 'communities'
)

In [None]:
runs = [
#     '20200921_1106', '20200921_1203', '20200921_1340'
#     '20200921_1557'
#     '20200921_1902'
#     '20200921_2131'
#     '20200921_2348'
    '20200922_1323'
]

In [None]:
null = True
num_nulls = 1

In [None]:
# Result files of every run, keyed by run id.  Hidden entries such as
# ".DS_Store" are skipped so they are not parsed as topics or unpickled later;
# sorting makes the order independent of the filesystem.
filenames = {
    run: sorted(
        filename
        for filename in os.listdir(os.path.join(path_cluster, run))
        if not filename.startswith('.')
    )
    for run in runs
}
filenames[list(filenames.keys())[0]][0]

In [None]:
# Load the (membership, improvement) pair stored in every result file.  The
# topic key is the file name without its first "<prefix>_" part and without
# its extension.
# NOTE: pickle.load executes code embedded in the file; only load files that
# were produced by your own runs.
memberships = {}
improvements = {}
for run, fs in filenames.items():
    for fname in fs:
        topic = fname.split('_', 1)[1].split('.')[0]
        # context manager closes the file even if unpickling fails
        with open(os.path.join(path_cluster, run, fname), 'rb') as f:
            memberships[topic], improvements[topic] = pickle.load(f)

### Missing

In [None]:
# Topics that have at least one result file.  Kept in its own name: assigning
# this to `runs` overwrote the list of run ids used by the cells above.
found_topics = {
    name.split('_')[1].split('.')[0]
    for names in filenames.values()
    for name in names
    if not name.startswith('.')
}
missing_topics = set(topics) - found_topics
{t: topics.index(t) for t in missing_topics}

### Modularity

In [None]:
g = networkx_to_igraph(networks['earth science'].graph)
partition = la.find_partition(g, la.ModularityVertexPartition)
layout = g.layout('circle')
ig.plot(
    partition, layout=layout, bbox=(200, 200), margin=20,
    vertex_size=5, vertex_label_size=10,
    edge_width=0.1, edge_curved=True
)

In [None]:
optimiser = la.Optimiser()
diff = optimiser.optimise_partition(la.ModularityVertexPartition(g))
diff

In [None]:
# One result list per loaded network.  Keys come from `networks` because the
# next cell iterates over networks.items(), and `topics` is overwritten with a
# shorter list earlier in the notebook.
modularity = {topic: [] for topic in networks}
optimiser = la.Optimiser()

In [None]:
# For every topic, optimise a fresh modularity partition on each cumulative
# snapshot (all nodes with year <= Y, for each distinct year Y) and append the
# list of returned values to modularity[topic].
# NOTE(review): the stored value is whatever optimise_partition returns; the
# leidenalg docs describe that as the improvement in quality, not the final
# modularity of the partition -- confirm this is the intended quantity.
for topic, net in networks.items():
    modularity[topic].append(
        [
            optimiser.optimise_partition(
                la.ModularityVertexPartition(
                    # snapshot of the network up to and including `year`
                    networkx_to_igraph(nx.subgraph(
                        net.graph, [n for n in net.graph.nodes if net.graph.nodes[n]['year']<=year]
                    ))
                )
            )
            for year in sorted(set(nx.get_node_attributes(net.graph, 'year').values()))
        ]
    )

In [None]:
# One line per topic (and per stored run) of the values collected above.
fig = go.Figure()
for topic, mods in modularity.items():
    for mod in mods:
        fig.add_trace(
            go.Scatter(
                x=np.arange(len(mod)), y=mod,
                name=topic
            )
        )
fig.update_layout(
    height=300,
    template='plotly_white',
    # The figure holds every topic, so it gets a fixed title instead of the
    # last value left in the loop variable.
    title_text='modularity'
)
fig.show()

### Plot

In [None]:
from rpy2 import robjects
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector

In [None]:
import rpy2.robjects.packages as rpackages
from rpy2.robjects.vectors import StrVector

if not rpackages.isinstalled('changepoint'):
    utils = rpackages.importr('utils')
    utils.chooseCRANmirror(ind=1)
    utils.install_packages(StrVector(['changepoint']))

In [None]:
cpt = importr('changepoint')
cpt

In [None]:
import IPython

Cjrs = 0.01
path_plot = '5 modules'

In [None]:
num_changes = {}
pts = {}
means = {}

In [None]:
# For every loaded partition: (row 1) node-by-year community membership,
# (row 2) number of membership changes per year with detected change points
# and the mean of each segment.  Each figure is saved as a PDF.
path_out = os.path.join(path_fig, path_plot, 'community', f"{Cjrs}_n10_jittered")
os.makedirs(path_out, exist_ok=True)  # write_image does not create folders
for topic in memberships.keys():
    fig = ps.make_subplots(
        rows=2, cols=1,
        row_heights=[.6, .4]
    )
    # get graph data
    if null:
        # null keys are "<topic>_<replicate index>"
        null_topic, null_iter = topic.rsplit('_', 1)
        graph = null_jitter[null_topic][int(null_iter)].graph
    else:
        graph = networks[topic].graph
    nodes = list(graph.nodes)
    sorted_nodes = sorted(
        nodes,
        key=lambda node: graph.nodes[node]['year']
    )
    sorted_position = {n: i for i, n in enumerate(sorted_nodes)}
    years = sorted(set(nx.get_node_attributes(graph, 'year').values()))
    # NOTE(review): assumes memberships[topic][i][j] refers to the j-th node of
    # this list, i.e. that the run producing the file ordered vertices the
    # same way -- confirm against the script that wrote the pickles.
    nodes_by_year = [
        [n for n in nodes if graph.nodes[n]['year']<=year]
        for year in years
    ]
    # plot partitions
    partitions = sorted(list(set([j for i in memberships[topic] for j in i])))
    for part in partitions:
        xy = [
            (i, sorted_position[n])
            for i, ns in enumerate(nodes_by_year)
            for j, n in enumerate(ns)
            if memberships[topic][i][j]==part
        ]
        fig.add_trace(
            go.Scatter(
                x=[i[0] for i in xy],
                y=[i[1] for i in xy],
                mode='markers',
                marker={'size': 2},
                name=f"community {part}", showlegend=False,
                hovertext=[sorted_nodes[i[1]] for i in xy]
            ),
            row=1, col=1
        )
    # plot number of changes
    num_changes[topic] = calc_num_changes(
        sorted_nodes, nodes_by_year, memberships[topic]
    )
    fig.add_trace(
        go.Scatter(
            x=np.arange(len(years)),
            y=num_changes[topic],
            name='number of changes', showlegend=False,
            marker_color='black',
            opacity=0.5
        ),
        row=2, col=1
    )
    # plot change point
    pts[topic] = cpt.cpts(
        cpt.cpt_meanvar(
            FloatVector(num_changes[topic]),
            test_stat='Poisson',
#             method='PELT',
            method='BinSeg', Q=3
        )
    )
    # One dashed vertical segment per change point, shifted by half a step and
    # separated by None so plotly does not connect consecutive segments.
    line_height = max(len(nodes)-1, 0)
    line_x, line_y = [], []
    for pt in pts[topic]:
        line_x += [pt-.5]*line_height + [None]
        line_y += list(range(line_height)) + [None]
    for row in [1, 2]:
        fig.add_trace(
            go.Scatter(
                x=line_x,
                y=line_y,
                mode='lines',
                line_dash='dash',
                name=f"change point", showlegend=False,
                marker_color='rgba(0,0,0,0.5)'
            ),
            row=row, col=1
        )
    # means of num_changes between consecutive change points
    boundaries = [0] + [int(pt) for pt in pts[topic]] + [num_changes[topic].size]
    segments = list(zip(boundaries[:-1], boundaries[1:]))
    means[topic] = [
        np.mean(num_changes[topic][start:stop]) for start, stop in segments
    ] # TODO: which segment is the peak?
    fig.add_trace(
        go.Scatter(
            x=np.concatenate([np.arange(start, stop) for start, stop in segments]),
            y=np.concatenate([
                mean * np.ones(stop-start)
                for mean, (start, stop) in zip(means[topic], segments)
            ]),
            mode='lines',
            name=f"means", showlegend=False,
            line={'color': 'green'},
            opacity=0.5
        ),
        row=2, col=1
    )
    # figure layout (the figure has two rows; there is no modularity row)
    fig.update_yaxes(
        title_text='nodes', ticktext=[], tickvals=[],
        row=1, col=1
    )
    fig.update_yaxes(
        title_text='Number of changes', 
        range=[0, 2*np.max(num_changes[topic]).astype(int)], 
        row=2, col=1
    )
    # about ten ticks; at least 1 so fewer than ten years does not give a
    # zero slice step
    tick_step = max(1, len(years)//10)
    for row in [1, 2]:
        fig.update_xaxes(
            tickvals=list(range(len(years)))[0::tick_step],
            ticktext=years[0::tick_step],
            range=[0, len(years)],
            row=row, col=1
        )
    fig.update_xaxes(title_text='years', row=2, col=1)
    fig.update_layout(
        template='plotly_white',
        title_text=topic
    )
    fig.show()
    fig.write_image(os.path.join(path_out, f"{topic}.pdf"))
    IPython.display.clear_output(wait=True)
IPython.display.clear_output()

### vs PELT

In [None]:
import plotly.express as px

# Histogram of the number of entries stored in `pts` for each topic
# (presumably the detected change points -- confirm against where `pts` is built).
num_pts_per_topic = [len(topic_pts) for topic_pts in pts.values()]
fig = px.histogram(num_pts_per_topic)
fig.update_layout(
    template='plotly_white', showlegend=False, width=360, height=360
)
fig.show()
# Written alongside the other figures of this plot family.
path_pelt_hist = os.path.join(path_fig, path_plot, 'PELT_Q_dstr.pdf')
fig.write_image(path_pelt_hist)

### Cavities & controllability

In [None]:
# One barcode table per topic, each tagged with a `topic` column, stacked
# into a single frame with a fresh integer index.
tagged_barcodes = [
    net.barcodes.assign(topic=name)
    for name, net in networks.items()
]
barcodes = pd.concat(tagged_barcodes, sort=False, ignore_index=True)
# Keep only the rows whose lifetime is non-zero.
has_lifetime = barcodes['lifetime'] != 0
barcodes = barcodes.loc[has_lifetime]
barcodes

In [None]:
import pickle
path_save = os.path.join('/','Users','harangju','Developer','data','wiki','analysis')
# Read the pickled `grams` object; the context manager closes the file
# handle as soon as loading finishes (a bare `open(...)` leaves it dangling).
# NOTE: only unpickle files you produced yourself -- pickle can execute code.
with open(os.path.join(path_save, 'grams.pickle'), 'rb') as file_grams:
    grams = pickle.load(file_grams)

In [None]:
# Display the object loaded from grams.pickle for inspection.
grams

In [None]:
import pickle
path_analysis = os.path.join('/','Users','harangju','Developer','data','wiki','analysis')
# Read the pickled cavity-participation object; the context manager closes
# the file handle as soon as loading finishes.
# NOTE: only unpickle files you produced yourself -- pickle can execute code.
with open(os.path.join(path_analysis, 'cavity_participation.pickle'), 'rb') as file_cavity:
    cavity_participation = pickle.load(file_cavity)
cavity_participation

In [None]:
# Working variables for a single example topic.
topic = 'biophysics'
graph = networks[topic].graph
nodes = list(graph.nodes)
# Distinct values of the 'year' node attribute, ascending.
years = sorted(set(nx.get_node_attributes(graph, 'year').values()))
# For every year, the nodes whose 'year' is at or before it (cumulative).
nodes_by_year = [
    list(filter(lambda node: graph.nodes[node]['year'] <= cutoff, nodes))
    for cutoff in years
]

In [None]:
# Report the topics that have fewer than three entries in `pts`.
for topic in memberships.keys():
    topic_pts = pts[topic]
    if len(topic_pts) >= 3:
        continue
    print(topic, topic_pts)

In [None]:
# Nodes of the current `graph` whose 'year' equals one of the years indexed
# by the inclusive window pts[topic][1] .. pts[topic][2].
{
    node
    for step in range(int(pts[topic][1]), int(pts[topic][2]+1))
    for node in nodes
    if graph.nodes[node]['year'] == years[step]
}

In [None]:
def _group_values(frame, column, nodes_in, nodes_out):
    """Return `column` of `frame` for two node groups as a pair of arrays.

    The groups are converted to lists before indexing because pandas >= 2.0
    raises a TypeError when a `set` is passed to `.loc`.
    """
    return (
        frame.loc[list(nodes_in)][column].values,
        frame.loc[list(nodes_out)][column].values,
    )


# For every topic, compare the nodes dated inside the window
# pts[topic][1] .. pts[topic][2] (inclusive) with all remaining nodes, using
# a two-sample KS test and a t-test on three per-node quantities.
for topic in memberships.keys():
    graph = networks[topic].graph
    year_of = nx.get_node_attributes(graph, 'year')
    years = sorted(set(year_of.values()))
    nodes = set(graph.nodes)
    if len(pts[topic]) < 3:
        # The window needs pts[topic][1] and pts[topic][2]; skip otherwise.
        print(topic)
        continue
    # Resolve the window to its set of years once instead of rescanning
    # every node for every step of the window.
    years_spike = {
        years[pt] for pt in range(int(pts[topic][1]), int(pts[topic][2]+1))
    }
    nodes_spike = {n for n, year in year_of.items() if year in years_spike}
    nodes_diff = nodes - nodes_spike

    values_cntr = _group_values(grams, 'CG_5', nodes_spike, nodes_diff)
    values_birth = _group_values(
        cavity_participation, 'birth simplex', nodes_spike, nodes_diff
    )
    values_death = _group_values(
        cavity_participation, 'death simplex', nodes_spike, nodes_diff
    )

    ks_cntr, p_ks_cntr = sp.stats.ks_2samp(
        *values_cntr, alternative='two-sided'
    )
    ks_cavity_birth, p_ks_cavity_birth = sp.stats.ks_2samp(
        *values_birth, alternative='two-sided'
    )
    ks_cavity_death, p_ks_cavity_death = sp.stats.ks_2samp(
        *values_death, alternative='two-sided'
    )
    t_cntr, p_t_cntr = sp.stats.ttest_ind(*values_cntr, equal_var=True)
    t_cavity_birth, p_t_cavity_birth = sp.stats.ttest_ind(
        *values_birth, equal_var=True
    )
    t_cavity_death, p_t_cavity_death = sp.stats.ttest_ind(
        *values_death, equal_var=True
    )

    print(topic, '\n\t', 'ks')
    print('\t', 'cntr', ks_cntr, p_ks_cntr)
    print('\t', 'birth', ks_cavity_birth, p_ks_cavity_birth)
    print('\t', 'death', ks_cavity_death, p_ks_cavity_death)
    print(topic, '\n\t', 't')
    print('\t', 'cntr', t_cntr, p_t_cntr)
    print('\t', 'birth', t_cavity_birth, p_t_cavity_birth)
    print('\t', 'death', t_cavity_death, p_t_cavity_death)

### Plot shifts

In [None]:
# Display `num_changes` (iterated per topic in the next cell) for inspection.
num_changes

In [None]:
from scipy.stats import expon

# For every topic: histogram of the per-step number of changes, normalised
# by the cumulative node count at that step, overlaid with a fitted
# exponential density.
for topic, changes in num_changes.items():
    graph = networks[topic].graph
    nodes = list(graph.nodes)
    years = sorted(set(nx.get_node_attributes(graph, 'year').values()))
    # Nodes whose 'year' is at or before each year (cumulative membership).
    nodes_by_year = [
        [n for n in nodes if graph.nodes[n]['year']<=year]
        for year in years
    ]
    # Changes per node present at each step.
    norm_changes = changes / np.array([len(ns) for ns in nodes_by_year])
    hist, bin_edges = np.histogram(norm_changes, density=True, bins=20)
    bins = 0.5 * (bin_edges[:-1] + bin_edges[1:])
    fig = px.bar(x=bins, y=hist)
    # NOTE(review): the curve is only evaluated from x=0.5 upward in steps of
    # 0.1 -- confirm this grid suits the range of the normalised values.
    x = np.arange(.5, np.max(bin_edges), .1)
    # `expon.fit` returns (loc, scale) with scale = 1/rate, so the density
    # must be evaluated through `expon.pdf`, not as scale*exp(-scale*x).
    l = expon.fit(norm_changes)
    loc, scale = l
    fig.add_trace(
        go.Scatter(
            x=x, y=expon.pdf(x, loc=loc, scale=scale),
            mode='lines', opacity=0.5
        )
    )
    fig.update_layout(
        width=600, height=300,
        template='plotly_white',
        title_text=topic, showlegend=False,
        xaxis={'title': 'number of changes'},
        yaxis={'title': 'density'}
    )
    fig.show()
    fig.write_image(
        os.path.join(path_fig, path_plot, 'community', f"{Cjrs}", f"distribution_{topic}.jpg")
    )

### Distributions

In [None]:
# Snapshot of the keys of `memberships` as a list, in iteration order.
topics_memberships = [key for key in memberships.keys()]

In [None]:
def _segment_lengths(name):
    """Return the gaps between consecutive boundaries 0, pts[name]..., #years.

    `#years` is the number of distinct 'year' values in the topic's graph.
    """
    num_years = len(
        set(nx.get_node_attributes(networks[name].graph, 'year').values())
    )
    boundaries = [0] + list(pts[name]) + [num_years]
    return np.diff(boundaries)


# Segment lengths (in steps) pooled over all topics.
all_durs = [
    length
    for name in topics_memberships
    for length in _segment_lengths(name)
]
# 20-bin histogram; `bins_durs` holds the bin centres, not the edges.
hist_durs, edges_durs = np.histogram(all_durs, bins=20)
bins_durs = (edges_durs[:-1] + edges_durs[1:]) / 2
bins_durs, hist_durs

In [None]:
# Ascending list of the distinct node years of every topic. The sort is
# required: this list is indexed by position further down (`[0]`, `[-1]` and
# change-point indices), and a bare `set` has no guaranteed order.
years = {
    topic: sorted(set(nx.get_node_attributes(networks[topic].graph, 'year').values()))
    for topic in topics_memberships
}

In [None]:
# Segment durations measured in calendar years, pooled over all topics.
# Boundaries are the first year, the year at every entry of `pts[topic]`,
# and the last year of the topic.
all_durs_year = [
    # abs() keeps durations non-negative even if `years[topic]` is not in
    # ascending order.
    abs(diff)
    for topic in topics_memberships
    for diff in np.diff(
        [years[topic][0]]
        # Entries of `pts` are cast with int() everywhere else they are used
        # as positions; a float here would raise TypeError on list indexing.
        + [years[topic][int(y)] for y in pts[topic]]
        + [years[topic][-1]]
    )
]
# 20-bin histogram; `bins_durs_year` holds the bin centres, not the edges.
hist_durs_year, bins_durs_year = np.histogram(all_durs_year, bins=20)
bins_durs_year = 0.5 * (bins_durs_year[:-1] + bins_durs_year[1:])
bins_durs_year, hist_durs_year

In [None]:
# Every per-segment mean of every topic, flattened into one list.
all_means = [
    segment_mean
    for name in topics_memberships
    for segment_mean in means[name]
]
# 20-bin histogram; `bins_means` holds the bin centres, not the edges.
hist_means, edges_means = np.histogram(all_means, bins=20)
bins_means = (edges_means[:-1] + edges_means[1:]) / 2
bins_means, hist_means

In [None]:
fig = ps.make_subplots(
    rows=2, cols=3
)

# Top row: the pre-binned histograms (bin centres vs. counts). They are drawn
# as bars; a Box trace would treat each bin as its own one-sample group.
histogram_panels = [
    (bins_durs, hist_durs, 'durations (steps)'),
    (bins_means, hist_means, 'means'),
    (bins_durs_year, hist_durs_year, 'durations (years)'),
]
for panel_col, (panel_bins, panel_counts, panel_label) in enumerate(histogram_panels, start=1):
    fig.add_trace(
        go.Bar(x=panel_bins, y=panel_counts),
        row=1, col=panel_col
    )
    fig.update_xaxes(title_text=panel_label, row=1, col=panel_col)
    fig.update_yaxes(title_text='counts', row=1, col=panel_col)

# Bottom row: per-segment mean against segment duration. Each point is
# labelled on hover with its topic (one label per entry of `means[topic]`).
hover_topics = [
    topic for topic in topics_memberships for _ in range(len(means[topic]))
]
scatter_panels = [
    (all_durs, 'durations (steps)'),
    (all_durs_year, 'durations (years)'),
]
for panel_col, (panel_durations, panel_label) in enumerate(scatter_panels, start=1):
    fig.add_trace(
        go.Scatter(
            x=panel_durations, y=all_means, mode='markers', marker_size=3,
            hovertext=hover_topics
        ),
        row=2, col=panel_col
    )
    fig.update_xaxes(title_text=panel_label, row=2, col=panel_col)
    fig.update_yaxes(title_text='means', row=2, col=panel_col)

fig.update_layout(
    height=800,
    template='plotly_white',
    title_text='Paradigms',
    showlegend=False
)
fig.show()

### Signature

In [None]:
# Snapshot of the keys of `memberships` as a list, in iteration order.
topics_memberships = [key for key in memberships.keys()]

In [None]:
# Plotly's qualitative palette without the leading character of each entry
# (the '#' of a '#RRGGBB' string).
colors_hex = [code[1:] for code in px.colors.qualitative.Plotly]
# Each colour as [R, G, B] integers, parsed from consecutive hex digit pairs.
colors_rgb = [
    [int(code[start:start + 2], 16) for start in range(0, 6, 2)]
    for code in colors_hex
]
colors_rgb

In [None]:
# Choose where the graph for a key of `topics_memberships` comes from.
if null:
    def _graph_for(key):
        """Graph of a null model; `key` is split on '_' into name and index."""
        parts = key.split('_')
        return null_jitter[parts[0]][int(parts[1])].graph
else:
    def _graph_for(key):
        """Graph of the real network stored under `key`."""
        return networks[key].graph

# Ascending distinct 'year' values of every graph.
years = {
    key: sorted(set(nx.get_node_attributes(_graph_for(key), 'year').values()))
    for key in topics_memberships
}

In [None]:
# BigSeg, Q=3
# Q entries of `pts` split every series into Q+1 epochs. Shorter series are
# left-padded so that every row of the arrays below has `epochs` columns.
Q = 3
epochs = Q+1
fig = ps.make_subplots(
    rows=1, cols=3
)
# Per-epoch means, zero-padded on the left. list(m) makes the `+` a
# concatenation even if `m` is an ndarray rather than a list.
means_array = np.array([
    [0]*(epochs-len(m)) + list(m)
    for t, m in means.items()
])
# Epoch durations in steps: gaps between 0-padding, the entries of `pts`,
# and the last step index.
pts_array = np.array([
    np.diff(
        [0]*(epochs-len(pt)) + [int(p) for p in pt] + [len(years[t])-1]
    )
    for t, pt in pts.items()
])
# Epoch durations in years: same boundaries, mapped through `years[t]`.
years_array = np.array([
    np.diff(
        [years[t][0]]*(epochs-len(pt)) + [years[t][int(p)] for p in pt] + [years[t][-1]]
    )
    for t, pt in pts.items()
])

epoch_axis = 1+np.arange(epochs)
# Row labels are taken from the dict each array was built from, so hover
# text always matches the plotted row.
signature_panels = [
    (means_array, list(means.keys())),
    (pts_array, list(pts.keys())),
    (years_array, list(pts.keys())),
]
for panel_col, (panel_values, panel_labels) in enumerate(signature_panels, start=1):
    # One semi-transparent line per row, cycling through the palette.
    for i in range(panel_values.shape[0]):
        red, green, blue = colors_rgb[i % len(colors_rgb)]
        fig.add_trace(
            go.Scatter(
                x=epoch_axis,
                y=panel_values[i,:],
                mode='lines',
                hovertext=panel_labels[i],
                line={'color': f"rgba({red},{green},{blue},0.5)"}
            ),
            row=1, col=panel_col
        )
    # Opaque black line: the column-wise average over all rows.
    fig.add_trace(
        go.Scatter(
            x=epoch_axis,
            y=panel_values.mean(axis=0),
            mode='lines',
            line={'color': 'rgba(0,0,0,1)'}
        ),
        row=1, col=panel_col
    )

fig.update_xaxes(title_text='epoch', range=[.5, Q+1.5])
fig.update_yaxes(title_text='avg # changes', row=1, col=1)
fig.update_yaxes(title_text='duration (steps)', row=1, col=2)
fig.update_yaxes(title_text='duration (years)', row=1, col=3)
fig.update_layout(
    height=400,
    template='plotly_white',
    title_text='Signature',
    showlegend=False
)
fig.show()
fig.write_image(
    os.path.join(path_fig, path_plot, 'community', f"{Cjrs}_n10_jittered", "signature.pdf")
)

In [None]:
# Rows of `means_array` whose second column exceeds the third, by name.
second_above_third = [
    topics_memberships[row]
    for row, epoch_means in enumerate(means_array)
    if epoch_means[1] > epoch_means[2]
]
# Count, divided by the number of topics and then by 10.
len(second_above_third) / len(topics) / 10

## Leiden vs Nobel/IR

## Leiden vs Fields