# Network graph with Impresso Py

<a target="_blank" href="https://colab.research.google.com/github/impresso/impresso-datalab-notebooks/blob/main/4-impresso-py/network_graph.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

## Install dependencies

In [None]:
# Install the Impresso Python client straight from its GitHub repository, plus ipysigma
# for the graph widget used at the end of the notebook.
# NOTE(review): networkx and pandas are imported further down but not listed here —
# presumably they are preinstalled (Colab) or pulled in transitively; confirm.
%pip install git+https://github.com/impresso/impresso-py.git ipysigma

## Connect to Impresso

In [None]:
from impresso import connect, OR, AND

# Create the API client used by every request below. It targets the *dev* public API
# endpoint given in the URL.
# NOTE(review): connect() may ask for an access token interactively — confirm.
impresso = connect(public_api_url="https://dev.impresso-project.ch/public-api")

## Part 1: Get entities and their co-occurrences

Find all persons mentioned in all articles that talk about the [Prague Spring](https://en.wikipedia.org/wiki/Prague_Spring).

In [None]:
# Search term combining the English, German and French names of the event with OR
# (presumably an article matches if it contains any one of them — confirm).
query = OR("Prague Spring", "Prager Frühling", "Printemps de Prague")

In [None]:
# Facet the search results for `query` by person, requesting at most 100 entries
# ordered by "-count" (presumably descending mention count — confirm against the API docs).
persons = impresso.search.facet(
  facet="person",
  q=query,
  order_by="-count",
  limit=100
)
persons

Get all pairwise combinations of the persons whose mention count is higher than the threshold `n`.

In [None]:
import itertools

# Mention-count threshold: only persons with strictly more mentions are kept.
n = 10

# Filter the facet results down to the frequently mentioned persons.
df = persons.df.loc[lambda frame: frame["count"] > n]
persons_ids = list(df["uid"])
print(f"Total persons selected: {len(persons_ids)}")

# Every unordered pair of selected persons is a candidate edge of the graph.
persons_ids_combinations = [pair for pair in itertools.combinations(persons_ids, 2)]
print(f"Total combinations: {len(persons_ids_combinations)}")

In [None]:
# Safety stop: one request is sent per pair, so refuse to continue when the
# number of pairs is large enough to risk throttling.
if len(persons_ids_combinations) > 500:
  msg = " ".join([
      f"The number of combinations is quite high ({len(persons_ids_combinations)}).",
      "This may put a lot of load on Impresso and your requests may be throttled.",
      "Try to increase the threshold number of mentions in the cell above which will reduce the number of selected persons.",
      "You can also disable this error by commenting out this cell, if this number of combinations is expected.",
  ])
  raise Exception(msg)

Get the timestamps and counts of all articles in which both persons of a pair appear together.

In [None]:
from impresso.util.error import ImpressoError
from time import sleep

# Collect, for every pair of persons, the date buckets in which both are mentioned
# in articles matching `query`. Each entry of `connections` is
# (pair, [(timestamp, count, url), ...]).
connections = []

# Throttling (HTTP 429) is handled by retrying the same pair a limited number of
# times with an increasing pause between attempts.
max_attempts = 5
retry_delay_seconds = 2
total_combinations = len(persons_ids_combinations)

for idx, combo in enumerate(persons_ids_combinations):
  # Reset per pair so that a failed request can never reuse the previous pair's result.
  result = None

  for attempt in range(1, max_attempts + 1):
    try:
      result = impresso.search.facet(
        facet="daterange",
        q=query,
        entity_id=AND(*combo),
        limit=1000
      )
      break
    except ImpressoError as e:
      # Only throttling is recoverable here; surface every other API error.
      if e.error.status != 429:
        raise
      print(f"Sleeping because of {e}")
      sleep(retry_delay_seconds * attempt)

  if result is None:
    print(f"Skipping {combo}: still throttled after {max_attempts} attempts")
  elif result.size > 0:
    dates_df = result.df

    # One (timestamp, count, url) triple per date bucket; the url is the same for all of them.
    items = [
      (ts, count, result.url)
      for ts, count in zip(dates_df.index.tolist(), dates_df['count'].tolist())
    ]
    connections.append((combo, items))

  if idx % 10 == 0:
    print(f"{idx} done out of {total_combinations}")

Put them all into a dataframe.

In [None]:
import pandas as pd

# Flatten (pair, [(timestamp, count, url), ...]) into one row per pair and timestamp.
connections_denormalised = []
for (node_a, node_b), edges in connections:
  for ts, count, url in edges:
    connections_denormalised.append([node_a, node_b, ts, count, url])

connections_df = pd.DataFrame(
  connections_denormalised,
  columns=('node_a', 'node_b', 'timestamp', 'count', 'url'),
)
connections_df

In [None]:
# Persist the edge list so Part 2 can be run without repeating the API requests.
# index=False: the positional index carries no information and would otherwise be
# read back as a spurious "Unnamed: 0" column.
connections_df.to_csv("connections.csv", index=False)

## Part 2: visualise

In [None]:
import pandas as pd

# Reload the edge list from "connections.csv" so that this part can start from the
# saved file instead of the in-memory results of Part 1.
connections_df = pd.read_csv("connections.csv")
connections_df

In [None]:
# Collapse the per-timestamp rows into one row per pair of persons:
#   timestamp -> comma-separated list of all timestamps (cast to str so that values
#                parsed as numbers or dates by read_csv do not break the join)
#   count     -> total number of co-occurrences
#   url       -> first url of the group (deterministic, unlike indexing into a set)
grouped_connections_df = connections_df.groupby(['node_a', 'node_b']) \
    .agg({'timestamp': lambda x: ', '.join(map(str, x)), 'count': 'sum', 'url': 'first'}) \
    .reset_index()
grouped_connections_df

In [None]:
import networkx as nx

# Build the graph: one edge per pair of persons, carrying the total count and the url.
edge_attributes = ['count', 'url']
G = nx.from_pandas_edgelist(
    grouped_connections_df,
    source='node_a',
    target='node_b',
    edge_attr=edge_attributes,
    create_using=nx.MultiGraph(),
)

# Attach to every node a link built from its entity id.
for entity_id in sorted(G.nodes()):
    G.nodes[entity_id].update(url=f"https://impresso-project.ch/app/entities/{entity_id}")
G.nodes

In [None]:
# Ask for a base name for the exported graph. Spaces become underscores and the
# GEXF extension is appended (".gexf" — the previous ".gefx" was a typo that other
# tools would not recognise). An empty answer falls back to a default name instead
# of producing a hidden file consisting of the extension only.
filename = input("Enter the filename: ").strip() or "network_graph"
filename = f"{filename.replace(' ', '_')}.gexf"

In [None]:
# Write the graph to disk in GEXF format under the chosen filename.
nx.write_gexf(G, filename)

If running in Colab - activate custom widgets to allow Sigma to render the graph.

In [None]:
# Colab only renders third-party widgets once its custom widget manager is enabled.
# Outside Colab the google.colab package cannot be imported, so the step is skipped;
# any other failure is allowed to surface instead of being silently swallowed.
try:
    from google.colab import output
except ImportError:
    pass
else:
    output.enable_custom_widget_manager()

Render the graph.

In [None]:
import networkx as nx
from ipysigma import Sigma

# Load the graph back from the GEXF file written earlier
g = nx.read_gexf(filename)

# Display the graph with node size mapped on degree and edge size mapped on
# the 'count' edge attribute; edges are clickable.
Sigma(g, node_size=g.degree, edge_size='count', clickable_edges=True)