In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import multiprocessing
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/gatto/ds-network-analysis/08ca1053ffb0bf491025726e7264e1d65d99303a/py/net_building/df_final_nodes.csv", index_col=0 )

In [3]:
df

Unnamed: 0,node,category
0,290883,pax
1,1471971,proukr
2,2052911,pax
3,2827221,dontcare
4,3688111,proukr
...,...,...
10179,1502695274241826818,pax
10180,1502720034971521026,proukr
10181,1502799611379036163,prorus
10182,1502964238033297408,proukr


In [4]:
df_sample = df[0:10]
# for real data do: df_sample = df.sample(frac = 0.1, axis = "index") # frac is the fraction of rows nodes to start with
df_sample

Unnamed: 0,node,category
0,290883,pax
1,1471971,proukr
2,2052911,pax
3,2827221,dontcare
4,3688111,proukr
5,4254951,proukr
6,4615341,proukr
7,4937571,pax
8,5404582,pax
9,5516892,proukr


In [5]:
df_sample_set = set(df_sample["node"])
df_sample_set

{290883,
 1471971,
 2052911,
 2827221,
 3688111,
 4254951,
 4615341,
 4937571,
 5404582,
 5516892}

In [6]:
links = pd.read_csv("https://raw.githubusercontent.com/gatto/ds-network-analysis/main/py/net_building/df_final_edges.csv",index_col=0)

In [7]:
# linkage step: now i need to only take the links that are from or to a node included in the sample.
# these links are the "chosen links"
# second step is take the nodes that "chosen links" point to or come from (same thing) and add these nodes to the set
# end.
# if i then repeat the "linkage step", i would get gradually a bigger net until i eventually get all the net.
# therefore I just end where I end (after only one "linkage step" and one addition of "chosen links")
# mindful that this subnet as composed does not include all links originating from nodes in the subnet.
# therefore I'm not sure if this approach is sound or not. Can't think of a better one, though.
links_sample = links[0:10]
links_sample

Unnamed: 0,source,target
0,290883,2956836471
1,1471971,1205226529455632385
2,2052911,1401015246718914560
3,2052911,1353366187791560704
4,2052911,2202063265
5,2052911,1317817827970134025
6,2052911,316797199
7,2827221,4018888580
8,3688111,1499295169870319616
9,4254951,282076470


In [8]:
# for testing purposes only I want to add at least a node that has 2 links in links_sample
print(links_sample.iloc[9])
links_sample = links_sample.append(pd.Series({"source":6969, "target":290883}), ignore_index=True)
links_sample

source      4254951
target    282076470
Name: 9, dtype: int64


Unnamed: 0,source,target
0,290883,2956836471
1,1471971,1205226529455632385
2,2052911,1401015246718914560
3,2052911,1353366187791560704
4,2052911,2202063265
5,2052911,1317817827970134025
6,2052911,316797199
7,2827221,4018888580
8,3688111,1499295169870319616
9,4254951,282076470


In [9]:
g = nx.Graph()
g.add_nodes_from(df_sample_set)
g.nodes

NodeView((290883, 1471971, 4937571, 5404582, 4254951, 4615341, 3688111, 2052911, 2827221, 5516892))

In [10]:
for node in g.nodes:
    print(f"Working on {node}")

    one = links_sample[links_sample["source"] == node]
    two = links_sample[links_sample["target"] == node]
    subset_of_links = pd.concat((one, two))
    if len(subset_of_links) > 1:
        print(f"We found {len(subset_of_links)} links")

    if not subset_of_links.empty:
        new_graph = nx.from_pandas_edgelist(subset_of_links)
        g = nx.compose(g, new_graph)

Working on 290883
We found 2 links
Working on 1471971
Working on 4937571
Working on 5404582
Working on 4254951
Working on 4615341
Working on 3688111
Working on 2052911
We found 5 links
Working on 2827221
Working on 5516892


In [11]:
g.nodes

NodeView((290883, 1471971, 4937571, 5404582, 4254951, 4615341, 3688111, 2052911, 2827221, 5516892, 2956836471, 6969, 1205226529455632385, 282076470, 1499295169870319616, 1401015246718914560, 1353366187791560704, 2202063265, 1317817827970134025, 316797199, 4018888580))

In [12]:
g.edges

EdgeView([(290883, 2956836471), (290883, 6969), (1471971, 1205226529455632385), (4254951, 282076470), (3688111, 1499295169870319616), (2052911, 1401015246718914560), (2052911, 1353366187791560704), (2052911, 2202063265), (2052911, 1317817827970134025), (2052911, 316797199), (2827221, 4018888580)])

In [13]:
len(g.nodes)

21

In [14]:
len(df_sample)

10