In [84]:
import pandas as pd
import numpy as np
import urllib.parse

encodedStr = 'Hell%C3%B6%20W%C3%B6rld%40Python'
urllib.parse.unquote(encodedStr)

# Graph is defined in file `links.tsv`
df = (pd.read_csv("wikispeedia/wikispeedia_paths-and-graph/links.tsv", 
                  skiprows=12, 
                  sep="\t", 
                  header=None)
      .applymap(urllib.parse.unquote)
      .applymap(str.strip)
      .rename(columns={0:"out",
                       1:"in"})
     )

df.sample(5)

Unnamed: 0,out,in
88280,Prague,Cyprus
56741,Isospin,Symmetry
3509,Africa,Indian_Ocean
70409,Marxism,World_War_II
106216,Thallium,Neutron


In [76]:
# Describe df

print(f"There are {df.shape[0]} edges")
print(f"connecting {df.stack().nunique()} unique nodes in the network.")
print(f"\n\t * {df['out'].nunique()} nodes have out-links.")
print(f"\t * {df['in'].nunique()} nodes have in-links.\n")

print(f"This means {df.stack().nunique() - df['out'].nunique()} nodes are dead-ends (no out-links),")
print(f"and {df.stack().nunique() - df['in'].nunique()} \
nodes can never be reached from any other article (no in-links) ☠️")

There are 119882 edges
connecting 4592 unique nodes in the network.

	 * 4587 nodes have out-links.
	 * 4135 nodes have in-links.

This means 5 nodes are dead-ends (no out-links),
and 457 nodes can never be reached from any other article (no in-links) ☠️


In [82]:
# nodes with no outlink (dead-ends)
nodes_no_outlink = (set(df.stack()
                        .unique())
                    .difference(df['out']
                                .unique()))
print(len(nodes_no_outlink))
nodes_no_outlink

5


{'Directdebit',
 'Duchenne_muscular_dystrophy',
 "Klinefelter's_syndrome",
 'Local_community',
 'Osteomalacia'}

In [83]:
# Let's see which articles can't be reached from any other
nodes_no_inlink = list((set(df.stack()
                            .unique())
                        .difference(df['in']
                                    .unique())))
print(len(nodes_no_inlink))

nodes_no_inlink[:5]

457


['International_English',
 'Opuntia_imbricata',
 'Common_Basilisk',
 'Control_car_(rail)',
 'Greater_Cane_Rat']

# TODO

- Distribution of in-links and out-links per node
- Test hypothesis: visit frequency correlates with node in-degree.