# ADM Homework 5 - Group 33
Flaminia Matteucci, Francesca Porcu, Michele Fortunato, Pol Ribó León  

## Homework 5 - Visit the Wikipedia hyperlinks graph!

In this assignment we perform an analysis of the Wikipedia Hyperlink graph. In particular, given extra information about the categories to which an article belongs to, we are curious to rank the articles according to some criteria.

For this purpose we use the Wikipedia graph released by the SNAP group.

In [1]:
# Prepare the environment
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm, tqdm_notebook

Let's start importing the files

In [2]:
data_link = pd.read_csv(r'C:\Users\franc\OneDrive\Desktop\Francesca\ADM\Homeworks\HW5\wiki-topcats-reduced.txt', sep="\t", header=None, engine='python')   
data_link.columns=['start', 'end']
data_link.head()

Unnamed: 0,start,end
0,52,401135
1,52,1069112
2,52,1163551
3,62,12162
4,62,167659


In [3]:
data_categories = pd.read_csv(r'C:\Users\franc\OneDrive\Desktop\Francesca\ADM\Homeworks\HW5\wiki-topcats-categories.txt.gz', sep='\n')
data_categories.columns = ['Categories']

We need to select only the categories with more than 3500 articles.

In [4]:
l1 = []
l2 = []
for row in data_categories['Categories']:
    a = [int(s) for s in row.split() if s.isdigit()]
    l1.append(a)
    b = ''.join([i for i in row.split() if not i.isdigit()]).replace(";", "").replace("Category:", "").replace("_", " ")
    l2.append(b)

In [5]:
df_articles = pd.DataFrame({'List of articles':l1})
df_categories = pd.DataFrame(l2, columns = ['Categories'])

df_full = df_articles.combine_first(df_categories)

In [6]:
l = []
for item in df_full['List of articles']:
    l.append(len(item))
    
df_full['Length'] = pd.Series(l)

df_full = df_full[df_full.Length > 3500] # Remove items with lenght < 3500

df_full.index = pd.np.arange(0,len(df_full)) # Reindex

df_full

Unnamed: 0,Categories,List of articles,Length
0,English footballers,"[22860, 28411, 28961, 28979, 29264, 29573, 295...",9237
1,The Football League players,"[14003, 23536, 27109, 27348, 27459, 27989, 280...",9467
2,Association football forwards,"[26876, 26877, 26879, 26887, 26892, 26904, 269...",6959
3,Association football goalkeepers,"[26900, 26909, 26917, 26960, 26966, 26984, 270...",3997
4,Association football midfielders,"[14003, 15291, 23536, 26880, 26882, 26885, 268...",8270
5,Association football defenders,"[15217, 22860, 26873, 26878, 26881, 26898, 269...",6668
6,Living people,"[52, 62, 64, 66, 70, 74, 79, 96, 103, 104, 105...",418223
7,Year of birth unknown,"[3335, 10527, 15076, 16310, 22286, 23468, 2346...",3760
8,Harvard University alumni,"[77, 1013, 1271, 1663, 1779, 1843, 2212, 3193,...",6154
9,Major League Baseball pitchers,"[79, 24213, 33054, 37167, 53973, 63107, 69823,...",6580


In [7]:
dict_df_full = df_full['List of articles'].to_dict()

In [8]:
data_page_names = pd.read_table(r'C:\Users\franc\OneDrive\Desktop\Francesca\ADM\Homeworks\HW5\wiki-topcats-page-names.txt.gz', compression='gzip', header=None, sep='\n')

In [9]:
#creating graph 
G=nx.from_pandas_edgelist(data_link, 'start', 'end', create_using=nx.DiGraph())

In [10]:
G.is_directed()

True

In [11]:
G.number_of_nodes()

461193

In [12]:
G.number_of_edges()

2645247

We need to complete the data_link dataframe adding the rows referring to nodes that don't have any edge

In [13]:
missing = data_link[~data_link['end'].isin(data_link['start'])]
missing['set'] = "None"

missing = missing.drop(columns=['start'])

missing.columns = ['start','end']

missing.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,start,end
7,1354553,
53,136,
94,645690,
199,114662,
238,1035,


In [14]:
data_link_full = data_link.append(missing)
data_link_full.head()

Unnamed: 0,start,end
0,52,401135
1,52,1069112
2,52,1163551
3,62,12162
4,62,167659


We can now create the adiacency list.

In order to speed up the calculations, let's filter the data link removing all the edges not belonging to the selected categories.

First, we create a set with all the ids we need.

In [15]:
id_list = []

for articles_list in df_full['List of articles']:
    id_list = id_list + articles_list
    
id_list = list(set(id_list))

Now we can filter the data_link. We can exclude edges where 'start' and 'end' are not present in the id_list

In [16]:
data_link_filtered = data_link_full[
    data_link_full['start'].isin(id_list)
    & data_link_full['end'].isin(id_list)
]

data_link_filtered

Unnamed: 0,start,end
0,52,401135
1,52,1069112
2,52,1163551
3,62,12162
4,62,167659
5,62,279122
6,62,1089199
7,62,1354553
8,62,1400636
9,62,1403619


In [17]:
data_link_grouped = data_link_filtered.groupby('start')

adiacency_list = {}

for key,item in tqdm(data_link_grouped, total=len(data_link_grouped)):
    if data_link_grouped.get_group(key)["end"].tolist()[0] == 'None':
        adiacency_list[key] = set()
    else:
        adiacency_list[key] = set(data_link_grouped.get_group(key)["end"].tolist())
    
adiacency_list

100%|██████████| 428957/428957 [04:35<00:00, 1555.03it/s]


{52: {401135, 1069112, 1163551},
 62: {12162,
  167659,
  279122,
  1089199,
  1354553,
  1400636,
  1403619,
  1537692,
  1544420},
 64: {64873},
 66: {279122, 1163290},
 74: {279122},
 95: {1179478, 1185516, 1185519, 1230865},
 103: {107},
 104: {107, 1174251, 1174302},
 105: {1174967},
 107: {104, 142167, 657930, 1013995, 1061780, 1174302, 1179210, 1181401},
 108: {104, 107, 1059989, 1062426, 1161925, 1169534},
 112: {107},
 113: {279122, 1185127},
 122: {1174967},
 126: {279122},
 133: {279122, 825464},
 134: {541222, 1060414, 1061485, 1062226, 1163610, 1163783, 1265505},
 137: {136, 1163286, 1163712, 1224802},
 153: {154,
  260842,
  725249,
  744272,
  1064807,
  1065282,
  1065320,
  1066493,
  1085943,
  1125916,
  1130618,
  1161083,
  1162104,
  1162238,
  1162816,
  1532744},
 154: {153, 279122, 673672, 1057077, 1063928, 1067637, 1175075},
 155: {279122, 1539938},
 156: {279122, 785921},
 158: {279122, 1262459, 1400635, 1403118},
 160: {598053, 1108161},
 163: {896828},
 166

In [18]:
search = input('Choose a category: ') # English television actors

query = df_full[df_full['Categories'] == search]

for index, row in query.iterrows():
    C0 = row['List of articles']

Choose a category: English television actors


In [None]:
#C1 = df_full['List of articles'][1] # The Football League players

In [None]:
# remove all articles of C0 that are in C1:

#C1_set = [item for item in C1 if item not in C0]

In [None]:
#for C0 in tqdm(C0, total=len(C0), desc='C0 loop', position=0):
    #bfs(adiacency_list, C0)

# BFS

In [34]:
d = {key: [] for key in id_list}

In [35]:
import collections

def bfs(graph, root):
    visited, queue = {root}, collections.deque([(root, 0)])
    depth = 0
    while queue:
        vertex, level = queue.popleft()
        if level > depth:
            depth += 1
        try:
            for neighbour in graph[vertex]:
                if neighbour not in visited:
                    visited.add(neighbour)
                    queue.append((neighbour, depth + 1))
                    d[neighbour] += [depth + 1]
        except:
            pass

In [36]:
for i in C0[:100]:
    bfs(adiacency_list, i)

In [32]:
#bfs(adiacency_list, 1035)

In [37]:
d

{1048576: [9,
  10,
  10,
  10,
  9,
  9,
  10,
  10,
  11,
  10,
  10,
  10,
  9,
  9,
  10,
  10,
  10,
  10,
  10,
  10,
  9,
  10,
  10,
  11,
  12,
  9,
  9,
  10,
  9,
  11,
  10,
  9,
  10,
  9,
  10,
  10,
  10,
  10,
  10,
  9,
  9,
  10,
  10,
  10,
  10,
  9,
  10,
  9,
  10,
  10,
  9,
  10,
  11,
  9,
  9,
  10,
  10,
  10,
  10,
  10,
  9,
  9,
  11,
  11,
  10,
  10,
  9,
  10,
  11,
  10,
  9,
  10,
  10,
  9,
  9,
  9,
  10,
  9,
  9,
  9,
  9,
  10,
  10,
  10],
 1048577: [8,
  9,
  9,
  9,
  8,
  8,
  9,
  9,
  10,
  9,
  9,
  9,
  8,
  8,
  9,
  9,
  9,
  9,
  9,
  9,
  8,
  9,
  9,
  10,
  11,
  8,
  8,
  9,
  8,
  10,
  9,
  8,
  9,
  8,
  9,
  9,
  9,
  9,
  9,
  8,
  8,
  9,
  9,
  9,
  9,
  8,
  9,
  8,
  9,
  9,
  8,
  9,
  10,
  8,
  8,
  9,
  9,
  9,
  9,
  9,
  8,
  8,
  10,
  10,
  9,
  9,
  8,
  9,
  10,
  9,
  8,
  9,
  9,
  8,
  8,
  8,
  9,
  8,
  8,
  8,
  8,
  9,
  9,
  9],
 1048578: [7,
  8,
  8,
  8,
  7,
  7,
  8,
  8,
  9,
  8,
  8,
  8,
  7,
  7