In [29]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from collections import OrderedDict

### what is done:
construct Wikispeedia graph
run page rank
process the 'finished paths': 
    <br> - take into account back clicks to consider only the effective path
    <br> - 'processed_paths' contains all the finished paths starting from the 'hub' (the page with highest page rank in that path)


#### see next step at the bottom

##### please update this cell to keep track of prject progress

In [30]:
links = pd.read_csv("wikispeedia_paths-and-graph/links.tsv", sep='\t', skiprows = 11, names = ['link_source', 'link_target'])
# links.tsv contains all links between articles. It can be used to contruct a graph

In [31]:
#verify that there is no missing source/target
links.any().isna()

link_source    False
link_target    False
dtype: bool

In [32]:
G = nx.from_pandas_edgelist(links, 'link_source', 'link_target', create_using=nx.DiGraph())

In [33]:
dangling_nodes = []
for node, out_deg in G.out_degree():
    if out_deg == 0:
        dangling_nodes.append(node)
        
dangling_nodes

['Osteomalacia',
 'Local_community',
 'Directdebit',
 'Duchenne_muscular_dystrophy',
 'Klinefelter%27s_syndrome']

In [34]:
page_rank = nx.pagerank(G, alpha = 0.95)

In [35]:
keys = list(page_rank.keys())
vals = list(page_rank.values())
sorted_page_rank = {keys[i]: vals[i] for i in np.argsort(vals)[::-1]}

In [36]:
#At this step page rank is done for each page
# now we would like to consider the paths (FINISHED PATHS FOR NOW) and consider only the pages visited after the 'hub'

In [37]:
finished_paths = pd.read_csv("wikispeedia_paths-and-graph/paths_finished.tsv", sep='\t', skiprows = 17, names=['hashedIpAddress','timestamp','durationInSec','path','rating'])

In [38]:
finished_paths.head()

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating
0,3824310e536af032,1344753412,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0
1,415612e93584d30e,1349298640,138,14th_century;Niger;Nigeria;British_Empire;Slav...,
2,64dd5cd342e3780c,1265613925,37,14th_century;Renaissance;Ancient_Greece;Greece,
3,015245d773376aab,1366730828,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3.0
4,5295bca242be81fe,1372890414,110,14th_century;Europe;North_America;United_State...,


In [39]:

#final version of remove_unvisited_pages
# working well enfiiin

def remove_unvisited_pages(path):
    if path.count('<') == 0:
        return path
    
    i=0
    while i<len(path):

        if path[i] == '<':
        
            counter = 0
            tmp_i = i

            while tmp_i<len(path) and path[tmp_i] == '<':
       
                tmp_i +=1
                counter+=1
            v=max(0,i-counter)

            for j in range(v, i):

                path[j] = '<'
            i = tmp_i
        else:
            i+=1
    indx = np.where(np.array(path)=='<')[0]
    path = np.delete(np.array(path), indx)
    return path

In [40]:
#test remove_unvisited_pages
l =(finished_paths['path'].iloc[40]).split(';')
print(l)           
l=remove_unvisited_pages(l)
l

['14th_century', 'Europe', '<', 'England', '<', 'Christianity', 'Anglicanism', 'United_States', 'Walt_Whitman', '<', 'Poetry_of_the_United_States', '<', '<', '<', 'French_Revolution', 'Age_of_Enlightenment', 'David_Hume', 'Philosophy', 'Fyodor_Dostoevsky', '<', '<', '<', '<', 'Constitutional_monarchy', 'Government', 'Anarchism', 'Henry_David_Thoreau']


array(['14th_century', 'Christianity', 'Anglicanism', 'United_States',
       'French_Revolution', 'Constitutional_monarchy', 'Government',
       'Anarchism', 'Henry_David_Thoreau'], dtype='<U23')

In [41]:
processed_paths = []
pages_with_no_rank = []  # those pages are missing in 'articles' ?! Should we add them to the graph ?
for i, s in finished_paths.iterrows():
    path_split=(finished_paths['path'].iloc[i]).split(';')
    path_split = remove_unvisited_pages(path_split)
    ranks = []
    for elem in path_split:
        r = page_rank.get(elem, -1)
        if r==-1:
            pages_with_no_rank.append(elem)
        ranks.append(r)

    processed_paths.append(path_split[np.argmax(ranks):])
    
pages_with_no_rank

['Wikipedia_Text_of_the_GNU_Free_Documentation_License']

In [42]:
#processed paths contains all the paths starting from the 'hub' (selected as the page with highest page rank in that path)

### Next Step
use the dataframe categories to replace each page in processed paths by each category and construct the graph using categories

In [43]:
categories = pd.read_csv("wikispeedia_paths-and-graph/categories.tsv", sep='\t', skiprows = 12, names=['article', 'category'])

In [44]:
categories.head()

Unnamed: 0,article,category
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,subject.History.British_History.British_Histor...
1,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,subject.People.Historical_figures
2,%C3%85land,subject.Countries
3,%C3%85land,subject.Geography.European_Geography.European_...
4,%C3%89douard_Manet,subject.People.Artists


In [45]:
# the number of periods is the specifity of the category, the more specific 
# the more relevant 
def count_periods(s):
    return s.count('.')

In [46]:
# Add a column with the count of periods in the category
categories['period_count'] = categories['category'].apply(count_periods)

# Selecting the most specific category for each article
most_specific_category = categories.sort_values(by='period_count', ascending=False).drop_duplicates('article')

# Dropping the period_count column to get the final result
specific_categories = most_specific_category.drop(columns=['period_count'])

# Resetting the index for cleanliness
specific_categories.reset_index(drop=True, inplace=True)

specific_categories.head()

Unnamed: 0,article,category
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,subject.History.British_History.British_Histor...
1,European_Greenfinch,subject.Science.Biology.Birds
2,Rutherfordium,subject.Science.Chemistry.Chemical_elements
3,Eurasian_Oystercatcher,subject.Science.Biology.Birds
4,Eurasian_Jay,subject.Science.Biology.Birds


In [47]:
# making the article column the index
specific_categories = specific_categories.set_index('article')

In [48]:
specific_categories.head()

Unnamed: 0_level_0,category
article,Unnamed: 1_level_1
%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,subject.History.British_History.British_Histor...
European_Greenfinch,subject.Science.Biology.Birds
Rutherfordium,subject.Science.Chemistry.Chemical_elements
Eurasian_Oystercatcher,subject.Science.Biology.Birds
Eurasian_Jay,subject.Science.Biology.Birds


In [49]:
# transforming the article paths into paths of categories  
processed_paths_serie = pd.Series(processed_paths)
category_paths = processed_paths_serie.map(lambda path: [specific_categories.loc[article, 'category'] if article in specific_categories.index else 'Unknown_Category' for article in path])

In [50]:
category_paths.head()

0    [subject.Geography.European_Geography, subject...
1    [subject.Geography.African_Geography, subject....
2    [subject.Geography.European_Geography.European...
3    [subject.Geography.European_Geography.European...
4    [subject.Geography.North_American_Geography, s...
dtype: object

In [52]:
from collections import defaultdict
# building the connection dataframe with the counter

category_pairs_counter = defaultdict(int)

# Process each path
for path in category_paths:
    for i in range(len(path) - 1):
        # Increment the counter for each found pair
        pair = (path[i], path[i + 1])
        category_pairs_counter[pair] += 1

# unique category list that appear in our paths, 
# no need for all categories of articles in  

unique_categories = list(set(cat for path in category_paths for cat in path))

# Initialize an the connections
category_connections = pd.DataFrame(index=unique_categories, columns=unique_categories).fillna(0)

# filling up the values with the counts
for (cat1, cat2), count in category_pairs_counter.items():
    category_connections.loc[cat1, cat2] = count
    
category_connections

Unnamed: 0,subject.People.Computing_People,subject.People.Producers_directors_and_media_figures,subject.People.Performers_and_composers,subject.Geography.Antarctica,subject.Everyday_life.Games,subject.Citizenship.Politics_and_government,subject.History.Recent_History,subject.Citizenship.Environment,subject.Citizenship.Law,subject.Science.Physics.The_Planets,...,subject.Art.Art,subject.People.Monarchs_of_Great_Britain,subject.Everyday_life.Sports,subject.Geography.Geography_of_the_Middle_East,subject.History.North_American_History,subject.People.Writers_and_critics,subject.Geography.Geography_of_Oceania_Australasia,subject.People.Human_Scientists,subject.Everyday_life.Sports_events,subject.Geography.European_Geography
subject.People.Computing_People,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
subject.People.Producers_directors_and_media_figures,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
subject.People.Performers_and_composers,0,0,72,0,0,0,0,0,0,1,...,1,0,1,0,0,1,0,0,0,3
subject.Geography.Antarctica,0,0,0,25,0,0,0,0,0,3,...,0,0,0,0,0,0,1,0,0,2
subject.Everyday_life.Games,0,0,0,0,188,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
subject.People.Writers_and_critics,0,1,2,0,0,4,0,0,0,0,...,2,0,0,0,0,44,1,0,0,3
subject.Geography.Geography_of_Oceania_Australasia,0,0,3,11,0,14,1,19,1,0,...,0,0,42,0,0,0,352,0,19,1
subject.People.Human_Scientists,0,0,0,0,0,1,0,3,0,0,...,1,0,0,0,0,2,1,19,0,0
subject.Everyday_life.Sports_events,0,0,0,0,0,0,0,0,0,0,...,0,0,121,0,0,0,19,0,38,2
