In [92]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from urllib.parse import quote, unquote
from bs4 import BeautifulSoup
from collections import Counter
import seaborn as sns
from statistics import mean
from matplotlib.colors import LogNorm
import networkx as nx

In [93]:
# Load list of dataframes containing links

path_list = []
for i in range(9, 23):
    path_list.append('Wiki_Revs/links' + str(i) + '.tsv')


year_list = []
for i in range(2009, 2023):
    year_list.append(i)

links_dict = {}
for path, year in zip(path_list, year_list):
    links_dict[year] = (pd.read_csv(path, comment='#', delimiter='\t'))

for df in links_dict.values():
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)

In [94]:
links_dict[2009].head()

Unnamed: 0,linkSource,linkTarget
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Bede
1,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Columba
2,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,D%C3%A1l_Riata
3,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Great_Britain
4,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Ireland


In [95]:

def create_graph_from_links(links):
    graph = nx.DiGraph()
    for l, r in links.iterrows():
        graph.add_weighted_edges_from([(r['linkSource'], r['linkTarget'], 1)])
    return graph

def calculate_page_rank(graph):
    graph_page_rank = nx.pagerank(graph)
    graph_page_rank_rank = {k:i+1 for i, (k, v) in enumerate(sorted(graph_page_rank.items(), key=lambda item: item[1], reverse=True))}
    graph_page_rank = {k:graph_page_rank[k] for k in graph_page_rank}
    return graph_page_rank, graph_page_rank_rank

In [96]:
graphs = {}
for year, links in links_dict.items():
    graphs[year] = create_graph_from_links(links)

page_ranks = {}
page_rank_ranks = {}
for year, graph in graphs.items():
    page_ranks[year], page_rank_ranks[year] = calculate_page_rank(graph)

In [119]:
def get_degrees(graph):
    in_degrees = graph.in_degree()
    out_degrees = graph.out_degree()
    return dict(in_degrees), dict(out_degrees)

outlinks = {}
inlinks = {}
for year, graph in graphs.items():
    inlinks[year], outlinks[year] = get_degrees(graph)

In [120]:
def shifters(year, prr, rev=True):
    cur_year = year
    next_year = year + 1
    diff_tuple = ((art, (prr[cur_year][art] - prr[next_year][art])) for art in prr[cur_year].keys() if art in prr[next_year].keys())
    diff_tupsort = sorted(diff_tuple, key=lambda x: x[1], reverse=rev)
    return diff_tupsort

In [147]:
res = shifters(2020, inlinks)
print(res[:10])

[('Bah%C3%A1%27%C3%AD_Faith', 62), ('Industry', 50), ('Kiev', 42), ('Polish-Lithuanian_Commonwealth', 16), ('Earth', 12), ('China', 11), ('European_Parliament', 11), ('Judaism', 11), ('Piano', 10), ('Accra', 10)]


In [148]:
# The main categories that we are interested in (we have chosen these ourselves)
wiki_2009_categories = pd.read_csv('./wikispeedia_paths-and-graph/categories.tsv', delim_whitespace=True, names=['article', 'category'], comment='#')

# When articles have more than one main category, we choose to sample one.
# This is perhaps not an optimal method, but it is currently the best we can think of.
wiki_2009_categories = wiki_2009_categories.groupby('article').sample(1)
wiki_2009_categories['category'] = wiki_2009_categories['category'].str.extract(r'subject\.([a-zA-Z]*)')
wiki_2009_categories = wiki_2009_categories.set_index('article')
article_cat = wiki_2009_categories.to_dict()['category']

In [150]:
shifts2 = []
for year in range(2010, 2022):
    ins = shifters(year, inlinks)[:5]
    outs = shifters(year, outlinks)[:5]
    for i in range(5):
        shifts2.append([year, unquote(ins[i][0]), ins[i][1], unquote(outs[i][0]), outs[i][1]])
shifts2df = pd.DataFrame(shifts2, columns=['year', 'Inlink Page', 'Increase inlinks', 'Outlink Page', 'Increase outlinks'])
shifts2df

Unnamed: 0,year,Inlink Page,Increase inlinks,Outlink Page,Increase outlinks
0,2010,United_States,106,Nicaragua,152
1,2010,Europe,86,Ghana,150
2,2010,England,78,British_monarchy,112
3,2010,World_War_II,74,Costa_Rica,95
4,2010,France,66,Istanbul,87
5,2011,United_States,111,Driving_on_the_left_or_right,254
6,2011,England,89,India,136
7,2011,France,88,"Los_Angeles,_California",112
8,2011,Europe,84,United_States,92
9,2011,United_Kingdom,68,Canada,91


In [151]:
shifts = []
freq_cat = dict()
freq_cat_down = dict()
for year in range(2010, 2022):
    up_5 = shifters(year, page_rank_ranks, True)[:5]
    down_5 = shifters(year, page_rank_ranks, False)[:5]
    for i in range(5):
        shifts.append([year, unquote(up_5[i][0]), up_5[i][1], unquote(down_5[i][0]), down_5[i][1]])
        cat = article_cat[up_5[i][0]]
        if down_5[i][0] in article_cat:
            catd = article_cat[down_5[i][0]]
            if catd in freq_cat_down:
                freq_cat_down[catd] += 1
            else:
                freq_cat_down[catd] = 1
        if cat in freq_cat:
            freq_cat[cat] += 1
        else:
            freq_cat[cat] = 1
shiftsdf = pd.DataFrame(shifts, columns=['year', 'up_1', 'up_1_val', 'down_1', 'down_1_val'])
shiftsdf
# print(shifters(2021, page_rank_ranks, False)[:5])

Unnamed: 0,year,up_1,up_1_val,down_1,down_1_val
0,2010,Spring_Heeled_Jack,3246,Belton_House,-2966
1,2010,Whitethroat,3199,Nigella,-2832
2,2010,Storm_of_October_1804,3160,Donation,-2590
3,2010,Rebecca_Helferich_Clarke,3136,Felice_Beato,-2509
4,2010,Pochard,3134,Wigeon,-2496
5,2011,Project_MKULTRA,3237,Conflict,-2819
6,2011,Short-beaked_Echidna,3179,Kookaburra,-2396
7,2011,Shrimp_farm,3093,Gardening,-2293
8,2011,Scent_of_a_Woman,3092,Menthol,-2263
9,2011,Roan_Antelope,3086,Sumo,-2140


In [100]:
print(freq_cat)
print(freq_cat_down)

{'Religion': 4, 'Science': 20, 'Geography': 6, 'People': 4, 'History': 12, 'Everyday': 3, 'IT': 1, 'Design': 6, 'Language': 1, 'Business': 1, 'Citizenship': 2}
{'Design': 5, 'Science': 13, 'People': 5, 'Citizenship': 5, 'Everyday': 6, 'History': 8, 'IT': 1, 'Business': 3, 'Geography': 6, 'Mathematics': 1, 'Language': 4, 'Art': 1, 'Music': 1}


In [101]:
print(article_cat['Defaka'])

Geography


In [175]:
prr = page_rank_ranks
cur_year = 2020
next_year = 2021
diff_tuple = ([art, (prr[cur_year][art] - prr[next_year][art])] for art in prr[cur_year].keys() if art in prr[next_year].keys())
diff_tupsort = sorted(diff_tuple, key=lambda x: x[1], reverse=True)
diff_tupsort = [(unquote(x[0]), x[1]) for x in diff_tupsort]
print(diff_tupsort[:20])

[('Effect_of_Hurricane_Katrina_on_New_Orleans', 2985), ('Sardar_Vallabhbhai_Patel', 2968), ('Shelduck', 2952), ('Trinity_test', 2950), ('Allegory_in_the_Middle_Ages', 2912), ('David_Heymann', 2909), ('London_sewerage_system', 2896), ('Kakapo', 2791), ('Sputnik_program', 2756), ('Saint_Lawrence_Seaway', 2692), ('National_parks_of_England_and_Wales', 2553), ('First_Transcontinental_Railroad', 2398), ('Margin_of_error', 2349), ('RER', 2234), ('Battle_of_Hastings', 2195), ('Persian_Empire', 2174), ('Eliminative_materialism', 2056), ('Boa', 1989), ('Newmarket', 1880), ('Sea_level_rise', 979)]


In [176]:
inl = inlinks
cur_year = 2010
next_year = 2022
diff_tuple = ([art, (inl[cur_year][art] - inl[next_year][art])] for art in inl[cur_year].keys() if art in inl[next_year].keys())
diff_tupsort = sorted(diff_tuple, key=lambda x: x[1], reverse=True)
diff_tupsort = [(unquote(x[0]), x[1]) for x in diff_tupsort]
print(diff_tupsort[:5])

[('United_States', 513), ('Europe', 358), ('United_Kingdom', 303), ('France', 291), ('England', 271)]
