# Import Packages

In [17]:
import glob
import pandas as pd
from tqdm import tqdm

from transformers import BertTokenizer, BertModel
import torch
from nltk.tokenize import RegexpTokenizer
import re
import networkx as nx
from scipy.stats import kendalltau
import numpy as np
import torch

# Processing Data

In [None]:
txts = glob.glob("C:/Users/yiwei/Downloads/wiki_data/*.txt")
titles = []
texts = []
for val in tqdm(txts):
    with open(val) as f:
        text = f.read()
        title = text[text.find('>') + 1:text.find('<', 2)]
        texts.append(text)
        titles.append(title)

100%|██████████| 40001/40001 [01:12<00:00, 552.31it/s]  


In [27]:
corpus = pd.DataFrame({'title': titles, 'text': texts})

In [36]:
corpus.head()

Unnamed: 0,title,text
0,April,<title>April</title><text>{{monththisyear|4}} ...
1,August,<title>August</title><text>{{monththisyear|8}}...
2,Andouille,<title>Andouille</title><text>[[File:Andouille...
3,Calculus,<title>Calculus</title><text>{{More citations ...
4,Liter,<title>Liter</title><text>#REDIRECT [[Litre]] ...


In [34]:
corpus.to_csv('wiki_data.csv', index=False)

In [4]:
corpus = pd.read_csv('wiki_data.csv')

In [5]:
# because dataset is too big to handle with our resources (limitations for semantic graph), we subsample the data
subset_corpus = corpus.sample(frac=0.05, random_state=42)
subset_corpus.reset_index(drop=True, inplace=True)

## Generating Semantic Graph from Dataset

In [6]:
CLEANR = re.compile('<.*?>') 

def cleanhtml(raw_html):
  cleantext = re.sub(CLEANR, '', raw_html)
  return cleantext

In [7]:
# removing punctuation
tokenizer = RegexpTokenizer(r'\w+')
to_tokenize = []
for sentence in tqdm(subset_corpus['text']):
    to_tokenize.append(' '.join(tokenizer.tokenize(cleanhtml(sentence))))


100%|██████████| 2000/2000 [00:00<00:00, 16808.14it/s]


In [8]:
def batch_iterator(data, batch_size = 100):
    for i in range(0, len(data), batch_size):
        yield data[i:i+batch_size]

In [9]:
device = 'cuda'
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertModel.from_pretrained('bert-base-cased').to(device)

In [10]:
encodings_lst = []
for batch in tqdm(batch_iterator(to_tokenize)):
    encodings = tokenizer.batch_encode_plus(batch, truncation=True, padding=True, return_tensors='pt', add_special_tokens=True)
    encodings = encodings.to(device)
    encodings_lst.append(encodings)


0it [00:00, ?it/s]

20it [00:05,  3.54it/s]


In [11]:
word_embeddings_lst = []
for encodings in tqdm(encodings_lst):
    with torch.no_grad():
        outputs = model(encodings['input_ids'], attention_mask=encodings['attention_mask'])
        word_embeddings = outputs.last_hidden_state 
        word_embeddings_lst.append(word_embeddings)
    del encodings

100%|██████████| 20/20 [00:32<00:00,  1.62s/it]


In [12]:
all_word_embeddings = torch.cat(word_embeddings_lst, dim=0)
del word_embeddings_lst
corpus_semantics = all_word_embeddings.mean(dim=1)
del all_word_embeddings


In [13]:
# calculate cosine similarity of tensors
cosine_similarities = corpus_semantics / corpus_semantics.norm(p=2, dim=1, keepdim=True)
cosine_similarities = torch.mm(cosine_similarities, cosine_similarities.T)

In [14]:
G = nx.Graph()
titles = subset_corpus['title'].str.lower()
G.add_nodes_from(subset_corpus['title'].str.lower())
for i in tqdm(range(corpus_semantics.shape[0])):
    for j in range(i + 1, corpus_semantics.shape[0]):
        similarity = cosine_similarities[i, j].item()
        if not G.has_edge(titles[i], titles[j]):
            G.add_edge(titles[i], titles[j], weight=similarity)
        else:
            if similarity > G.get_edge_data(titles[i], titles[j])['weight']:
                G.remove_edge(titles[i], titles[j])
                G.add_edge(titles[i], titles[j], weight=similarity)

100%|██████████| 2000/2000 [01:19<00:00, 25.12it/s] 


## Generating PageRank Graph from Dataset

In [170]:
# first graph generate under pretext of a "closed environment"
PRGC = nx.DiGraph()
pattern = r'\[\[(.*?)\]\]'
titles = subset_corpus['title'].str.lower()
PRGC.add_nodes_from(titles)
for i in tqdm(range(len(subset_corpus['title']))):
    txt = subset_corpus['text'][i]
    references = re.findall(pattern, txt)
    for reference in references:
        if len(titles[titles == reference]) > 0:
            PRGC.add_edge(titles[i], reference.lower())

100%|██████████| 2000/2000 [00:10<00:00, 197.16it/s]


# Using Semantic Graph for PageRank

In [132]:
# using PageRank
pr_semantic = nx.pagerank(G, weight='weight')

In [133]:
pr_semantic

{'iliad': 0.0004900985084144861,
 'ilham aliyev': 0.0005263368209468375,
 'category:1943 births': 0.000507806672514236,
 'category:history of russia': 0.0005081274495413878,
 'kathleen blanco': 0.0004920031745096513,
 'category:1798 births': 0.000502829891842668,
 'aetyonyx': 0.00048023498519902345,
 'category:1894 deaths': 0.0004969179770956567,
 'template:rfa': 0.0005021665964478898,
 'becker (tv series)': 0.0004750344220907833,
 'wrestlemania (ppv series)': 0.0004836620797585116,
 'template:user movies': 0.0005015446875844149,
 'the adventures of huckleberry finn': 0.0004968491443796292,
 'weapons': 0.0005047282009835525,
 'ravi shanker': 0.0004757608884296069,
 'category:1152': 0.0004933820613314319,
 'the empire strikes back': 0.0004955483781605544,
 'telephone call': 0.0004871465689236712,
 'category:french cyclists': 0.0004989927173033412,
 'lyle lovett': 0.00046118700543973314,
 'the church of scientology': 0.0004978630700174703,
 'stanford university': 0.0004824524186494746,
 

# Using Normal Dataset for PageRank

In [177]:
pr = nx.pagerank(PRGC)

In [178]:
pr

{'iliad': 0.0004077600251460045,
 'ilham aliyev': 0.0004077600251460045,
 'category:1943 births': 0.0004077600251460045,
 'category:history of russia': 0.0004077600251460045,
 'kathleen blanco': 0.0004077600251460045,
 'category:1798 births': 0.0004077600251460045,
 'aetyonyx': 0.0004077600251460045,
 'category:1894 deaths': 0.0004077600251460045,
 'template:rfa': 0.0004077600251460045,
 'becker (tv series)': 0.0004077600251460045,
 'wrestlemania (ppv series)': 0.0004077600251460045,
 'template:user movies': 0.0004077600251460045,
 'the adventures of huckleberry finn': 0.0004077600251460045,
 'weapons': 0.00075467217355405,
 'ravi shanker': 0.0004077600251460045,
 'category:1152': 0.0004077600251460045,
 'the empire strikes back': 0.0004077600251460045,
 'telephone call': 0.0004077600251460045,
 'category:french cyclists': 0.0004077600251460045,
 'lyle lovett': 0.0004077600251460045,
 'the church of scientology': 0.0004077600251460045,
 'stanford university': 0.0004077600251460045,
 'b

# Analysis Between Semantic Graph vs Normal Dataset for PageRank 

In [179]:
def sort_dict(input):
    keys = list(input.keys())
    values = list(input.values())
    sorted_value_idx = np.argsort(values)
    return [keys[i] for i in sorted_value_idx[::-1]]

In [180]:
pr_sorted = sort_dict(pr)
pr_semantic_sorted = sort_dict(pr_semantic)

In [181]:
kendalltau(pr_sorted, pr_semantic_sorted)

SignificanceResult(statistic=-0.0150590810920976, pvalue=0.3129468826698215)

In [182]:
pr_semantic_sorted[:10]

['boredom',
 'wendy carlos',
 'malcolm mcdowell',
 'piri reis',
 'sundial',
 'kurw',
 'peer review',
 'puerta de alcal',
 'hydraulics',
 'wham!']

In [183]:
pr_sorted[:10]

['geometry',
 'cone',
 'science',
 'hero',
 'protagonist',
 'clothing',
 '1194',
 'bird',
 '1250',
 'chemistry']