# Import Packages

In [8]:
import glob
import pandas as pd
from tqdm import tqdm
from nltk.tokenize import RegexpTokenizer
import re
import networkx as nx
from scipy.stats import kendalltau
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import gensim
import joblib


# Processing Data

In [None]:
txts = glob.glob("C:/Users/yiwei/Downloads/wiki_data/*.txt")
titles = []
texts = []
for val in tqdm(txts):
    with open(val) as f:
        text = f.read()
        title = text[text.find('>') + 1:text.find('<', 2)]
        texts.append(text)
        titles.append(title)

100%|██████████| 40001/40001 [01:12<00:00, 552.31it/s]  


In [27]:
corpus = pd.DataFrame({'title': titles, 'text': texts})

In [36]:
corpus.head()

Unnamed: 0,title,text
0,April,<title>April</title><text>{{monththisyear|4}} ...
1,August,<title>August</title><text>{{monththisyear|8}}...
2,Andouille,<title>Andouille</title><text>[[File:Andouille...
3,Calculus,<title>Calculus</title><text>{{More citations ...
4,Liter,<title>Liter</title><text>#REDIRECT [[Litre]] ...


In [34]:
corpus.to_csv('wiki_data.csv', index=False)

In [3]:
corpus = pd.read_csv('wiki_data.csv')

In [4]:
# because dataset is too big to handle with our resources (limitations for semantic graph), we subsample the data
# subset_corpus = corpus.sample(frac=0.25, random_state=42)
# subset_corpus.reset_index(drop=True, inplace=True)

## Generating Semantic Graph from Dataset

In [5]:
CLEANR = re.compile('<.*?>') 

def cleanhtml(raw_html):
  cleantext = re.sub(CLEANR, '', raw_html)
  return cleantext

In [13]:
# removing punctuation
tokenizer = RegexpTokenizer(r'\w+')
to_tokenize = []
for sentence in tqdm(corpus['text']):
    to_tokenize.append(' '.join(tokenizer.tokenize(cleanhtml(sentence))))


100%|██████████| 40001/40001 [00:03<00:00, 12688.81it/s]


In [14]:
idx_title_mapping = {idx: str(corpus['title'][idx]).lower() for idx in range(len(to_tokenize))}

In [11]:
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [15]:
def get_text_embedding(tokens, model):
    word_vectors = []
    for word in tokens:
        if word in model.key_to_index:  # Check if word is in the model's vocabulary
            word_vectors.append(model[word])
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(model.vector_size)  # Return zero vector if no word vectors

# Compute the embeddings for each document
text_embeddings = np.array([get_text_embedding(tokens, model) for tokens in tqdm(to_tokenize)])

# Calculate the cosine similarity matrix between the embeddings
cos_sim_matrix = cosine_similarity(text_embeddings)

  0%|          | 0/40001 [00:00<?, ?it/s]

100%|██████████| 40001/40001 [01:09<00:00, 576.52it/s] 


In [18]:
joblib.dump(cos_sim_matrix, 'cos_sim_matrix_word2vec.pkl')

['cos_sim_matrix_word2vec.pkl']

In [17]:

# Threshold for considering an edge (e.g., similarity > 0.1)
threshold = 0.1

# Create the graph without adding too many edges
SG = nx.Graph()

for i in tqdm(range(len(to_tokenize))):
    SG.add_node(i, content=(to_tokenize[i]))  # Add article as a node
    
    for j in range(i+1, len(to_tokenize)):
        similarity = cos_sim_matrix[i, j]
        
        if similarity > threshold:
            SG.add_edge(i, j, weight=similarity)

# After this, G will only contain edges with similarity above the threshold


  3%|▎         | 1314/40001 [01:57<57:34, 11.20it/s]  


MemoryError: 

In [None]:
PR = nx.DiGraph()
pattern = r'\[\[(.*?)\]\]'
titles = corpus['title'].str.lower()
PR.add_nodes_from(titles)
for i in tqdm(range(len(corpus['title']))):
    txt = corpus['text'][i]
    references = re.findall(pattern, txt)
    for reference in references:
        if len(titles[titles == reference]) > 0:
            PR.add_edge(titles[i], reference.lower())

# OLD CODE

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import numpy as np

# Compute the TF-IDF matrix (sparse)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(to_tokenize)  # articles is your list of article texts

# Calculate cosine similarity matrix (only for pairs with a non-zero similarity)
cos_sim_matrix = cosine_similarity(tfidf_matrix)
joblib.dump(cos_sim_matrix, 'cos_sim_matrix.pkl')


['cos_sim_matrix.pkl']

In [8]:

# Threshold for considering an edge (e.g., similarity > 0.1)
threshold = 0.1

# Create the graph without adding too many edges
SG = nx.Graph()

for i in tqdm(range(len(to_tokenize))):
    SG.add_node(i, content=(to_tokenize[i]))  # Add article as a node
    
    for j in range(i+1, len(to_tokenize)):
        similarity = cos_sim_matrix[i, j]
        
        if similarity > threshold:
            SG.add_edge(i, j, weight=similarity)

# After this, G will only contain edges with similarity above the threshold


100%|██████████| 10000/10000 [00:12<00:00, 816.42it/s]


## Generating PageRank Graph from Dataset

In [11]:
# first graph generate under pretext of a "closed environment"
PRG = nx.DiGraph()
pattern = r'\[\[(.*?)\]\]'
titles = corpus['title'].str.lower()
PRG.add_nodes_from(titles)
for i in tqdm(range(len(corpus['title']))):
    txt = corpus['text'][i]
    references = re.findall(pattern, txt)
    for reference in references:
        if len(titles[titles == reference]) > 0:
            PRG.add_edge(titles[i], reference.lower())

100%|██████████| 10000/10000 [02:29<00:00, 66.73it/s]


# Using Semantic Graph for PageRank

In [27]:
# using PageRank
pr_semantic = nx.pagerank(SG, weight='weight')
pr_semantic = {idx_title_mapping[idx]: pr_semantic[idx] for idx in pr_semantic}


In [28]:
pr_semantic

{'iliad': 0.00031967378294590996,
 'natural history': 0.0003456240121572913,
 '1955': 0.00045061863185688784,
 'bismarck (ship)': 0.00025148840656744384,
 'ohio river': 9.451045898665043e-05,
 'chief executive officer': 0.00023595867199984274,
 'grass': 0.0001627866145495946,
 'sergei prokofiev': 0.0006131843035978945,
 'hajj': 0.0003835970223377861,
 'balfour declaration of 1917': 0.00015271642221831403,
 '1258': 0.00038659879413410184,
 'chin': 0.00023564875791872299,
 'main': 0.00039480411219003906,
 'developmental biology': 0.0002462628963525054,
 'eagle': 0.0001260798149650851,
 'nia': 9.52026753173541e-05,
 'benzene': 0.00011253710870711277,
 '1918': 0.0002837818602838858,
 'amaterasu': 0.00023931685076485862,
 '50': 0.00011767090300168684,
 'arturo toscanini': 0.00043106114035413906,
 'castle': 0.00030588189562339764,
 'woodwind instrument': 0.00041191829985669974,
 'foot': 0.0002084280854242049,
 'universe': 0.0007179868229040932,
 'yankee stadium (1923)': 0.0002137540782399088

# Using Normal Dataset for PageRank

In [29]:
pr = nx.pagerank(PRGC)

In [30]:
pr

{'iliad': 5.281689695589627e-05,
 'ilham aliyev': 5.281689695589627e-05,
 'category:1943 births': 5.281689695589627e-05,
 'category:history of russia': 5.281689695589627e-05,
 'kathleen blanco': 5.281689695589627e-05,
 'category:1798 births': 5.281689695589627e-05,
 'aetyonyx': 5.281689695589627e-05,
 'category:1894 deaths': 5.281689695589627e-05,
 'template:rfa': 5.281689695589627e-05,
 'becker (tv series)': 5.281689695589627e-05,
 'wrestlemania (ppv series)': 5.281689695589627e-05,
 'template:user movies': 5.281689695589627e-05,
 'the adventures of huckleberry finn': 5.281689695589627e-05,
 'weapons': 0.0001528235584903303,
 'ravi shanker': 5.281689695589627e-05,
 'category:1152': 5.281689695589627e-05,
 'the empire strikes back': 5.281689695589627e-05,
 'telephone call': 5.347920663654016e-05,
 'category:french cyclists': 5.281689695589627e-05,
 'lyle lovett': 5.281689695589627e-05,
 'the church of scientology': 5.281689695589627e-05,
 'stanford university': 5.281689695589627e-05,
 

# Analysis Between Semantic Graph vs Normal Dataset for PageRank 

In [31]:
def sort_dict(input):
    keys = list(input.keys())
    values = list(input.values())
    sorted_value_idx = np.argsort(values)
    return [keys[i] for i in sorted_value_idx[::-1]]

In [32]:
pr_sorted = sort_dict(pr)
pr_semantic_sorted = sort_dict(pr_semantic)

In [33]:
kendalltau(pr_sorted, pr_semantic_sorted)

SignificanceResult(statistic=-0.00866888742963993, pvalue=0.19457062578100548)

In [34]:
pr_semantic_sorted[:10]

['wikipedia:manual of style',
 'australia',
 'arctic',
 'manhattan',
 'city',
 'racial segregation',
 'philippines',
 'history of japan',
 "st. peter's basilica",
 'albert einstein']

In [35]:
pr_sorted[:10]

['movie',
 'city',
 'language',
 'country',
 'newspaper',
 'sound',
 'news',
 'oxygen',
 'mammal',
 'mathematics']