# a notebook to visualize the relationship between the wiki articles.

In [1]:
import networkx as nx

In [2]:
import pandas as pd 


import matplotlib.pyplot as plt
import numpy as np


from sklearn.feature_extraction import text 

import nltk
nltk.download('punkt')


from sklearn.decomposition import LatentDirichletAllocation as LDA


from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

count_vectorizer = text.CountVectorizer(tokenizer=LemmaTokenizer(),
                                strip_accents = 'unicode', # works 
                                stop_words = 'english', # works
                                lowercase = True, # works
                                max_df = 0.5, # works
                                min_df = 10) # works

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
wiki_phys = pd.read_csv("../nlp_clean/wiki_phys_cleaned.csv")

In [4]:
wiki_phys.drop_duplicates("Name",inplace=True)

In [5]:
wiki_phys.shape

(1479, 6)

In [6]:
data_count_vectorized=count_vectorizer.fit_transform(wiki_phys['Text'].values.astype('U'))

lda_model_count=LDA(n_components=26
              ,max_iter=100,learning_method='batch').fit(data_count_vectorized)

In [7]:
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

def top_n_index_sorted(array, top_n):
    # get the index of top 10 values in the array
    index = np.argpartition(array,-top_n)[-top_n:]
    
    # return the index of the sorted top_n values 
    return index[np.argsort(array[index])]

def top10_recommend_index(index,model,data,top_n):
    # transform the data using the model
    lda_vectors=model.transform(data)
    # calculate the cos similarity from the lda vectors 
    similarity=cos_sim(lda_vectors)
    
    top_n_index=top_n_index_sorted(similarity[index,],top_n+1)
    
    # get the decreasing similarity index
    return top_n_index[::-1]  

In [8]:
# top10_recommend_1 = top10_recommend_index(1,lda_model_count,data_count_vectorized,10)

In [39]:
# a function to generate edge list
def add_edges_from_recommendation(index,model,data,text_vectors):
    top10_recommended = top10_recommend_index(index,model,text_vectors,10)
    
    edge_list = list()
    original_article = data['Name'].iloc[index]
    for recommend_index in top10_recommended[1:]:
        node_name= data['Name'].iloc[recommend_index]
        edge_list.append((original_article,node_name))
    
    return edge_list

In [42]:
add_edges_from_recommendation(0,lda_model_count,wiki_phys,data_count_vectorized)

[('Wikipedia:FAQ/Categorization', 'List of physics journals'),
 ('Wikipedia:FAQ/Categorization', 'Category:Physics stubs'),
 ('Wikipedia:FAQ/Categorization', 'List of Slovenian physicists'),
 ('Wikipedia:FAQ/Categorization', 'Template:Physics-stub'),
 ('Wikipedia:FAQ/Categorization', 'Template:Acoustics-stub'),
 ('Wikipedia:FAQ/Categorization', 'Template:Biophysics-stub'),
 ('Wikipedia:FAQ/Categorization',
  'International Association of Physics Students'),
 ('Wikipedia:FAQ/Categorization',
  "Ukrainian Physicists' Tournament for University Students"),
 ('Wikipedia:FAQ/Categorization', 'CESRA'),
 ('Wikipedia:FAQ/Categorization',
  'Physics Instructional Resource Association')]

In [None]:
# for the progress bar
from tqdm import tqdm
wiki_graph = nx.Graph()

for index in tqdm(range(wiki_phys.shape[0])):
    wiki_graph.add_edges_from(add_edges_from_recommendation(index,lda_model_count,wiki_phys,data_count_vectorized))

plt.figure(figsize = (20,20))
nx.draw_networkx(wiki_graph)

 78%|███████▊  | 1150/1479 [13:55<04:10,  1.31it/s]