Let's try and see clusters of Categories among the wikipedia articles using the categorylinks.


In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.manifold import Isomap

In [2]:
# Open the file from the repository

with open("cl_epa.txt", "r") as f:
    data=json.load(f)
len(data)

7524

In [3]:
# I need an array of [[1,2,3... ],[0,3,5...], ...] 
# so I prepare a vocabulary to then use the vectorizer

# Create the all-encompassing vocabulary
import re

def GetVocabList(data_dict):
    #data_dict is a dictionary {keys}:["str1", "str2",..., "strN"]
    #this functions yields ["str1", "str2", ... "strM"], where M is the sum of N over all Keys
    list_categories = []
    cl_index=[]
    for key in data_dict: 
        cl_index.append(key)
        links=[]
        for n in range(len(data_dict[key])):
        
            # Remove the following regular expressions
            
            re0="protected_page"
            #re1="with_accessdate"
            #re2=".rticle"
            #re3=".ikipedia"
            #re4="to_be_expanded"
            #re5="with_unsourced_statements"
            #re6="needing_additional_references"
            #re7="lacking_sources"
            #re8="containing_potentially_dated_statements"
            #re9="with_dead_external_links"
            re1="Certification_Table_Entry_usages_for"
            re2="language_sources"
            re3="Articles_containing"
            
            if not any(re.search(regex, data_dict[key][n]) for regex in [re0, re1, re2, re3]):
                links.append(data_dict[key][n])
        links=" ".join(links)
        list_categories.append(links)
                
    return list_categories, cl_index

# list_categories looks like ["str1 strN", ..., "... str137593"]]. A list strings, each is " ".join(links_in_article)

In [4]:
# Get clean list of links
clean_categories, epa_id = GetVocabList(data)

print(len(clean_categories))
print(len(epa_id))
print(epa_id[3200])
print(data[epa_id[3200]])
print(clean_categories[3200])   #it's a list of strings

7524
7524
166706
['Articles_containing_Greek-language_text', 'Greek_letters', 'Phonetic_transcription_symbols', 'Wikipedia_indefinitely_semi-protected_pages']
Greek_letters Phonetic_transcription_symbols


In [5]:
# Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool.  

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 
# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_features = vectorizer.fit_transform(clean_categories)

In [83]:
print(type(train_features))
print(train_features.shape)

# Numpy arrays are easy to work with, so convert the result to an 
# array  (instead of a matrix)
#train_features_array = train_features.toarray()
#print(type(train_features_array))
#print(train_features_array.shape)

<class 'scipy.sparse.csr.csr_matrix'>
(7524, 5000)


In [117]:
# there are 137 articles without links in the vectorizer!
j=0

for i in range(7524):
    if train_features[i].sum(1)<1:
        j=j+1
print("total", j)

total 137


In [119]:
tf_norma=np.sqrt(train_features.multiply(train_features).sum(1))
tf_normalized=train_features.multiply(1/train_features.multiply(train_features).sum(1))
tf_check=tf_normalized.sum(1)

j=0

for i in range(7524):
    if tf_normalized[i].sum(1)<1:
        j=j+1
print("total", j)

total 2401


In [50]:
# compute the norm of vectors

from numpy import linalg as LA

tf_norm=[]
for i in range(len(train_features_array)):
    norma=LA.norm(train_features_array[i]) #calculates norms of row vectors
    tf_norm.append(norma)
print(len(tf_norm))

7524


In [58]:
# check normalization code

print(train_features_array[1])
print(tf_norm[1])
print(train_features_array[1]/tf_norm[1])
print(LA.norm(train_features_array[1]/tf_norm[1]))

[0 0 0 ..., 0 0 0]
4.12310562562
[ 0.  0.  0. ...,  0.  0.  0.]
1.0


In [60]:
# normalize all vectors

tf_normalized = []
for i in range(len(train_features_array)):
    vector=train_features_array[i] / tf_norm[i]
    tf_normalized.append(vector)

In [65]:
LA.norm(tf_normalized[1000]) #vectors are actually normalized

1.0

In [75]:
np.dot(tf_normalized[10], tf_normalized[100]) # scalar product

0.074535599249992979

In [76]:
distances = 180 * np.arccos(np.clip(np.dot(tf_normalized,tf_normalized),-1.0, 1.0)) / np.pi #calculates the angle
#in degrees between each pair of articles

ValueError: shapes (7524,5000) and (7524,5000) not aligned: 5000 (dim 1) != 7524 (dim 0)

In [55]:

epa_article_distances = pd.DataFrame(distances, index=epa_entries_with_words, columns=epa_entries_with_words)

TypeError: list indices must be integers or slices, not NoneType

In [None]:
# I need to feed the Isomap an array made of a list of lists (with the vectorized values)!

iso = Isomap(n_components=2)
data_projected = iso.fit_transform(train_features_array)

In [None]:
data_projected[:,1]

In [None]:
%matplotlib inline

import matplotlib
import pylab as plt

plt.scatter(data_projected[:, 0], data_projected[:, 1], c="blue",
            edgecolor='none', alpha=0.5);   #, cmap=plt.cm.get_cmap('nipy_spectral', 10)
# plt.colorbar(label='digit label', ticks=range(10))
plt.clim(-0.5, 9.5)
plt.show()

After removing
"Certification_Table_Entry_usages_for"
"language_sources"
"Articles_containing"
I start to see more texture in the clustering.

In [None]:
for i in range(len(data_projected)):
    if data_projected[i,0]<-8 and data_projected[i,1]<-18:
        print(epa_id[i])
        print(data[str(epa_id[i])])