Let's try and see clusters of Categories among the wikipedia articles using the categorylinks.


In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.manifold import Isomap

In [2]:
# Open the file from the repository

with open("cl_epa.txt", "r") as f:
    data=json.load(f)
len(data)

7524

In [3]:
# I need an array of [[1,2,3... ],[0,3,5...], ...] 
# so I prepare a vocabulary to then use the vectorizer

# Create the all-encompassing vocabulary
import re

def GetVocabList(data_dict):
    #data_dict is a dictionary {keys}:["str1", "str2",..., "strN"]
    #this functions yields ["str1", "str2", ... "strM"], where M is the sum of N over all Keys
    list_categories = []
    for key in data_dict:  
        links=[]
        for n in range(len(data_dict[key])):
        
            # Remove the following regular expressions
            
            re0="protected_page"
            #re1="with_accessdate"
            #re2=".rticle"
            #re3=".ikipedia"
            #re4="to_be_expanded"
            #re5="with_unsourced_statements"
            #re6="needing_additional_references"
            #re7="lacking_sources"
            #re8="containing_potentially_dated_statements"
            #re9="with_dead_external_links"
            
            if not any(re.search(regex, data_dict[key][n]) for regex in [re0]):
                links.append(data_dict[key][n])
        links=" ".join(links)
        list_categories.append(links)
                
    return list_categories

# list_categories looks like ["str1 strN", ..., "... str137593"]]. A list strings, each is " ".join(links_in_article)

In [8]:
# Get clean list of links
clean_categories=GetVocabList(data)

print(len(clean_categories))
clean_categories[3200]   #it's a list of strings

7524


"1983_births 21st-century_English_singers All_articles_with_dead_external_links Articles_with_dead_external_links_from_March_2015 Articles_with_hAudio_microformats Articles_with_hCards CS1_Italian-language_sources_(it) CS1_errors:_external_links Commons_category_with_local_link_same_as_on_Wikidata English_mezzo-sopranos English_people_convicted_of_assault English_rhythm_and_blues_singers English_television_personalities EngvarB_from_March_2016 Fascination_Records_artists Footballers'_wives_and_girlfriends Girls_Aloud_members Labour_Party_(UK)_people Living_people Musicians_from_Newcastle_upon_Tyne Official_website_different_in_Wikidata_and_Wikipedia People_convicted_of_assault_occasioning_actual_bodily_harm Popstars_winners Reality_television_judges The_X_Factor_(TV_series)_judges The_X_Factor_(U.S._TV_series) The_X_Factor_(UK_TV_series) Use_dmy_dates_from_March_2016 Wikipedia_articles_with_BNF_identifiers Wikipedia_articles_with_GND_identifiers Wikipedia_articles_with_ISNI_identifiers

In [9]:
# Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool.  

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 
# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_features = vectorizer.fit_transform(clean_categories)

In [11]:
print(type(train_features))
print(train_features.shape)

# Numpy arrays are easy to work with, so convert the result to an 
# array  (instead of a matrix)
train_features_array = train_features.toarray()
print(type(train_features_array))
print(train_features_array.shape)

<class 'scipy.sparse.csr.csr_matrix'>
(7524, 5000)
<class 'numpy.ndarray'>
(7524, 5000)


In [12]:
train_features_array[:2]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [13]:
# I need to feed the Isomap an array made of a list of lists (with the vectorized values)!

iso = Isomap(n_components=2)
data_projected = iso.fit_transform(train_features_array)

In [33]:
data_projected[:,1]

array([ 0.50468546, -0.48353731, -0.71127803, ...,  5.18255488,
       -0.21534603, -0.30888089])

In [41]:
import pylab as plt

plt.scatter(data_projected[:, 0], data_projected[:, 1], c="blue",
            edgecolor='none', alpha=0.5);   #, cmap=plt.cm.get_cmap('nipy_spectral', 10)
# plt.colorbar(label='digit label', ticks=range(10))
plt.clim(-0.5, 9.5)
plt.show()

In [None]:
for i in range(len(data_projected)):
    if data_projected[:1]