In [1]:
%matplotlib inline

import json
import requests
import re
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

stops = set(stopwords.words("english"))

In [2]:
# Both files hold a JSON object keyed by article id; each value is that
# article's list of Wikipedia category names.
with open("cl_epa.txt", "r") as epa_file:
    epa_links = json.loads(epa_file.read())
with open("cl_nepa.txt", "r") as nepa_file:
    nepa_links = json.loads(nepa_file.read())

In [3]:
print(epa_links["5199402"][:3])
print(nepa_links["971243"][:3])

['2000s_American_television_series', '2007_American_television_series_debuts', '2008_American_television_series_endings']
['2001_establishments_in_Ohio', 'Alive_Naturalsound_Records_artists', 'American_indie_rock_groups']


In [4]:
# Substrings that mark Wikipedia maintenance/administrative categories. Any
# category whose name contains one of them is dropped before stemming.
# ("List"/"list" were considered and deliberately left out.)
_STOP_LINK_MARKERS = (
    "protected_page", "with_accessdate", "rticle", "ikipedia", "to_be_expanded",
    "with_unsourced_statements", "needing_additional_references", "lacking_sources",
    "containing_potentially_dated_statements", "with_dead_external_links",
    "Certification_Table_Entry_usages_for", "language_sources", "rticles_containing",
    "page", "CS1", "dmy_dates", "mdy_dates", "Use", "use", "Wiki", "Pages",
)


def GetWordsLinks(data_dict):
    """Turn per-article category names into one string of stemmed words.

    Parameters
    ----------
    data_dict : dict
        Maps an article id to a list of category names,
        e.g. {"971243": ["American_indie_rock_groups", ...]}.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'article_id', with a single column 'links_stems' holding the
        space-joined stems ("stm1 stm2 stm3 ..."). An empty input yields an
        empty frame that still has the 'links_stems' column.

    Relies on the module-level `stemmer` (PorterStemmer) and `stops` (English
    stop words).
    """
    stems_by_article = {}
    for key, categories in data_dict.items():
        kept = []
        for category in categories:
            # The markers contain no regex metacharacters, so a plain substring
            # test matches exactly what re.search did.
            if any(marker in category for marker in _STOP_LINK_MARKERS):
                continue
            # "All_articles_in,_X" -> "All articles in, X" -> ["all", "articles", "in", "x"]
            words = re.sub(r"[^A-Za-z ]", "", category.replace("_", " ")).lower().split()
            # Drop stop words, stem the rest: ["articl", "x"] -> "articl x"
            kept.append(" ".join(stemmer.stem(w) for w in words if w not in stops))
        stems_by_article[key] = " ".join(kept)

    # Build the frame with its column declared up front so that an empty
    # dictionary does not fail on a column-count mismatch.
    df_links = pd.DataFrame(
        {"links_stems": list(stems_by_article.values())},
        index=pd.Index(list(stems_by_article.keys()), name="article_id"),
    )
    return df_links

In [5]:
# One row per article id, with the filtered category names reduced to a single
# string of space-separated stems in the 'links_stems' column.
epa_stem_links = GetWordsLinks(epa_links)
nepa_stem_links = GetWordsLinks(nepa_links)

In [19]:
print(nepa_stem_links["links_stems"]["971243"])
print(nepa_links["971243"])

establish ohio aliv naturalsound record artist american indi rock group american music duo blue rock group brit award winner fat possum record artist grammi award winner music group establish music group akron ohio nonesuch record artist rock music duo suicid squeez record artist v record artist
['2001_establishments_in_Ohio', 'Alive_Naturalsound_Records_artists', 'American_indie_rock_groups', 'American_musical_duos', 'Articles_with_hCards', 'Blues_rock_groups', 'Brit_Award_winners', 'CS1_maint:_Multiple_names:_authors_list', 'Fat_Possum_Records_artists', 'Grammy_Award_winners', 'Musical_groups_established_in_2001', 'Musical_groups_from_Akron,_Ohio', 'Nonesuch_Records_artists', 'Rock_music_duos', 'Suicide_Squeeze_Records_artists', 'Use_mdy_dates_from_June_2013', 'V2_Records_artists', 'Wikipedia_articles_with_BNF_identifiers', 'Wikipedia_articles_with_GND_identifiers', 'Wikipedia_articles_with_ISNI_identifiers', 'Wikipedia_articles_with_LCCN_identifiers', 'Wikipedia_articles_with_MusicB

In [None]:
print(epa_stem_links["links_stems"]["307"])
print(nepa_stem_links["links_stems"]["971243"])

In [20]:
def VectorizeMostCommonFeatures(dataframe, num_features):
    """Count the num_features most common stems in each article.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Output of GetWordsLinks: must have a 'links_stems' column of
        space-delimited stemmed strings.
    num_features : int
        Maximum vocabulary size kept by the vectorizer.

    Returns
    -------
    pandas.DataFrame
        Rows follow dataframe.index, columns are the retained stems, values are
        per-article occurrence counts.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
                                 stop_words=None, max_features=num_features)
    vec_words = vectorizer.fit_transform(dataframe["links_stems"])
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocab = vectorizer.get_feature_names_out()
    else:
        vocab = vectorizer.get_feature_names()
    count_df = pd.DataFrame(vec_words.toarray(), columns=vocab, index=dataframe.index)
    return count_df

In [21]:
# Each set is vectorized separately, so the two count frames have their own
# (up to 5000-word) vocabularies and their columns are not aligned.
epa_link_count = VectorizeMostCommonFeatures(epa_stem_links, 5000)
nepa_link_count = VectorizeMostCommonFeatures(nepa_stem_links, 5000)
nepa_link_count.shape

(10026, 5000)

In [22]:
# Keep only articles with at least one counted word; all-zero rows have no
# direction and could not be normalised later.
has_links = nepa_link_count.sum(axis=1) > 0
nepa_with_links = nepa_link_count[has_links].index         # ids of the articles that have links
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same ndarray.
nepa_vecs_with_links = nepa_link_count[has_links].values   # count matrix for those articles
print(nepa_with_links.shape, nepa_vecs_with_links.shape)

(9897,) (9897, 5000)


In [23]:
# Total occurrences of every vocabulary word across all NEPA articles
# (one value per column of nepa_link_count).
counts = np.sum(nepa_link_count, axis=0)   # how much a word appears
print(counts.shape)
nepa_with_links.shape

(5000,)


(9897,)

In [24]:
def ProcessWordOccurrences(vec_count_df):
    '''Summarise how often each word appears across a set of articles.

    Given a dataframe (rows: articles, cols: word counts) containing the output
    of VectorizeMostCommonFeatures, return a dataframe indexed by word with:

    occurences        total number of occurrences of the word in the dataset
    articles          number of articles with at least one occurrence
    frac_of_articles  fraction of articles with at least one occurrence

    (The 'occurences' spelling is kept because downstream cells use it.)
    '''
    count_by_word = vec_count_df.sum(axis=0)

    # Comparing against the scalar 0 avoids allocating a zero matrix as large
    # as the input just to test for presence.
    present_by_word = (vec_count_df > 0).sum(axis=0)
    present_by_word_frac = present_by_word / vec_count_df.shape[0]

    df = pd.concat([count_by_word, present_by_word, present_by_word_frac], axis=1)
    df.columns = ['occurences','articles','frac_of_articles']
    return df

In [25]:
# Per-word totals, article counts and article fractions, computed separately
# for each set (each frame is indexed by that set's own vocabulary).
epa_occurrences = ProcessWordOccurrences(epa_link_count)
nepa_occurrences = ProcessWordOccurrences(nepa_link_count)

In [26]:
# Outer join on the word index: a word found in only one vocabulary gets NaN
# for the other set, and the '_merge' indicator column records where it came from.
occ_all = epa_occurrences.merge(
    nepa_occurrences,
    how="outer",
    left_index=True,
    right_index=True,
    suffixes=('_epa','_nepa'),
    indicator=True,
)
occ_all.shape

(6687, 7)

In [27]:
occ_all["occurences_epa"][1540:1550]

dachau       NaN
dagestan     5.0
dahl         3.0
dai          4.0
daiei        NaN
daili       11.0
dairi        4.0
dakota      28.0
dal          3.0
dalla       45.0
Name: occurences_epa, dtype: float64

========================================

Let's try clustering the articles by their category links (the cells below run on the NEPA set).

In [28]:
nepa_link_count.columns.values[3200:3220]

array(['offici', 'ohio', 'oil', 'oiler', 'okinawa', 'oklahoma', 'old',
       'oldham', 'olymp', 'olympiaco', 'olympiqu', 'omaha', 'oman', 'one',
       'onlin', 'ono', 'ontario', 'open', 'opendomesday', 'oper'], dtype=object)

In [None]:
# word = "oak"
# anyid = [art for art in epa_link_count[word].index if epa_link_count[word][str(art)] != 0]
# anyid

In [30]:
# Euclidean length of each article's count vector: one value per row.
nepa_norms = np.linalg.norm(nepa_vecs_with_links, axis=1)
# Divide every row by its own length so all article vectors have unit norm;
# the column view of the norms broadcasts across the word axis.
nepa_vecs_normed = np.true_divide(nepa_vecs_with_links, nepa_norms[:, np.newaxis])

In [32]:
nepa_vecs_normed.shape

(9897, 5000)

In [33]:
# Pairwise angle, in degrees, between the unit-length article vectors. The dot
# products are clipped to [-1, 1] so rounding error cannot push arccos outside
# its domain. The result is a dense square matrix with one row and one column
# per linked article, i.e. (9897, 9897) in the recorded run.
distances = 180 * np.arccos(np.clip(np.dot(nepa_vecs_normed,nepa_vecs_normed.T),-1.0, 1.0)) / np.pi
distances.shape

(9897, 9897)

In [34]:
# Same angle matrix, labelled on both axes with the (string) article ids so
# that a pair of articles can be looked up by id.
nepa_article_distances = pd.DataFrame(distances, index=nepa_with_links, columns=nepa_with_links) # indexes added
nepa_article_distances.shape

(9897, 9897)

In [35]:
def GetMostSimilar(article_id, numrecords=10, article_distances=None):
    '''Return the articles closest (i.e., lowest angle) to a given article.

    article_id         id of the query article; converted to str for the lookup
    numrecords         number of neighbours wanted; the result has numrecords + 1
                       rows because the query article itself is included at angle 0
    article_distances  square DataFrame of pairwise angles labelled by article id
                       on both axes; defaults to the global nepa_article_distances

    Returns a Series of angles sorted ascending and indexed by article id.
    Raises KeyError if the article is not a column of the distance matrix.
    '''
    if article_distances is None:
        article_distances = nepa_article_distances
    return article_distances[str(article_id)].sort_values().iloc[:numrecords + 1]

GetMostSimilar(971243, 10)  #971243 for  a nepa

article_id
971243       0.000000
193328      36.679215
2246167     36.722060
3316009     41.407289
579441      41.407289
2003582     42.186865
392096      42.274320
453107      43.132955
6599787     43.210787
28337780    43.671003
163883      43.718500
Name: 971243, dtype: float64

In [None]:
#distance scaling function, engineered to increase slowly up to 70 and rapidly thereafter

x_plot = np.arange(0,90,0.5)
# The curve has a pole at x = 85; silence the expected divide-by-zero warning there.
with np.errstate(divide="ignore"):
    y_plot = (300 + x_plot) / (85 - x_plot)
plt.plot(x_plot, y_plot)
plt.xlabel("angle between articles (degrees)")
plt.ylabel("scaled distance")
# plt.xlim((0,70))
plt.ylim((0,50))
plt.show()   # the call parentheses were missing, so the figure was never explicitly shown

In [None]:
# Apply the scaling curve plotted above to every pairwise angle.
# For angles above 85 the denominator is negative, so the ratio is negative;
# an angle of exactly 85 divides by zero and yields +inf, which the mask below
# does not touch.
scaled_distances = (300 + distances) / (85 - distances)
scaled_distances[scaled_distances <= 0] = 400 #sets negative values (i.e., angles above 85) to an arbitrarily large distance
scaled_distances.shape

In [None]:
scaled_distances.shape

In [56]:
from sklearn.cluster import DBSCAN

# NOTE: this fit uses the raw angular `distances` (degrees), not
# `scaled_distances`, so eps=45 is an angle in degrees here.
link_DBSCAN = DBSCAN(eps=45, min_samples=40, metric='precomputed')   # the input is a precomputed pairwise distance matrix
link_labels = link_DBSCAN.fit_predict(distances)

In [57]:
# Number of distinct labels, not counting -1 (the label given to noise points).
n_clusters = len(set(link_labels)) - (1 if -1 in link_labels else 0)
n_clusters

16

In [58]:
print(set(link_labels))
print(link_labels.shape)

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1}
(9897,)


In [59]:
print(nepa_with_links)
print(nepa_vecs_normed)  # Now I just need to find a way to visualize this: merging the link_labels with epa_vecs_normed 
# or maybe it's better to visualize the not normed????

Index(['47159030', '42880738', '2616911', '12880376', '8916716', '42508813',
       '309620', '8680627', '720558', '9598063',
       ...
       '272874', '15290967', '41960962', '23140164', '20789915', '32190061',
       '2061626', '39915844', '39803111', '1260189'],
      dtype='object', name='article_id', length=9897)
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [60]:
def CharacterizeClassPop(topic_labels):
    '''Summarise the cluster populations in an array of DBSCAN labels.

    Given an array of class labels (the output from DBSCAN.fit_predict()),
    determine how many non-noise clusters (i.e., any label besides -1) were
    assigned, what fraction of articles were assigned to a non-noise cluster,
    and the average, median, and max size of the clusters.

    Returns a dict with keys 'numClusters', 'coverage', 'avgClusterSize',
    'medianClusterSize' and 'maxClusterSize'. When no article belongs to a
    cluster (all noise, or empty input) every value is 0.
    '''
    labels, sizes = np.unique(topic_labels, return_counts=True)
    # Select clusters by label instead of dropping the first row: when there is
    # no noise, the first row is a real cluster and must be counted.
    clusterSizes = sizes[labels != -1]
    numClust = clusterSizes.shape[0]

    if numClust == 0:
        # Nothing to average; avoid dividing by zero and max() of an empty array.
        return {'numClusters':0,
               'coverage':0.0,
               'avgClusterSize':0.0,
               'medianClusterSize':0.0,
               'maxClusterSize':0}

    inClusters = np.sum(clusterSizes)
    return {'numClusters':numClust,
           'coverage':inClusters / np.sum(sizes),
           'avgClusterSize':inClusters / numClust,
           'medianClusterSize':np.median(clusterSizes),
           'maxClusterSize':np.max(clusterSizes)}

In [61]:
link_labels[:3]

array([-1, -1, -1], dtype=int64)

In [62]:
# One DBSCAN label per linked article, indexed by article id (-1 marks noise).
# (A bare `set(link_labels)` used to sit here; its value was discarded.)
id_topic = pd.DataFrame(link_labels, index=nepa_with_links, columns=['DB_class'])
print(id_topic[:3])
id_topic.shape

            DB_class
article_id          
47159030          -1
42880738          -1
2616911           -1


(9897, 1)

In [63]:
# Attach each article's cluster label to its word counts. The inner join on the
# index keeps only the articles that received a label, i.e. those with links.
vec_words_by_class = pd.merge(nepa_link_count, id_topic, how='inner', left_index=True, right_index=True)

In [64]:
print(nepa_link_count.shape)
print(vec_words_by_class.shape)
# WARNING: this rewrites id_topic's index in place from str to int. After this
# line id_topic no longer shares index values with nepa_link_count (whose ids
# are strings), so the merge in the previous cell must not be re-run without
# rebuilding id_topic first.
id_topic.index = id_topic.index.map(int)
print(id_topic.index)

(10026, 5000)
(9897, 5001)
Int64Index([47159030, 42880738,  2616911, 12880376,  8916716, 42508813,
              309620,  8680627,   720558,  9598063,
            ...
              272874, 15290967, 41960962, 23140164, 20789915, 32190061,
             2061626, 39915844, 39803111,  1260189],
           dtype='int64', length=9897)


In [65]:
# Two-column table: cluster label, number of articles carrying that label.
classCount = np.array(np.unique(link_labels, return_counts=True)).T
classCount

array([[  -1, 8040],
       [   0,  207],
       [   1,   62],
       [   2,   81],
       [   3,  390],
       [   4,   96],
       [   5,  218],
       [   6,  194],
       [   7,  104],
       [   8,  119],
       [   9,   41],
       [  10,   56],
       [  11,   83],
       [  12,   60],
       [  13,   54],
       [  14,   51],
       [  15,   41]], dtype=int64)

In [None]:
epa_articles = pd.read_csv("070916_edit_protected_articles.csv")

In [None]:
# Title lookup table indexed by page_id. set_index builds it in one step
# instead of assigning to and deleting from a slice of epa_articles, which is
# the pattern behind pandas' SettingWithCopyWarning.
epa_titles = epa_articles[['page_id','page_title']].set_index('page_id')
epa_titles[:10]

In [None]:
epa_titles.index

In [None]:
# Join page titles to cluster labels on the page id.
# NOTE(review): id_topic is built from nepa_with_links (NEPA ids) while
# epa_titles comes from the EPA csv, so this inner join presumably matches few
# or no rows - confirm which set is intended before relying on the result.
titles_by_class = pd.merge(epa_titles, id_topic, how='inner', left_index=True, right_index=True)
titles_by_class[:10]

In [90]:
def InfoAboutEachClass(topic_labels):
    '''Describe every DBSCAN class by its size and its most widespread words.

    Given an array of class labels (the output from DBSCAN.fit_predict(), one
    label per entry of the global nepa_with_links), return a nested dict
    {label: {'class_members': int, 'top_words': {word: fraction}}}. 'top_words'
    holds the 10 words present in the largest fraction of the class's articles.
    The noise class (-1) is included when it occurs.

    Representative titles ('rep_titles') are currently not produced: the title
    lookup is disabled below because it only exists for the EPA set.
    '''
    id_topic = pd.DataFrame(topic_labels, index=nepa_with_links, columns=['DB_class'])
    vec_words_by_class = pd.merge(nepa_link_count, id_topic,
                                 how='inner', left_index=True, right_index=True)

    # Title lookup, disabled for NEPAs. To re-enable, restore these lines:
    #id_topic.index = id_topic.index.map(int)
    #titles_by_class = pd.merge(epa_titles, id_topic,
    #                           how='inner', left_index=True, right_index=True)

    classInfo = dict()
    for k in np.unique(topic_labels).tolist():            # every label, noise (-1) included
        classInfo[k] = dict()
        in_class = vec_words_by_class['DB_class'] == k
        # Drop the label column before counting: otherwise 'DB_class' is treated
        # as a word and is "present" in every article of any class with k > 0.
        k_words = vec_words_by_class[in_class].drop('DB_class', axis=1)   # word counts for class k
        classMem = k_words.shape[0]                                        # number of articles in class k
        classInfo[k]['class_members'] = classMem
        present_by_word = (k_words > 0).sum(axis=0)                        # articles containing each word
        present_by_word_frac = present_by_word / classMem                  # fraction of the class's articles
        classInfo[k]['top_words'] = present_by_word_frac.sort_values(ascending=False)[:10].to_dict()

        # Representative titles, disabled for NEPAs. To re-enable, restore:
        #sampSize = min(classMem, 10)
        #k_titles = titles_by_class[titles_by_class['DB_class'] == k].sample(n=sampSize)
        #classInfo[k]['rep_titles'] = k_titles['page_title'].tolist()
    return classInfo

In [91]:
result = InfoAboutEachClass(link_labels)

In [92]:
result[0]["class_members"]

207

In [104]:
# Show the top words of one class, most widespread first (-1 is the noise class).
cluster = -1
top_words = result[cluster]["top_words"]
for word, fraction in sorted(top_words.items(), key=lambda item: item[1], reverse=True):
    print("{}: {:.3f}".format(word, fraction))

birth: 0.235
peopl: 0.226
state: 0.132
unit: 0.132
live: 0.128
death: 0.117
stub: 0.114
establish: 0.113
american: 0.111
counti: 0.091


In [None]:
def PrintInfoAboutEachClass(topic_labels):
    '''Format the output of InfoAboutEachClass() as a printable report.

    Given an array of class labels (the output from DBSCAN.fit_predict()), run
    InfoAboutEachClass() and return one formatted string with, for every class
    in label order: its size, its representative article titles (if any were
    produced) and its most common terms with their article fractions.
    '''

    printInfo = InfoAboutEachClass(topic_labels)
    separator = "========================\n"
    parts = []                       # joined once at the end instead of repeated +=
    for key in sorted(printInfo):
        info = printInfo[key]
        parts.append(separator)
        parts.append("CLUSTER {}: {} MEMBERS\n\n".format(key, info['class_members']))

        parts.append("Representative articles:\n")
        # 'rep_titles' is absent when the title lookup is disabled (NEPA runs);
        # treat that as "no titles" rather than failing with KeyError.
        for article in info.get('rep_titles', []):
            parts.append(article)
            parts.append("\n")
        parts.append("\nMost common terms:\n")
        top_words = info['top_words']
        for word in sorted(top_words, key=top_words.get, reverse=True):
            parts.append("{}: {:.3f}\n".format(word, top_words[word]))
        parts.append(separator)
    return "".join(parts)

In [None]:
def RunAndAnalyzeDBSCAN(eps, min_samples, scaled_distances):
    '''Cluster with DBSCAN for one parameter pair and summarise the outcome.

    Given a value for epsilon, the minimum number of samples required for a
    core point, and a matrix of precomputed distances, create a DBSCAN
    classifier, fit and predict on the supplied distances, and analyze the
    output using CharacterizeClassPop and PrintInfoAboutEachClass.

    Side effect: when more than one distinct label is found, the full text
    report is written to candidate_clusters/<eps>_<min_samples>_clustReport.txt
    (the directory is created if needed).

    Returns a csv formatted string containing epsilon, min_samples,
    numClusters, coverage, avgClusterSize, medianClusterSize, and
    maxClusterSize. When only one distinct label comes back, the five metrics
    are all reported as 0.
    '''
    import os
    from sklearn.cluster import DBSCAN
    clust = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
    topic_labels = clust.fit_predict(scaled_distances)
    numLabels = np.unique(topic_labels).shape[0]   # distinct labels, noise included
    if numLabels > 1:
        cp = CharacterizeClassPop(topic_labels)
        report = PrintInfoAboutEachClass(topic_labels)

        # open() does not create missing directories; make sure the target exists.
        reportDir = "candidate_clusters"
        os.makedirs(reportDir, exist_ok=True)
        reportFN = "{}/{}_{}_clustReport.txt".format(reportDir, eps, min_samples)
        with open(reportFN, "w") as f:
            f.write(report)

        return "{},{},{},{:.3f},{:.0f},{:.0f},{}".format(eps,min_samples,cp['numClusters'],cp['coverage'],cp['avgClusterSize'],
                                        cp['medianClusterSize'],cp['maxClusterSize'])
    else:
        return "{},{},0,0,0,0,0".format(eps,min_samples)

In [None]:
# Grid search over DBSCAN parameters on the scaled distances: one csv row of
# cluster metrics is written per (epsilon, min_samples) combination.
candidate_eps = range(2,13)
candidate_samples = [2,5,10,20,50,100]
metrics_header = "epsilon,min_samples,num_clusters,coverage,avg_cluster_size,median_cluster_size,max_cluster_size\n"
with open("DBSCAN_metrics.txt","w") as d:
    d.write(metrics_header)
    for eps in candidate_eps:
        for samples in candidate_samples:
            d.write(RunAndAnalyzeDBSCAN(eps, samples, scaled_distances) + "\n")

ok .. I'm lost!

In [None]:
print(PrintInfoAboutEachClass(link_labels))

In [None]:
analisi=PrintInfoAboutEachClass(link_labels)

In [None]:
# Save the report for the unscaled-distance clustering.
# NOTE(review): the file name says eps40, but the DBSCAN fit that produced
# link_labels above used eps=45 - confirm which value this report belongs to.
with open("cl_nepa_noScale_eps40_min40.txt", "w") as f:
    f.write(analisi)