In [1]:
%matplotlib inline

import json
import requests
import re
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

stops = set(stopwords.words("english"))

In [2]:
with open("cl_epa.txt", "r") as f:
    epa_links = json.load(f)
with open("cl_nepa.txt", "r") as f:
    nepa_links = json.load(f)

In [3]:
print(epa_links["5199402"][:3])
print(nepa_links["971243"][:3])

['2000s_American_television_series', '2007_American_television_series_debuts', '2008_American_television_series_endings']
['2001_establishments_in_Ohio', 'Alive_Naturalsound_Records_artists', 'American_indie_rock_groups']


In [4]:
def GetWordsLinks(data_dict):
    """Turn each article's category links into one string of stemmed words.

    Parameters
    ----------
    data_dict : dict
        Maps an article id to a list of category-link strings, e.g.
        ``{"971243": ["American_indie_rock_groups", ...]}``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``article_id`` with a single ``links_stems`` column that
        holds the space-separated stems of every link that was kept.
        An empty ``data_dict`` yields an empty frame with that same column.

    Notes
    -----
    Uses the notebook-level ``stemmer`` and ``stops`` objects.
    """
    # Fragments that flag a category link as not meaningful for this analysis.
    # A link is dropped when ANY fragment occurs anywhere in it (case-sensitive),
    # so e.g. "rticle" removes both "Article..." and "...articles..." links.
    stop_links = ("protected_page", "with_accessdate", "rticle", "ikipedia", "to_be_expanded",
                  "with_unsourced_statements", "needing_additional_references", "lacking_sources",
                  "containing_potentially_dated_statements", "with_dead_external_links",
                  "Certification_Table_Entry_usages_for", "language_sources", "rticles_containing",
                  "page", "CS1", "mdy_dates", "Wiki", "Pages", "List", "list")
    # Built once per call; a single alternation matches exactly when at least
    # one of the individual fragments would.
    stop_pattern = re.compile("|".join(stop_links))

    list_categories = dict()
    for key, raw_links in data_dict.items():
        links = []
        for raw_link in raw_links:
            if stop_pattern.search(raw_link):
                continue
            words = raw_link.replace("_", " ")                         # "American_indie_rock" -> "American indie rock"
            words = re.sub(r"[^A-Za-z ]", "", words).lower().split()   # letters only, lower-cased tokens
            links.append(" ".join(stemmer.stem(w) for w in words if w not in stops))
        list_categories[key] = " ".join(links)

    # Name the column and index up front so that an empty input still produces
    # a well-formed (empty) frame instead of failing on a column rename.
    df_links = pd.DataFrame(
        {"links_stems": list(list_categories.values())},
        index=pd.Index(list(list_categories.keys()), name="article_id"),
    )
    return df_links

In [5]:
epa_stem_links = GetWordsLinks(epa_links)
nepa_stem_links = GetWordsLinks(nepa_links)

In [47]:
print(epa_stem_links["links_stems"]["713342"])
print(epa_links["713342"])

fiction american peopl english descent fiction adopte fiction charact introduc fiction child sexual abus victim fiction murder fiction newspap publish fiction rapist fiction report fiction twin gener hospit charact one life live charact
['Featured_articles', 'Fictional_American_people_of_English_descent', 'Fictional_adoptees', 'Fictional_characters_introduced_in_1992', 'Fictional_child_sexual_abuse_victims', 'Fictional_murderers', 'Fictional_newspaper_publishers', 'Fictional_rapists', 'Fictional_reporters', 'Fictional_twins', 'General_Hospital_characters', 'One_Life_to_Live_characters', 'Wikipedia_pages_semi-protected_from_banned_users']


In [6]:
print(epa_stem_links["links_stems"]["307"])
print(nepa_stem_links["links_stems"]["971243"])

birth death thcenturi american politician thcenturi christian ac element abraham lincoln american classic liber american peopl english descent american postmast assassin presid unit state assassin head state burial oak ridg cemeteri death firearm washington dc hall fame great american inducte illinoi republican illinoi whig illinoi lawyer lincoln famili member illinoi hous repres member unit state hous repres illinoi peopl cole counti illinoi peopl laru counti kentucki peopl macon counti illinoi peopl spencer counti indiana peopl murder washington dc peopl illinoi american civil war peopl mood disord polit parti founder politician springfield illinoi presid unit state republican parti unit state presidenti nomine republican parti presid unit state smallpox survivor union polit leader unit state presidenti candid unit state presidenti candid use mdi date januari whig parti member unit state hous repres
establish ohio aliv naturalsound record artist american indi rock group american musi

In [None]:
def VectorizeMostCommonFeatures(dataframe, num_features):
    """Count, per article, the occurrences of the most common stems.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must have a ``links_stems`` column of space-delimited stemmed strings
        (the format returned by GetWordsLinks).
    num_features : int
        Passed to ``CountVectorizer(max_features=...)`` to cap the vocabulary size.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``dataframe`` (same index), one column per
        vocabulary word, values are the word counts.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
                                 stop_words=None, max_features=num_features)
    vec_words = vectorizer.fit_transform(dataframe["links_stems"])

    # get_feature_names() no longer exists in current scikit-learn releases;
    # prefer its replacement and fall back only on old installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocab = vectorizer.get_feature_names_out()
    else:
        vocab = vectorizer.get_feature_names()

    # toarray() densifies the sparse count matrix so it can back a DataFrame.
    return pd.DataFrame(vec_words.toarray(), columns=vocab, index=dataframe.index)

In [None]:
epa_link_count = VectorizeMostCommonFeatures(epa_stem_links, 5000)
nepa_link_count = VectorizeMostCommonFeatures(nepa_stem_links, 5000)
epa_link_count.shape

In [None]:
# Keep only the articles that have at least one counted word (7237 of 7524).
has_links = epa_link_count.sum(axis=1) > 0
epa_linked = epa_link_count[has_links]
epa_with_links = epa_linked.index        # ids of the articles with links
epa_vecs_with_links = epa_linked.values  # count matrix of those articles (.as_matrix() is gone from current pandas)
print(epa_with_links.shape, epa_vecs_with_links.shape)

In [None]:
counts = np.sum(epa_link_count, axis=0)   # how much a word appears
print(counts.shape)
epa_with_links.shape

In [None]:
def ProcessWordOccurrences(vec_count_df):
    '''Summarise how often each word occurs across the dataset.

    vec_count_df: dataframe (rows: articles, cols: word counts) as produced by
    VectorizeMostCommonFeatures.

    Returns a dataframe indexed by word with three columns:
      occurences        total number of occurrences of the word
      articles          number of articles with at least one occurrence
      frac_of_articles  articles / total number of articles
    (The column name 'occurences' is spelled this way on purpose: later cells
    look it up under that exact name.)
    '''
    count_by_word = vec_count_df.sum(axis=0)

    # Comparing against the scalar 0 avoids building a second dense matrix
    # of the same shape just to hold zeros.
    present_by_word = (vec_count_df > 0).sum(axis=0)
    present_by_word_frac = present_by_word / vec_count_df.shape[0]

    return pd.concat([count_by_word, present_by_word, present_by_word_frac],
                     axis=1,
                     keys=['occurences', 'articles', 'frac_of_articles'])

In [None]:
epa_occurrences = ProcessWordOccurrences(epa_link_count)
nepa_occurrences = ProcessWordOccurrences(nepa_link_count)

In [None]:
occ_all = pd.merge(epa_occurrences, nepa_occurrences, how="outer", left_index=True, right_index=True, suffixes=('_epa','_nepa'), indicator=True)
occ_all.shape

In [None]:
occ_all["occurences_epa"][1540:1550]

========================================

Let's try clustering the EPAs (edit-protected articles).

In [None]:
epa_link_count.columns.values[3200:3220]

In [None]:
# word = "oak"
# anyid = [art for art in epa_link_count[word].index if epa_link_count[word][str(art)] != 0]
# anyid

In [None]:
epa_norms = np.linalg.norm(epa_vecs_with_links, axis=1) #calculates norms of row vectors  (7237,)
epa_vecs_normed = epa_vecs_with_links / epa_norms[:,None] #unitizes row vectors           (7237,5000)

In [None]:
epa_vecs_normed.shape

In [None]:
distances = 180 * np.arccos(np.clip(np.dot(epa_vecs_normed,epa_vecs_normed.T),-1.0, 1.0)) / np.pi #calculates the angle
#in degrees between each pair of articles      (7237,7237)
distances.shape

In [None]:
epa_article_distances = pd.DataFrame(distances, index=epa_with_links, columns=epa_with_links) # indexes added
epa_article_distances.shape

In [None]:
def GetMostSimilar(article_id, numrecords=10):
    '''Look up the articles closest (smallest angle) to the given article.

    Reads the article's column of epa_article_distances, sorts it ascending and
    returns the first numrecords + 1 entries as a pandas Series
    (index: article id, value: angle).
    The extra entry presumably accounts for the article itself, which should
    sort first with an angle of ~0 -- TODO confirm.
    '''
    angles = epa_article_distances[str(article_id)]
    ranked = angles.sort_values()
    keep = numrecords + 1
    return ranked[:keep]

GetMostSimilar(25, 10)  #971243 for  a nepa

In [None]:
# Distance scaling function, engineered to increase slowly up to 70 and rapidly thereafter.
# It has a pole at 85, which x_plot hits exactly; the resulting inf is expected,
# so the divide-by-zero warning is silenced for this one expression.
x_plot = np.arange(0,90,0.5)
with np.errstate(divide="ignore"):
    y_plot = (300 + x_plot) / (85 - x_plot)
plt.plot(x_plot, y_plot)
plt.xlabel("angle between articles (degrees)")
plt.ylabel("scaled distance")
# plt.xlim((0,70))
plt.ylim((0,50))
plt.show()

In [None]:
# Angles of 85 degrees or more get an arbitrarily large distance (400).
# The mask is taken on the angle itself rather than on the sign of the scaled
# value, so that an angle of exactly 85 (division by zero -> inf) is capped too.
too_far = distances >= 85
scaled_distances = np.full(distances.shape, 400.0)
scaled_distances[~too_far] = (300 + distances[~too_far]) / (85 - distances[~too_far])
scaled_distances.shape

In [None]:
scaled_distances.shape

In [None]:
from sklearn.cluster import DBSCAN

# NOTE: the matrix fitted below is the UNSCALED angular `distances`, not
# `scaled_distances`; eps=40 is therefore an angle threshold on that matrix.
link_DBSCAN = DBSCAN(eps=40, min_samples=40, metric='precomputed')   # metric='precomputed': fit_predict receives a distance matrix
link_labels = link_DBSCAN.fit_predict(distances)

In [None]:
n_clusters = len(set(link_labels)) - (1 if -1 in link_labels else 0)
n_clusters

In [None]:
print(set(link_labels))
print(link_labels.shape)

In [None]:
print(epa_with_links)
print(epa_vecs_normed)  # Now I just need to find a way to visualize this: merging the link_labels with epa_vecs_normed 
# or maybe it's better to visualize the not normed????

In [None]:
def CharacterizeClassPop(topic_labels):
    '''Summarise the cluster populations in an array of DBSCAN labels.

    topic_labels: array of class labels (the output of DBSCAN.fit_predict());
    the label -1 marks noise and is never counted as a cluster.

    Returns a dict with
      numClusters        number of non-noise clusters
      coverage           fraction of all articles assigned to a non-noise cluster
      avgClusterSize     mean cluster population
      medianClusterSize  median cluster population
      maxClusterSize     largest cluster population
    When there is no non-noise cluster (or no labels at all) every value is 0.
    '''
    labels, counts = np.unique(np.asarray(topic_labels), return_counts=True)

    # Select clusters by label instead of dropping the first row: the noise
    # label is only first when noise is actually present, and skipping row 0
    # unconditionally would discard a real cluster otherwise.
    cluster_sizes = counts[labels != -1]
    numClust = int(cluster_sizes.size)

    if numClust == 0:
        return {'numClusters': 0,
                'coverage': 0.0,
                'avgClusterSize': 0.0,
                'medianClusterSize': 0.0,
                'maxClusterSize': 0}

    in_clusters = int(cluster_sizes.sum())
    total = int(counts.sum())

    return {'numClusters': numClust,
            'coverage': in_clusters / total,
            'avgClusterSize': in_clusters / numClust,
            'medianClusterSize': float(np.median(cluster_sizes)),
            'maxClusterSize': int(cluster_sizes.max())}

In [None]:
link_labels[:3]

In [None]:
set(link_labels)
id_topic = pd.DataFrame(link_labels, index=epa_with_links, columns=['DB_class'])
print(id_topic[:3])
id_topic.shape

In [None]:
vec_words_by_class = pd.merge(epa_link_count, id_topic, how='inner', left_index=True, right_index=True)

In [None]:
print(epa_link_count.shape)
print(vec_words_by_class.shape)
id_topic.index = id_topic.index.map(int)
print(id_topic.index)

In [None]:
classCount = np.array(np.unique(link_labels, return_counts=True)).T
classCount

In [None]:
epa_articles = pd.read_csv("070916_edit_protected_articles.csv")

In [None]:
# Title lookup table: page_id becomes the index, page_title is the only column left.
epa_titles = epa_articles.set_index('page_id')[['page_title']]
epa_titles[:10]

In [None]:
epa_titles.index

In [None]:
titles_by_class = pd.merge(epa_titles, id_topic, how='inner', left_index=True, right_index=True)
titles_by_class[:10]

In [None]:
def InfoAboutEachClass(topic_labels, random_state=None):
    '''Describe every non-noise cluster in an array of DBSCAN labels.

    topic_labels: array of class labels (the output of DBSCAN.fit_predict()),
        aligned with epa_with_links; the label -1 (noise) is skipped.
    random_state: optional seed forwarded to DataFrame.sample() so that the
        representative titles are reproducible; None keeps them random.

    Returns a nested dict {label: {...}} where each inner dict holds
      class_members  number of articles in the cluster
      top_words      up to 20 words mapped to the fraction of the cluster's
                     articles that contain them, highest fractions first
      rep_titles     up to 10 randomly sampled page titles from the cluster

    Reads the notebook-level epa_with_links, epa_link_count and epa_titles.
    '''
    id_topic = pd.DataFrame(topic_labels, index=epa_with_links, columns=['DB_class'])
    vec_words_by_class = pd.merge(epa_link_count, id_topic,
                                  how='inner', left_index=True, right_index=True)

    # The index is converted to int on a copy for the title merge
    # (presumably epa_titles is keyed by integer page_id -- confirm).
    id_topic_int = id_topic.copy()
    id_topic_int.index = id_topic_int.index.map(int)
    titles_by_class = pd.merge(epa_titles, id_topic_int,                        # comment this away for nepas
                               how='inner', left_index=True, right_index=True)

    classInfo = dict()
    # Filter the noise label explicitly; it is not guaranteed to be present,
    # so slicing off the first unique label could drop a real cluster.
    for k in [label for label in np.unique(topic_labels).tolist() if label != -1]:
        classInfo[k] = dict()
        in_class = vec_words_by_class['DB_class'] == k
        # Drop the label column so that it is not ranked as if it were a word.
        k_words = vec_words_by_class[in_class].drop('DB_class', axis=1)
        classMem = k_words.shape[0]
        classInfo[k]['class_members'] = classMem

        present_by_word_frac = (k_words > 0).sum(axis=0) / classMem
        classInfo[k]['top_words'] = present_by_word_frac.sort_values(ascending=False)[:20].to_dict()

        # Size the sample from the titles that actually survived the inner
        # merge: asking for more rows than exist makes sample() raise.
        k_titles = titles_by_class[titles_by_class['DB_class'] == k]
        sampSize = min(len(k_titles), 10)
        k_titles = k_titles.sample(n=sampSize, random_state=random_state)
        classInfo[k]['rep_titles'] = k_titles['page_title'].tolist()            # comment these away on nepas
    return classInfo

In [None]:
result = InfoAboutEachClass(link_labels)

In [None]:
result[8]["class_members"]

In [None]:
result[8]["top_words"]

In [None]:
def PrintInfoAboutEachClass(topic_labels):
    '''Build a printable report from the output of InfoAboutEachClass().

    topic_labels: array of class labels (the output of DBSCAN.fit_predict()).

    Returns one string with a section per cluster, in ascending label order:
    a header with the member count, the representative article titles, and
    the top words with their fractions ordered from highest to lowest.
    '''
    divider = "========================\n"
    class_info = InfoAboutEachClass(topic_labels)

    pieces = []
    for label in sorted(class_info):
        info = class_info[label]
        pieces.append(divider)
        pieces.append("CLUSTER {}: {} MEMBERS\n\n".format(label, info['class_members']))

        pieces.append("Representative articles:\n")
        pieces.extend(title + "\n" for title in info['rep_titles'])

        pieces.append("\nMost common terms:\n")
        ranked = sorted(info['top_words'].items(), key=lambda pair: pair[1], reverse=True)
        pieces.extend("{}: {:.3f}\n".format(word, frac) for word, frac in ranked)
        pieces.append(divider)
    return "".join(pieces)

In [None]:
def RunAndAnalyzeDBSCAN(eps, min_samples, scaled_distances):
    '''Run DBSCAN once on a precomputed distance matrix and summarise the result.

    eps: DBSCAN epsilon.
    min_samples: minimum number of samples required for a core point.
    scaled_distances: square matrix of precomputed pairwise distances.

    When the fit yields more than one distinct label, the clusters are analysed
    with CharacterizeClassPop and PrintInfoAboutEachClass, and the text report is
    written to candidate_clusters/<eps>_<min_samples>_clustReport.txt (the
    directory is created if it is missing).

    Returns a csv formatted string containing epsilon, min_samples, numClusters,
    coverage, avgClusterSize, medianClusterSize, and maxClusterSize; the last
    five are all 0 when the fit produced a single distinct label.
    '''
    import os
    from sklearn.cluster import DBSCAN

    clust = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
    topic_labels = clust.fit_predict(scaled_distances)

    # NOTE: this counts distinct labels, noise (-1) included. A fit that puts
    # every point into one cluster with no noise therefore also has a single
    # label and is reported as zero clusters.
    numLabels = np.unique(topic_labels).shape[0]
    if numLabels <= 1:
        return "{},{},0,0,0,0,0".format(eps,min_samples)

    cp = CharacterizeClassPop(topic_labels)
    report = PrintInfoAboutEachClass(topic_labels)

    # open(..., "w") does not create missing directories.
    os.makedirs("candidate_clusters", exist_ok=True)
    reportFN = "candidate_clusters/{}_{}_clustReport.txt".format(eps, min_samples)
    with open(reportFN, "w") as f:
        f.write(report)

    return "{},{},{},{:.3f},{:.0f},{:.0f},{}".format(eps,min_samples,cp['numClusters'],cp['coverage'],cp['avgClusterSize'],
                                    cp['medianClusterSize'],cp['maxClusterSize'])

In [None]:
# Grid search over DBSCAN settings: every (eps, min_samples) combination below is
# run on scaled_distances via RunAndAnalyzeDBSCAN, and the csv line it returns is
# written to DBSCAN_metrics.txt, one row per combination.
# eps takes the integer values 2 through 12.
candidate_eps = range(2,13)
candidate_samples = [2,5,10,20,50,100]
with open("DBSCAN_metrics.txt","w") as d:
    # Header row; its columns match the fields of the string returned by RunAndAnalyzeDBSCAN.
    d.write("epsilon,min_samples,num_clusters,coverage,avg_cluster_size,median_cluster_size,max_cluster_size\n")
    for eps in candidate_eps:
        for samples in candidate_samples:
            output = RunAndAnalyzeDBSCAN(eps, samples, scaled_distances)
            d.write(output)
            d.write("\n")

ok .. I'm lost!

In [None]:
print(PrintInfoAboutEachClass(link_labels))

In [None]:
analisi=PrintInfoAboutEachClass(link_labels)

In [None]:
with open("cl_nepa_noScale_eps40_min40.txt", "w") as f:
    f.write(analisi)