In [1]:
import numpy as np
import pandas as pd
import nltk
from bs4 import BeautifulSoup
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
from newspaper import Article


In [2]:
article = pd.read_csv('sample_data.csv')
article.columns = ['idx','tags','text','genre','cluster']
article = article.head(100)
article = article.dropna()
article.head(5)

Unnamed: 0,idx,tags,text,genre,cluster
0,19,d77e674abd042be04a8e644d023764d9,A man jumped to his death from a high-rise con...,Kill by physical assault,3.0
1,44,243b0d67a8bd945bd994b0d6607c2729,"A former Minister of Aviation, Chief Osita Chi...","Threaten, not specified below",3.0
2,45,2de077cba1272d4581f86a1a1cda4d10,North Korean leader Kim Jong Un said the world...,"Threaten, not specified below",3.0
3,95,8e9e70ecd9192803779bca2f85a1ea02,Photo by RobShotsNiagara Falls police and the ...,"Threaten, not specified below",3.0
4,106,c8011ead21923ba8c5b836a9ae1916c5,"DECATUR, Ala – It’s a part of everyday life th...","Threaten, not specified below",3.0


In [3]:
#import three lists: titles, links and wikipedia synopses
titles = article.tags.tolist()

synopses = article.text.tolist()
    
genres = article.genre.tolist()
# print (genres)
print(str(len(titles)) + ' titles')
print(str(len(synopses)) + ' synopses')
print(str(len(genres)) + ' genres')
# synopses[0]

90 titles
90 synopses
90 genres


In [4]:
# generates index for each item in the corpora (in this case it's just rank) and I'll use this for scoring later
ranks = []

for i in range(0,len(titles)):
    ranks.append(i)

# ranks[0]

In [5]:
# load nltk's English stopwords as variable called 'stopwords'
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Faraz\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [28]:
# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
# class algorithm for stemming

In [29]:
# here I define a tokenizer and stemmer which returns the set of stems in the text that it is passed

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [30]:
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in synopses:
    allwords_stemmed = tokenize_and_stem(i)
    totalvocab_stemmed.extend(allwords_stemmed)
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

# totalvocab_stemmed[0]
# totalvocab_tokenized[0]

In [31]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
# vocab_frame

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=20000,
                                 min_df=0.1, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))



In [33]:
tfidf_matrix = tfidf_vectorizer.fit_transform(synopses)

print(tfidf_matrix.shape)

  'stop_words.' % sorted(inconsistent))


(90, 415)


In [34]:
terms = tfidf_vectorizer.get_feature_names()
# terms

In [35]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [36]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

km.fit(tfidf_matrix)

clusters = km.labels_.tolist()
len(clusters)

90

In [37]:
print(len(titles))
print(len(ranks))
print(len(synopses))
print(len(clusters))
print(len(genres))

90
90
90
90
90


In [38]:
import pandas as pd

articles = { 'title': titles, 'rank': ranks, 'synopsis': synopses, 'cluster': clusters, 'genre': genres }
#print(articles)
frame = pd.DataFrame(articles, index = [clusters], columns = ['rank', 'title', 'cluster', 'genre'])
frame.head(5)

Unnamed: 0,rank,title,cluster,genre
3,0,d77e674abd042be04a8e644d023764d9,3,Kill by physical assault
4,1,243b0d67a8bd945bd994b0d6607c2729,4,"Threaten, not specified below"
4,2,2de077cba1272d4581f86a1a1cda4d10,4,"Threaten, not specified below"
3,3,8e9e70ecd9192803779bca2f85a1ea02,3,"Threaten, not specified below"
1,4,c8011ead21923ba8c5b836a9ae1916c5,1,"Threaten, not specified below"


In [39]:
frame['cluster'].value_counts()

0    25
4    24
3    14
1    14
2    13
Name: cluster, dtype: int64

In [40]:
grouped = frame['rank'].groupby(frame['cluster'])

grouped.mean()

cluster
0    39.200000
1    45.857143
2    44.846154
3    38.785714
4    52.375000
Name: rank, dtype: float64

In [41]:
from __future__ import print_function

print("Top terms per cluster:")
print()
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    for ind in order_centroids[i, :6]:
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print()
    print()
    print("Cluster %d titles:" % i, end='')
    for title in frame.ix[i]['title'].values.tolist():
        print(' %s,' % title, end='')
    print()
    print()

Top terms per cluster:

Cluster 0 words: b'said', b'years', b"'s", b'burned', b'days', b'makes',

Cluster 0 titles: 1b6b616b632ed12327d1423251de39d9, 43aeb068b4414c8499eb6f22a22a435e, bda6f231e5ccf15d0b3898fe3b3534cc, 68dd25081e0cc11bffd5e48741ea5bbe, 6debd8d794b2ec34bb8b076efa597340, 7d041fd9c9710d54c0c161c1c87ea2de, 3c55de8260d9dbf9cf2fa6ddcbaf5f40, 86c4a8a7d1ab5ebc4b98c7d642757ec5, bd9212fa62773705f7ed37719278fea0, 45cdbc5be1595b9fd778b6069a757bc4, a4846dbc8b353f9e2685cc67bf7bb7bc, a2ecd4f00f7afb2b27252f55325169ce, dd4008c92e361036fa26f52f877ceff3, f8d7a09260652050e05874e72c82e580, d04f057f3f30d1f2186b024efe0db6ac, f0aecc1bc92ca775e181a94fb1e27126, 2fcba95c162ffb1e03e1e76b8552f9b9, dbd8534cdad4b539645a3daa7bf86e47, 7ffe11eb5d9c22d56a8a8adbf88eeb08, cb40484556f31cbe219683173fe2a07b, bc0bcb2c28a9ac2460f7676fbde7055d, a37d14a78c967c384e3c925e17b99b41, 875f2e38e9e03f0ec0fb653dbc6c77ce, 4d872a1dee8c098c070ee4cdc132712c, 21f16685ef70fb9e9ac3e08a3e250b61,

Cluster 1 words: b's', b'companie

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  del sys.path[0]


 b'wounded', b's',

Cluster 3 titles: d77e674abd042be04a8e644d023764d9, 8e9e70ecd9192803779bca2f85a1ea02, 72115754c244e40847903d9e79a3a26b, 2802f8b0220626280bddb5db9e1535e0, df0d1dd149277cfb48f95863085bb854, 38706d07a0e5f3c6e17411ba5c436937, a90902791e248b0b549921a06478c13d, 324535e0430c934e75a303dbf524beee, 12e4f3f7af27243fdb5db27e180c555d, 252faa70b67ce8184ef7af2a5b9f2463, 5d0e11a6f1569a4dcd095b259d6b1861, cb8d4123458c4a5925ba711d262c5a7c, 3112ef68604e55382df523a8ef5608e5, 633647a6358f5a6411fb8fff0a198190,

Cluster 4 words: b"'s", b'court', b's', b'north', b'reported', b'trump',

Cluster 4 titles: 243b0d67a8bd945bd994b0d6607c2729, 2de077cba1272d4581f86a1a1cda4d10, 979282e85b79a7c6022928034279c82c, 17a418f1179c5b6e9e29273ce9443c57, 4419ab05f77e8c6df2cde35c1cab94ac, 207d1a44171b0a07f712f43c835a04c5, dbf0212d374c663da1e45b599ccf215c, 4a34a369e6466b93f265b150fbcbbe71, 8ac3438fff17a3019330faef46620670, d1a20aa870d4ff28abb0a15511b0a075, 1318dd9afa1da6a2ab6dac224619aa4a, af4d7538f2edc4e7955

# Multidimensional scaling

In [42]:
import os  # for os.path.basename

import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.manifold import MDS

MDS()

# two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(dist)  # shape (n_components, n_samples)

xs, ys = pos[:, 0], pos[:, 1]

In [43]:
#strip any proper nouns (NNP) or plural proper nouns (NNPS) from a text
from nltk.tag import pos_tag

def strip_proppers_POS(text):
    tagged = pos_tag(text.split()) #use NLTK's part of speech tagger
    non_propernouns = [word for word,pos in tagged if pos != 'NNP' and pos != 'NNPS']
    return non_propernouns

## Visualizing document clusters

In [44]:
#set up colors per clusters using a dict
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a', 4: '#66a61e'}

#set up cluster names using a dict
cluster_names = {0: 'Trump, Hillary', 
                 1: 'Police, killed, murders', 
                 2: 'Father, New York, brothers', 
                 3: 'Dance, singing, love', 
                 4: 'Killed, soldiers, captain'}

In [45]:
%matplotlib inline

In [46]:
#create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=titles)) 

#group by cluster
groups = df.groupby('label')


# set up plot
# fig, ax = plt.subplots(figsize=(17, 9)) # set size
# ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

# #iterate through groups to layer the plot
# #note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
# for name, group in groups:
#     ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, label=cluster_names[name], color=cluster_colors[name], mec='none')
#     ax.set_aspect('auto')
#     ax.tick_params(\
#         axis= 'x',          # changes apply to the x-axis
#         which='both',      # both major and minor ticks are affected
#         bottom='off',      # ticks along the bottom edge are off
#         top='off',         # ticks along the top edge are off
#         labelbottom='off')
#     ax.tick_params(\
#         axis= 'y',         # changes apply to the y-axis
#         which='both',      # both major and minor ticks are affected
#         left='off',      # ticks along the bottom edge are off
#         top='off',         # ticks along the top edge are off
#         labelleft='off')
    
# ax.legend(numpoints=1)  #show legend with only 1 point

# #add label in x,y position with the label as the film title
# for i in range(len(df)):
#     ax.text(df.ix[i]['x'], df.ix[i]['y'], df.ix[i]['title'], size=8)  

    
    
# plt.show() #show the plot

#uncomment the below to save the plot if need be
#plt.savefig('clusters_small_noaxes.png', dpi=200)

In [47]:
#define custom toolbar location
class TopToolbar(mpld3.plugins.PluginBase):
    """Plugin for moving toolbar to top of figure"""

    JAVASCRIPT = """
    mpld3.register_plugin("toptoolbar", TopToolbar);
    TopToolbar.prototype = Object.create(mpld3.Plugin.prototype);
    TopToolbar.prototype.constructor = TopToolbar;
    function TopToolbar(fig, props){
        mpld3.Plugin.call(this, fig, props);
    };

    TopToolbar.prototype.draw = function(){
      // the toolbar svg doesn't exist
      // yet, so first draw it
      this.fig.toolbar.draw();

      // then change the y position to be
      // at the top of the figure
      this.fig.toolbar.toolbar.attr("x", 150);
      this.fig.toolbar.toolbar.attr("y", 400);

      // then remove the draw function,
      // so that it is not called again
      this.fig.toolbar.draw = function() {}
    }
    """
    def __init__(self):
        self.dict_ = {"type": "toptoolbar"}

In [48]:
#create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=titles)) 

#group by cluster
groups = df.groupby('label')

#define custom css to format the font and to remove the axis labeling
css = """
text.mpld3-text, div.mpld3-tooltip {
  font-family:Arial, Helvetica, sans-serif;
}

g.mpld3-xaxis, g.mpld3-yaxis {
display: none; }
"""

# Plot 
fig, ax = plt.subplots(figsize=(14,6)) #set plot size
ax.margins(0.03) # Optional, just adds 5% padding to the autoscaling

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
    points = ax.plot(group.x, group.y, marker='o', linestyle='', ms=18, label=cluster_names[name], mec='none', color=cluster_colors[name])
    ax.set_aspect('auto')
    labels = [i for i in group.title]
    
    #set tooltip using points, labels and the already defined 'css'
    tooltip = mpld3.plugins.PointHTMLTooltip(points[0], labels,
                                       voffset=10, hoffset=10, css=css)
    #connect tooltip to fig
    mpld3.plugins.connect(fig, tooltip, TopToolbar())    
    
    #set tick marks as blank
    ax.axes.get_xaxis().set_ticks([])
    ax.axes.get_yaxis().set_ticks([])
    
    #set axis as blank
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)

    
ax.legend(numpoints=1) #show legend with only one dot

mpld3.display() #show the plot

#uncomment the below to export to html
#html = mpld3.fig_to_html(fig)
#print(html)

In [49]:
plt.close()