In [1]:
import numpy as np
import pandas as pd
import requests
import re
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool, LabelSet
from bokeh.palettes import Spectral, Viridis, RdPu

# Configure Bokeh so that later show() calls render their figures inside the notebook.
output_notebook()

In [2]:
# Full text of Thomas Hobbes' Leviathan from Project Gutenberg.
url = "http://www.gutenberg.org/files/3207/3207.txt"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook here on an HTTP error instead of
# letting an error page flow into the chapter parsing further down.
response = requests.get(url, timeout=30)
response.raise_for_status()
leviathan = response.text

In [3]:
# Peek at the first 50 characters to confirm the download is the expected book.
print(leviathan[:50])

The Project Gutenberg EBook of Leviathan, by Thoma


In [4]:
# Split the book into lines and record where every chapter begins.
# A heading is any line containing "CHAPTER "; its title is that line plus the
# following two lines, each stripped and concatenated without a separator.
# chapter_lines ends up as a list of (title, line_index) tuples in book order.
leviathan_by_line = leviathan.split("\n")
chapter_lines = [
    ("".join(str(part).strip() for part in leviathan_by_line[idx:idx + 3]), idx)
    for idx, text in enumerate(leviathan_by_line)
    if re.search("CHAPTER ", text)
]

In [5]:
# Build one document per chapter: all lines from a chapter heading up to
# (but not including) the two lines that precede the next heading, stripped
# and joined with single spaces.
#
# The last chapter has no successor, so it runs to two lines before the end
# of the file. This is checked explicitly rather than by catching an
# exception, so unrelated errors are no longer silently swallowed.
# NOTE(review): the final chapter therefore includes whatever text follows it
# in the download (e.g. any trailing non-chapter material) -- confirm this is
# acceptable for the tf-idf statistics.
corpus = []
for i, (_, start_line) in enumerate(chapter_lines):
    if i + 1 < len(chapter_lines):
        end_line = chapter_lines[i + 1][1] - 2
    else:
        end_line = len(leviathan_by_line) - 2
    corpus.append(" ".join(str(x).strip() for x in leviathan_by_line[start_line:end_line]))

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cluster import KMeans
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

def token_and_lemmatize(x):
    """Tokenize ``x`` and return its lemmatized, letter-bearing tokens.

    The text is lower-cased and split with ``word_tokenize``. The tokens
    "'s" and "'t" are dropped, as is any token that contains no character
    in a-z. Every remaining token is passed through
    ``WordNetLemmatizer.lemmatize`` and the results are returned as a list
    in their original order.
    """
    skipped = {"'s", "'t"}
    lemmatizer = WordNetLemmatizer()  # lemmatizes words rather than stemming them
    lemmas = []
    for token in word_tokenize(x.lower()):
        if token in skipped:
            continue
        if not re.search('[a-z]', token):
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [7]:
# Tf-idf over the chapters, using uni-, bi- and tri-grams produced by the
# custom tokenizer. Terms must appear in at least 2 chapters (min_df) and in
# no more than 90% of them (max_df).
tfidf_vect = TfidfVectorizer(
    tokenizer=token_and_lemmatize,
    stop_words='english',
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.9,
)
tfidf_matrix = tfidf_vect.fit_transform(corpus)

# (number of chapters, vocabulary size)
tfidf_matrix.shape

(47, 11520)

In [8]:
# Vocabulary in the column order of tfidf_matrix. get_feature_names() was
# removed from recent scikit-learn releases in favour of
# get_feature_names_out(), so prefer the new accessor and fall back to the old
# one on installations that predate it. .tolist() keeps `words` a plain list
# of Python strings, as it was before.
if hasattr(tfidf_vect, "get_feature_names_out"):
    words = tfidf_vect.get_feature_names_out().tolist()
else:
    words = tfidf_vect.get_feature_names()

# For every chapter keep its 50 highest-scoring terms as (term, score) pairs,
# best first. The matrix is densified once instead of once per row.
top_n_words_and_score = []
for row in tfidf_matrix.toarray():
    top_fts_row = np.argsort(row)[::-1][:50]
    top_n_words_and_score.append([(words[i], row[i]) for i in top_fts_row])
    

In [9]:
# Despite its name, mean_per_word holds each term's standard deviation across
# chapters (std over axis 0), i.e. how unevenly the term is weighted.
mean_per_word = tfidf_matrix.toarray().std(axis=0)

# Column indices of the 50 terms with the largest spread, largest first.
terms_by_spread = np.argsort(mean_per_word)[::-1]
top_n_overal = terms_by_spread[:50]

In [10]:
# Pairwise Euclidean distances between the chapters' tf-idf vectors
# (one row and one column per chapter).
dist = euclidean_distances(tfidf_matrix)

In [11]:
from sklearn.manifold import MDS
# 2-D MDS embedding computed from the precomputed chapter distances;
# random_state pins the layout so it is the same on every run.
# NOTE(review): `mds` is not referenced by any later cell in view.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1).fit_transform(dist)

In [12]:
from sklearn.manifold import TSNE
# 2-D t-SNE embedding of the dense tf-idf vectors; the chapter scatter plots
# below draw these coordinates. t-SNE is stochastic, so random_state is pinned
# to make the layout identical on every Restart & Run All.
tsne = TSNE(n_components=2, perplexity=40, verbose=0, random_state=1).fit_transform(tfidf_matrix.toarray())

In [13]:
from sklearn.cluster import KMeans

# Fit k-means for k = 1..19 so the elbow plot can compare them.
# random_state is fixed because k-means starts from random centroids; without
# it the cluster assignments (and every colour derived from them) change
# between runs.
# NOTE(review): the models are fitted on the rows of the distance matrix
# `dist`, not on the tf-idf vectors themselves -- confirm that is intended.
num_clusters = range(1, 20)

km = [KMeans(n_clusters=n, random_state=1).fit(dist) for n in num_clusters]

# KMeans.score() is the negative of the within-cluster sum of squares, so the
# values are <= 0 and closer to 0 means a tighter fit.
scores = [(n, model.score(dist)) for n, model in zip(num_clusters, km)]

# Cluster labels of the 3-cluster model, looked up by k rather than by a
# hand-computed list offset.
clusters = km[num_clusters.index(3)].labels_.tolist()

In [14]:
from sklearn.cluster import DBSCAN

# Density-based clustering of the chapters with a neighbourhood radius of 0.1.
# NOTE(review): `dist` is already a pairwise distance matrix, but DBSCAN is
# left at its default metric, so each row of `dist` is treated as a feature
# vector -- confirm this is intended rather than metric='precomputed'.
db = DBSCAN(eps=0.1)
db.fit(dist)
labels = db.labels_

# One label per chapter.
print(len(labels))

47


In [15]:
# Scatter plot of the t-SNE embedding, one point per chapter, coloured by its
# DBSCAN label.

# Bokeh options
p = figure(plot_width=960, plot_height=600, tools=[], title="Chapter distances (Tf-Idf)")
p.toolbar_location = None
p.yaxis.axis_line_alpha = 0
p.yaxis.axis_label_text_font = 'Menlo'
p.axis.major_label_text_color = 'grey'
p.axis.minor_tick_out = None
p.axis.major_tick_out = None

# DBSCAN marks noise points with a negative label; indexing the palette with
# it would silently wrap around to the last colour, and more than three
# clusters would raise IndexError. Noise gets a neutral colour and cluster
# ids cycle through the palette instead.
palette = Viridis[3]
point_colors = [
    'grey' if label < 0 else palette[int(label) % len(palette)]
    for label in labels
]

# Plot data: coordinates, chapter title/number, the ten top tf-idf terms and
# a short text excerpt (characters 500-800, trimmed to whole words) for the
# hover box.
s = ColumnDataSource(data={
    'x' : tsne[:,0],
    'y' : tsne[:,1],
    'ch' : [c[0] for c in chapter_lines],
    'color' : point_colors,
    'ch_num' : [i+1 for i in range(len(chapter_lines))],
    'top5' : [", ".join([x[0] for x in i[:10]]) for i in top_n_words_and_score],
    'excerpt' : [" ".join(i[500:800].split(" ")[1:-1]) for i in corpus]
})

p.circle('x', 'y', fill_color='color', alpha=0.8, size=10, 
         line_color=None, hover_fill_color='grey', source=s)
p.add_tools(HoverTool(tooltips="""
    <div style="max-width: 400px;">
        <div style="font-weight: bold;">@ch</div>
        <div>@top5</div>
        <div style="font-style: italic">@excerpt</div>
    </div>
    """, attachment="vertical"))

# Chapter-number annotations next to each point. Kept in its own name so the
# DBSCAN `labels` array is not overwritten and this cell can be re-run.
label_set = LabelSet(x='x', y='y', text='ch_num', level='glyph',
            x_offset=10, y_offset=0, source=s, render_mode='canvas', 
            text_color='color', text_font="Menlo", text_font_size="8pt")

p.add_layout(label_set)


show(p)

In [16]:
from bokeh.themes import Theme
from bokeh.io import curdoc

# Elbow plot: the k-means score for each number of clusters, drawn as grey
# markers joined by a grey line.
p_elbow = figure(plot_width=960, tools=[], title="Sum of squares to centroids per n clusters")

# `scores` is a list of (n_clusters, score) pairs; split it into two columns.
s = ColumnDataSource(data={
    'x' : [n_clusters for n_clusters, _ in scores],
    'y' : [score for _, score in scores]
})

p_elbow.circle('x', 'y', source=s, size=10, fill_color='grey')
p_elbow.line('x', 'y', source=s, line_width=2, line_color='grey')

show(p_elbow)


In [17]:
# Scatter plot of the t-SNE embedding, one point per chapter, coloured by the
# chapter's cluster in the 3-cluster k-means model (`clusters`).

# Bokeh options
p = figure(plot_width=960, plot_height=600, tools=[], title="Chapter distances (Tf-Idf)")
p.toolbar_location = None
p.yaxis.axis_line_alpha = 0
p.yaxis.axis_label_text_font = 'Menlo'
p.axis.major_label_text_color = 'grey'
p.axis.minor_tick_out = None
p.axis.major_tick_out = None

# Plot data: coordinates, chapter title/number, the ten top tf-idf terms and
# a short text excerpt (characters 500-800, trimmed to whole words) for the
# hover box.
s = ColumnDataSource(data={
    'x' : tsne[:,0],
    'y' : tsne[:,1],
    'ch' : [c[0] for c in chapter_lines],
    'color' : [Viridis[3][x] for x in clusters],
    'ch_num' : [i+1 for i in range(len(chapter_lines))],
    'top5' : [", ".join([x[0] for x in i[:10]]) for i in top_n_words_and_score],
    'excerpt' : [" ".join(i[500:800].split(" ")[1:-1]) for i in corpus]
})

p.circle('x', 'y', fill_color='color', alpha=0.8, size=10, 
         line_color=None, hover_fill_color='grey', source=s)
p.add_tools(HoverTool(tooltips="""
    <div style="max-width: 400px;">
        <div style="font-weight: bold;">@ch</div>
        <div>@top5</div>
        <div style="font-style: italic">@excerpt</div>
    </div>
    """, attachment="vertical"))

# Chapter-number annotations next to each point. Kept in its own name so the
# DBSCAN `labels` array used by the earlier plot cell is not overwritten.
label_set = LabelSet(x='x', y='y', text='ch_num', level='glyph',
            x_offset=10, y_offset=0, source=s, render_mode='canvas', 
            text_color='color', text_font="Menlo", text_font_size="8pt")

p.add_layout(label_set)


show(p)

In [18]:
# Line plot: tf-idf weight per chapter for the ten terms with the largest
# spread, drawn over faint background bands showing each chapter's k-means
# cluster.
custom_colors = ['#d2d2d2', '#ff5b5b', '#006eff']


p2 = figure(plot_width=960, plot_height=600, tools=[], y_axis_type='linear', 
            title="Word importance per chapter")
p2.toolbar_location = None
p2.yaxis.axis_line_alpha = 0
p2.yaxis.axis_label_text_font = 'Menlo'
p2.axis.major_label_text_color = 'grey'
p2.axis.minor_tick_out = None
p2.axis.major_tick_out = None

# Chapters are numbered from 1 on the x axis; the count comes from the parsed
# chapters rather than a hard-coded 47.
chapter_numbers = [ch + 1 for ch in range(len(chapter_lines))]
top_terms = top_n_overal[:10]
dense_tfidf = tfidf_matrix.toarray()  # densify once, not once per term

# One line per term: the same chapter numbers for x, that term's column of
# tf-idf weights for y.
p2_source = ColumnDataSource(data={
    'x' : [list(chapter_numbers) for _ in top_terms], # Chapter number
    'y' : [dense_tfidf[:, term].tolist() for term in top_terms],
    'word' : [[words[term]] for term in top_terms]
})

# Background band per chapter, centred on that chapter's own x position
# (chapter number, 1-based) so the band colour matches the chapter whose
# values are plotted there. Each band spans y in [0, 0.7].
for chapter_number, cluster in zip(chapter_numbers, clusters):
    p2.rect(chapter_number, 0.35, 1, 0.7, fill_color=Viridis[3][cluster], alpha=0.1, line_color=None)

p2.multi_line('x', 'y', line_color='grey', hover_line_color=Viridis[3][0], 
              line_width=2, alpha=0.7, source=p2_source, name='multi')

# Hover only reacts to the term lines, not to the background bands.
p2.add_tools(HoverTool(names=['multi'], tooltips=[
    ('Word', '@word'),
    ('Chapter', '$x{0}')
]))



show(p2)

In [20]:
from bokeh.models import LinearColorMapper
from bokeh.transform import transform

# Heat map: one cell per (chapter, term) for the `max_words` terms with the
# largest spread, shaded by the term's tf-idf weight in that chapter.
max_words = 40

# Derive the chapter count from the parsed chapters instead of hard-coding 47,
# so the axis and the data columns below always agree.
n_chapters = len(chapter_lines)
top_terms = top_n_overal[:max_words]
dense_tfidf = tfidf_matrix.toarray()  # densify once for all columns below

p3 = figure(plot_width=960, plot_height=700, tools=[],
            y_range=[words[i] for i in top_terms], x_range=[str(ch + 1) for ch in range(n_chapters)],
            title="Word importance per chapter")

p3.toolbar_location = None
p3.title.text_font = 'Menlo'
p3.title.text_font_size = '16pt'
p3.yaxis.axis_line_alpha = 0
p3.yaxis.major_label_text_font = 'Menlo'
p3.xaxis.major_label_text_font = 'Menlo'
p3.axis.major_tick_in = None
p3.axis.major_tick_out = None
p3.axis.axis_line_color = None
p3.axis.major_tick_line_color = None
p3.axis.major_label_text_font_size = "8pt"
p3.xaxis.major_label_text_align = "left"

# Flattened chapter-major grid: for each chapter, one entry per top term.
# NOTE(review): x is numeric while x_range is categorical; this relies on
# Bokeh placing ch + 0.5 at the centre of the (ch+1)-th factor -- confirm on
# the installed Bokeh version.
p3_source = ColumnDataSource(data={
    'x' : [ch + 0.5 for ch in range(n_chapters) for _ in top_terms], # Chapter number
    'y' : [dense_tfidf[ch, i] for ch in range(n_chapters) for i in top_terms],
    'word' : [words[i] for _ in range(n_chapters) for i in top_terms]
})

# Reversed RdPu so that higher weights map to darker cells; weights outside
# [0.01, 0.5] are clamped to the ends of the palette.
mapper = LinearColorMapper(palette=list(reversed(RdPu[9])), low=0.01, high=0.5)

p3.rect('x', 'word', width=1, height=1, fill_color=transform('y', mapper),
        line_color=None, source=p3_source, hover_fill_color='blue')

p3.add_tools(HoverTool(tooltips="""
    <div style="max-width: 400px;">
        <div style="font-weight: bold;">@word</div>
        <div>Chapter: @x{0}</div>
        <div style="font-style: italic">Importance: @y{0.000}</div>
    </div>"""))

show(p3)