In [42]:
import numpy as np
import pandas as pd
import requests
import re
from bokeh.io import show, output_notebook, curdoc, output_file
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool, LabelSet, LinearColorMapper
from bokeh.transform import transform
from bokeh.palettes import Spectral, Viridis, RdPu

colors_BuPu = ["#FFFFFF","#FFE5EB","#FFCCE4","#FFB2E9","#FF99FA","#E67FFF","#BD66FF","#874CFF",
          "#4532FF","#193AFF","#0062FF"]

colors = ["#FFFFFF", "#ACCCFF","#7DAFFF","#0062FF", "#1771FF","#00378F","#002B6E"]
colors_BuOr = ["#ACCCFF","#7DAFFF","#1771FF","#00378F","#002B6E","#FFEDA4","#FFE370","#FFCD00",
            "#D9AE00","#A78600","#FFDEA4","#FFCB70","#FFA200","#D98A00","#A76A00"]

output_notebook()

In [2]:
#full text for Thomas Hobbes' Leviathan from project Gutenberg
url = "http://www.gutenberg.org/files/3207/3207.txt" 

leviathan = requests.get(url).text
print(leviathan[0:50]) #first 50 characters


The Project Gutenberg EBook of Leviathan, by Thoma


In [3]:
leviathan_by_line = leviathan.split("End of the Project Gutenberg EBook")[0].split("\n")
chapter_lines = []
for line_num in range(len(leviathan_by_line)):
    if re.search("CHAPTER ", leviathan_by_line[line_num]):
        ch_title = "".join([str(x).strip() for x in leviathan_by_line[line_num:line_num+3]])
        chapter_lines.append((ch_title, line_num))
        
corpus = []
start_line, end_line = 0, 0
for i, c in enumerate(chapter_lines):
    start_line = c[1]
    try:
        end_line = chapter_lines[i+1][1] - 2 #next chapter line number minus empty line
    except:
        end_line = len(leviathan_by_line) - 2 # catching the last chapter
    corpus.append(" ".join([str(x).strip() for x in leviathan_by_line[start_line:end_line]]))

In [18]:
p1 = figure(plot_width=960, plot_height=400, tools=[],
    x_range=["CH. {}".format(i+1) for i in range(len(corpus))],
    y_range=(0,15500),
    title="Number of words per chapter"
)

p1.toolbar_location = None
p1.xaxis.major_label_orientation = 45
p1.yaxis.axis_line_alpha = 0
p1.axis.major_label_text_font = 'Menlo'
p1.axis.major_label_text_color = '#666666'
p1.axis.minor_tick_out = None
p1.axis.major_tick_out = None
p1.xgrid.grid_line_color = None



s = ColumnDataSource({
    'ch' : ["CH. {}".format(i+1) for i in range(len(corpus))],
    'length' : [len(x.split(" ")) for x in corpus],
    'excerpt' : [" ".join(i[500:800].split(" ")[1:-1]) for i in corpus],
    'ch_title' : [c[0] for c in chapter_lines],
})
p1.rect('ch', 1, 0.9, 'length', fill_color=colors[-3], line_color=None, source=s, 
        hover_fill_color="#cccccc", hover_line_color=None)

p1.add_tools(HoverTool(tooltips="""
    <div style="max-width: 450px;">
        <div style="font-weight: bold;">@ch_title</div>
        <div style="font-style: italic">Excerpt: "@excerpt ...”</div>
    </div>
    """, anchor="top_right", point_policy="follow_mouse"))

output_file('output/number_of_words_per_chapter.html')

show(p1)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(stop_words='english')
tf = vect.fit_transform(corpus)
tf.shape

(47, 9288)

In [7]:
#most common words
vocab = vect.get_feature_names()
total_word_count = tf.sum(axis=0).tolist()[0]
sort_by_count = np.argsort(tf.sum(axis=0))[:,::-1].tolist()[0]
for x in sort_by_count[:10]:
    print(vocab[x], "-", total_word_count[x])    

god - 1061
man - 1058
men - 990
power - 733
law - 662
common - 622
shall - 580
soveraign - 494
hath - 445
say - 444


In [17]:
top_n = 30

p2 = figure(plot_width=960, plot_height=400, tools=[],
            x_range=[str(x+1) for x in range(len(chapter_lines))],
            y_range=[vocab[i] for i in sort_by_count[:top_n]],
           title="Most common words per chapter in Thomas Hobbes' Leviathan")

p2.toolbar_location = None
p2.yaxis.axis_line_alpha = 0
p2.axis.major_label_text_color = '#666666'
p2.axis.minor_tick_out = None
p2.axis.major_tick_out = None
p2.axis.major_tick_in = None
p2.axis.major_label_text_font = "Menlo"
p2.xgrid.grid_line_color = None



s = ColumnDataSource({
    'ch' : [x+0.5 for x in range(len(chapter_lines)) for i in sort_by_count[:top_n]],
    'word' : [vocab[i] for row in tf for i in sort_by_count[:top_n]],
    'n' : [row[i] for row in tf.toarray() for i in sort_by_count[:top_n]]    
})

mapper = LinearColorMapper(palette=colors, low=0, high=tf.max())

p2.rect('ch', 'word', 1, 0.8, color=transform('n', mapper), source=s, 
        hover_fill_color="#666666", hover_line_color=None)

p2.add_tools(HoverTool(tooltips="""
    <div style="max-width: 450px;">
        <div>"@word" occurred @n times in chapter @ch{0}.</div>
    </div>
    """, mode="mouse"))

output_file('output/most_common_words_per_chapter.html')

show(p2)
    

In [12]:
import string
from collections import Counter

punct = {}

for i in range(len(corpus)):
    punct[i] = Counter([x for x in corpus[i] if x in string.punctuation]) 

In [15]:
from bokeh.core.properties import value
from bokeh.models import Legend

ch_by_punct = pd.DataFrame().from_dict(punct).T
ch_by_punct.index = ch_by_punct.index.map(lambda x: "CH. {}".format(x+1))
ch_by_punct = ch_by_punct.apply(lambda x: x / x.sum(), axis=1)
criteria = ch_by_punct.mean() > 0.01
ch_by_punct = ch_by_punct.iloc[:,criteria.values].fillna(0)

cols = ch_by_punct.columns.tolist()

p3 = figure(plot_width=960, plot_height=400, tools="", title="",
           y_range=(0, 1), x_range=ch_by_punct.index.tolist())

p3.toolbar_location = None
p3.xaxis.major_label_orientation = 45
p3.yaxis.axis_line_alpha = 0
p3.axis.major_label_text_font = 'Menlo'
p3.axis.major_label_text_color = '#666666'
p3.axis.minor_tick_out = None
p3.axis.major_tick_out = None

s3 = ColumnDataSource(ch_by_punct)

p3.vbar_stack(cols, x='index', width=0.9, source=s3, fill_color=Spectral[len(cols)], 
              line_color=None, legend=[value(x) for x in cols])

p3.legend.orientation = "horizontal"


output_file("output/punctuation.html")
show(p3)



In [19]:
n_words = [len(x.split(" ")) for x in corpus]
n_stops = [len(re.split(r"(?!\.)\b\s[\"a-zA-Z]*\.", x)) for x in corpus]

p4 = figure(plot_width=960, plot_height=400, tools=[],
    x_range=["CH. {}".format(i+1) for i in range(len(corpus))],
    y_range=(0,40),
    title="Words per sentence per chapter"
)

p4.toolbar_location = None
p4.xaxis.major_label_orientation = 45
p4.yaxis.axis_line_alpha = 0
p4.axis.major_label_text_font = 'Menlo'
p4.axis.major_label_text_color = '#666666'
p4.axis.minor_tick_out = None
p4.axis.major_tick_out = None
p4.xgrid.grid_line_color = None

s4 = ColumnDataSource({
    'ch' : ["CH. {}".format(i+1) for i in range(len(corpus))],
    'pct' : [n_words[i] / float(n_stops[i]) for i in range(len(corpus))],
    'excerpt' : [" ".join(i[500:800].split(" ")[1:-1]) for i in corpus],
    'ch_title' : [c[0] for c in chapter_lines],
})
p4.rect('ch', 1, 0.9, 'pct', fill_color=colors[-3], line_color=None, source=s4)

p4.add_tools(HoverTool(tooltips="""
    <div style="max-width: 450px;">
        <div style="font-weight: bold;">@ch_title</div>
        <div style="font-style: italic">Excerpt: "@excerpt ...”</div>
    </div>
    """, anchor="top_right", point_policy="follow_mouse"))

output_file("output/word_sentence_per_chapter.html")

show(p4)

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(min_df=5, max_df=0.8, stop_words='english', ngram_range=(1,2))
tfidf = tfidf_vect.fit_transform(corpus)
print(tfidf.shape)

excludes = ['bee', 'wee', 'hee', 'doe'] #exclude some 'ancient' stop words
use_vocab = [x for x in tfidf_vect.get_feature_names() if x not in excludes]

tfidf_vect = TfidfVectorizer(min_df=5, max_df=0.8, stop_words='english', ngram_range=(1,2), vocabulary=use_vocab)
tfidf = tfidf_vect.fit_transform(corpus)
print(tfidf.shape)

(47, 2338)
(47, 2334)


In [32]:
vocab_tfidf = tfidf_vect.get_feature_names()


part1_words = [vocab_tfidf[i] for i in \
            np.argsort(tfidf[:30,:].sum(axis=0))[:,::-1].tolist()[0][:25]]
print("Top 10 words first 30 chapters:\n", part1_words[:10])
part2_words = [vocab_tfidf[i] for i in \
            np.argsort(tfidf[31:tfidf.shape[0],:].sum(axis=0))[:,::-1].tolist()[0][:25]]
print("\nTop 10 words chapter 31 and onward:\n", part2_words[:10])
part12_words = [x for x in part1_words if x in part2_words]
print("\nTop words in both parts:\n", part12_words)

Top 10 words first 30 chapters:
 ['law', 'soveraign', 'common wealth', 'wealth', 'lawes', 'assembly', 'good', 'covenant', 'consequences', 'publique']

Top 10 words chapter 31 and onward:
 ['moses', 'kingdome', 'church', 'christ', 'spirit', 'scripture', 'king', 'saviour', 'holy', 'christian']

Top words in both parts:
 ['authority']


In [33]:
total_word_mean = tfidf.mean(axis=0).tolist()[0]
sort_by_score = np.argsort(tfidf.sum(axis=0))[:,::-1].tolist()[0]

top_n = 50

top_n_words = [vocab_tfidf[i] for row in tfidf for i in sort_by_score[:top_n]]
top_word_color = {x : '#aaaaaa' for x in top_n_words}
top_word_color.update({x : '#0062FF' for x in part1_words})
top_word_color.update({x : '#FFA200' for x in part2_words})
top_word_color.update({x : '#aaaaaa' for x in part12_words})

In [34]:
p5 = figure(plot_width=960, plot_height=700, tools=[],
            x_range=[str(x+1) for x in range(len(chapter_lines))],
            y_range=list(reversed([vocab_tfidf[i] for i in sort_by_score[:top_n]])),
           title="Highest scoring words per chapter in Thomas Hobbes' Leviathan")

p5.toolbar_location = None
p5.yaxis.axis_line_alpha = 0
p5.axis.major_label_text_color = '#666666'
p5.axis.minor_tick_out = None
p5.axis.major_tick_out = None
p5.axis.major_tick_in = None
p5.axis.major_label_text_font = "Menlo"
p5.xgrid.grid_line_color = None
p5.ygrid.grid_line_color = None



s5 = ColumnDataSource({
    'ch' : [x+0.5 for x in range(len(chapter_lines)) for i in sort_by_score[:top_n]],
    'word' : [vocab_tfidf[i] for row in tfidf for i in sort_by_score[:top_n]],
    'c' : [top_word_color[vocab_tfidf[i]] for row in tfidf for i in sort_by_score[:top_n]],
    'n' : [row[i]**0.5 for row in tfidf.toarray() for i in sort_by_score[:top_n]]    
})

mapper = LinearColorMapper(palette=colors, low=0, high=tfidf.max())

p5.rect('ch', 'word', 1, 1, color='c', alpha='n', source=s5, 
        hover_fill_color='c', hover_line_color='white', line_color=None)

p5.add_tools(HoverTool(tooltips="""
    <div style="max-width: 450px;">
        <div>"@word" scored @n{0.000} in chapter @ch{0}.</div>
    </div>
    """, mode="mouse", anchor="top_left", point_policy='snap_to_data'))

output_file('output/highest_score_by_place_of_occurence.html')
show(p5)

In [57]:
#bokeh options
p = figure(width=960, height=600, tools="")
p.toolbar_location = None
p.yaxis.axis_line_alpha = 0
p.yaxis.axis_label_text_font = 'Menlo'
p.axis.major_label_text_color = 'grey'
p.axis.minor_tick_out = None
p.axis.major_tick_out = None


s_mean_sum = ColumnDataSource(data={
    'std' : [tfidf.toarray().std(axis=0).tolist()[i] for i in sort_by_score[:50000]],
    'mean' : [tfidf.mean(axis=0).tolist()[0][i] for i in sort_by_score[:50000]],
    'word' : [vocab_tfidf[i] for i in sort_by_score[:50000]]
})

p.circle(x='std', y='mean', source=s_mean_sum, size=7, fill_color='#0062FF', alpha=0.8, line_color=None)

p.add_tools(HoverTool(tooltips=[
    ('', '@word')
]))

show(p)



E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: mean [renderer: GlyphRenderer(id='d0aa0d8d-0494-49c2-8a7f-db81b70347a9', ...)]


In [51]:
tfidf.toarray().std(axis=0)

array([ 0.01207731,  0.01242986,  0.01567354, ...,  0.00679739,
        0.00922964,  0.00830628])