# Install Dependencies

In [None]:
!pip install sklearn
!pip install matplotlib
!pip install scrapy

!pip install fastText
!pip install spacy
!pip install nltk
!pip install tensorflow
!pip install tensorflow_hub
!python -m spacy download en_core_web_md

# Import Dependencies

In [1]:
import numpy as np
import pickle
import csv
import json
import re
import math
from heapq import heappush, heappop
from scipy.sparse import csr_matrix
from pprint import pprint
from time import time

from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import fastText
import spacy
import tensorflow as tf
import tensorflow_hub as hub
from gensim.models import KeyedVectors

import logging
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
from urllib.parse import urljoin

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lucmeng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Settings for notebook
from IPython.core.interactiveshell import InteractiveShell
# "all": display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Show Python version
import platform
platform.python_version()

'3.6.6'

# Crawling Data

## Get Offenders Info

### Setup a pipeline

In [9]:
class OffenderInfoWriterPipeline(object):
    """Item pipeline that writes every scraped offender record as one JSON line."""

    def open_spider(self, spider):
        """Open the results file for writing when the spider starts."""
        self.file = open('data/offender_info_results.json', 'w+')

    def close_spider(self, spider):
        """Close the results file when the spider finishes."""
        self.file.close()

    def process_item(self, item, spider):
        """Write ``item`` as a JSON object on its own line and return it unchanged."""
        record = dict(item)
        self.file.write(json.dumps(record) + "\n")
        return item

### Define the spider

In [10]:
class OffenderInfoSpider(scrapy.Spider):
    """Scrape one record per data row of the executed-offenders table.

    Every yielded dict holds the text of table columns 4-5 and 7-10 plus
    absolute links built from the anchors in columns 2 and 3.
    """
    name = "OffenderInfo"
    start_urls = [
        'http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html'
    ]
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.OffenderInfoWriterPipeline': 1}
    }

    def parse(self, response):
        """Yield one dict per row of the ``tdcj_table indent`` table."""
        # The URL is interpolated with %; passing it as a second print()
        # argument printed the literal "%s" followed by the URL.
        print('A response from %s just arrived!' % response.url)
        sel = Selector(response)

        table = sel.xpath('//table[@class="tdcj_table indent"]/tr')
        # The first row is skipped (presumably the header row - confirm against the page).
        for tr in table[1:]:
            # NOTE: str() turns a missing href (None) into 'None', so the
            # joined link then ends in ".../None" instead of being empty.
            url_info = urljoin(response.url, str(tr.xpath('td[2]/a/@href').extract_first()))
            url_stmt = urljoin(response.url, str(tr.xpath('td[3]/a/@href').extract_first()))

            yield {
                'first_name': tr.xpath('td[5]/text()').extract_first(),
                'last_name': tr.xpath('td[4]/text()').extract_first(),
                'age': tr.xpath('td[7]/text()').extract_first(),
                'date': tr.xpath('td[8]/text()').extract_first(),
                'race': tr.xpath('td[9]/text()').extract_first(),
                'country': tr.xpath('td[10]/text()').extract_first(),
                'info_link': url_info,
                'death_note_link': url_stmt
            }
        

### Start the crawler

In [11]:
# Run the spider inside this kernel; start() blocks until the crawl has finished.
# NOTE(review): the underlying Twisted reactor cannot be started twice in one
# process, so a later process.start() presumably needs a kernel restart first.
process = CrawlerProcess()
process.crawl(OffenderInfoSpider)
process.start()

2018-11-27 15:52:44 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: scrapybot)
2018-11-27 15:52:44 [scrapy.utils.log] INFO: Versions: lxml 4.2.5.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.1, w3lib 1.19.0, Twisted 17.5.0, Python 3.6.6 |Anaconda custom (64-bit)| (default, Jun 28 2018, 11:07:29) - [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 18.0.0 (OpenSSL 1.0.2p  14 Aug 2018), cryptography 2.3.1, Platform Darwin-17.7.0-x86_64-i386-64bit
2018-11-27 15:52:44 [scrapy.crawler] INFO: Overridden settings: {'LOG_LEVEL': 30}


<Deferred at 0x1a4718ca58>

A response from %s just arrived! http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html


In [12]:
# Peek at the first scraped record to see which fields were captured.
# The `with` block closes the file; it used to stay open.
with open('data/offender_info_results.json', 'r') as file:
    lines = file.readlines()
line = lines[0]
obj = json.loads(line)
pprint(list(obj))

['first_name',
 'last_name',
 'age',
 'date',
 'race',
 'country',
 'info_link',
 'death_note_link']


## Extract Death Note

In [3]:
urls = []
objs = []

# One JSON record per line; collect each record and its last-statement URL.
# The `with` block closes the file; it used to stay open.
with open('data/offender_info_results.json', 'r') as file:
    lines = file.readlines()
for line in lines:
    objs.append(json.loads(line))
    urls.append(objs[-1]['death_note_link'])

### Setup a pipeline

In [4]:
class DeathNoteWriterPipeline(object):
    """Item pipeline that writes every record with its last statement as one JSON line."""

    def open_spider(self, spider):
        """Open the results file for writing when the spider starts."""
        self.file = open('data/death_note_results.json', 'w+')

    def close_spider(self, spider):
        """Close the results file when the spider finishes."""
        self.file.close()

    def process_item(self, item, spider):
        """Write ``item`` as a JSON object on its own line and return it unchanged."""
        record = dict(item)
        self.file.write(json.dumps(record) + "\n")
        return item

### Define the spider

In [5]:
class DeathNoteSpider(scrapy.Spider):
    """Fetch every last-statement page and attach its text to the offender record.

    Uses the notebook-level ``urls`` (pages to request) and ``objs`` (records
    looked up by their ``'death_note_link'``).
    """
    name = "DeathNote"
    start_urls = urls

    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.DeathNoteWriterPipeline': 1}
    }

    def parse(self, response):
        """Yield the matching offender record extended with a ``'death_note'`` field."""
        sel = Selector(response)

        # str() turns a missing paragraph (None) into 'None', which is filtered out below.
        first = str(sel.xpath('//div[@id="content_right"]/p[6]/text()').extract_first()).strip()
        second = str(sel.xpath('//div[@id="content_right"]/p[7]/text()').extract_first()).strip()

        parts = []
        if first and first != 'Last Statement:' and first != 'None':
            parts.append(first)
        if second and second != 'None':
            parts.append(second)
        # Join with a space so the last word of one paragraph is not fused
        # with the first word of the next.
        death_note = ' '.join(parts)

        url = response.url
        matches = [o for o in objs if o['death_note_link'] == url]
        if not matches:
            # No record has this URL (e.g. the request was redirected):
            # report it instead of failing with IndexError.
            self.logger.warning('No offender record matches %s', url)
            return
        obj = matches[0]
        obj['death_note'] = death_note

        yield obj

### Start the crawler

In [6]:
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
# crawl() expects the spider class, not an instance; Scrapy instantiates it itself.
process.crawl(DeathNoteSpider)
process.start()

2018-11-27 15:53:12 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: scrapybot)
2018-11-27 15:53:12 [scrapy.utils.log] INFO: Versions: lxml 4.2.5.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.1, w3lib 1.19.0, Twisted 17.5.0, Python 3.6.6 |Anaconda custom (64-bit)| (default, Jun 28 2018, 11:07:29) - [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 18.0.0 (OpenSSL 1.0.2p  14 Aug 2018), cryptography 2.3.1, Platform Darwin-17.7.0-x86_64-i386-64bit
2018-11-27 15:53:12 [scrapy.crawler] INFO: Overridden settings: {'LOG_LEVEL': 30, 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}


<Deferred at 0x1a458267b8>

In [7]:
# Peek at the first record to confirm the 'death_note' field was added.
# The `with` block closes the file; it used to stay open.
with open('data/death_note_results.json', 'r') as file:
    lines = file.readlines()
line = lines[0]
obj = json.loads(line)
pprint(list(obj))

['first_name',
 'last_name',
 'age',
 'date',
 'race',
 'country',
 'info_link',
 'death_note_link',
 'death_note']


# Data Processing

## Data Cleaning

In [8]:
# Load every scraped record (one JSON object per line).
# The `with` block closes the file; it used to stay open.
with open('data/death_note_results.json', 'r') as file:
    lines = file.readlines()
objs = []
for line in lines:
    objs.append(json.loads(line))

### String Format Converting

In [9]:
# Drop every non-ASCII character from the statements that are not blank.
for record in objs:
    note = record['death_note']
    if note.strip():
        record['death_note'] = note.encode('ascii', 'ignore').decode("utf-8")

### Remove Punctuation and Stopwords

In [10]:
# Keep the negation "not" in the text. {'not'} is a one-element set, whereas
# set('not') is the three letters {'n', 'o', 't'} and left "not" a stop word.
stop_words = set(stopwords.words('english')) - {'not'}
# Matches any single character that is not an ASCII letter.
rm_punc = re.compile('[^a-zA-Z]')

In [11]:
objs_clean = []
death_notes_clean = []
for record in objs:
    # Letters only: every other character becomes a space before splitting.
    tokens = rm_punc.sub(' ', record['death_note']).split()
    words = [
        token for token in tokens
        if not (token in stop_words
                or token.lower() in stop_words
                or token.lower() == 'none')
    ]
    if not words:
        # Nothing survived the filter: the record is left out of the clean lists.
        continue

    note = ' '.join(words)
    death_notes_clean.append(note)
    record['death_note'] = note
    objs_clean.append(record)

In [12]:
# Persist the cleaned records so later runs can skip the crawl.
with open('data/death_notes_clean.pickle', 'wb') as handle:
    pickle.dump(objs_clean, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
# Number of statements that still contain words after cleaning.
len(death_notes_clean)

449

### Lemmatization

In [14]:
# Requires the large English model: python -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg')

In [15]:
def clean(text):
    """Return ``text`` as a space-joined string of lower-cased, stripped lemmas.

    Tokens whose lemma is ``'-PRON-'`` are left out. Uses the notebook-level
    ``nlp`` pipeline.
    """
    lemmas = []
    for tok in nlp(text):
        if tok.lemma_ == '-PRON-':
            continue
        lemmas.append(tok.lemma_.lower().strip())
    return ' '.join(lemmas)

In [16]:
objs_lemma = []
death_notes_lemma = []  # plain lemmatized strings, one per statement
for obj in objs_clean:
    note = clean(obj['death_note'])
    death_notes_lemma.append(note)
    # Store the lemmatized text on a copy: objs_clean keeps the cleaned text,
    # and re-running this cell does not lemmatize already-lemmatized notes.
    objs_lemma.append(dict(obj, death_note=note))

In [17]:
# Number of lemmatized statements.
len(death_notes_lemma)

449

In [18]:
# Persist the lemmatized records.
with open('data/death_notes_lemma.pickle', 'wb') as handle:
    pickle.dump(objs_lemma, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Bag-of-Words

In [None]:
# per doc
bag_of_words = {}          # {statement id: {word: occurrences}}, most frequent first
statement_word_count = {}  # {statement id: number of words in the statement}
for s_id in range(len(death_notes_lemma)):
    # death_notes_lemma holds plain strings, so each entry is split directly;
    # indexing it with ['death_note'] raised TypeError.
    words_list = death_notes_lemma[s_id].split()
    statement_word_count[s_id] = len(words_list)
    counts = {}
    for word in words_list:
        counts[word] = counts.get(word, 0) + 1
    # Descending by count; the stable sort keeps first-appearance order for ties.
    bag_of_words[s_id] = dict(sorted(counts.items(), key=lambda kv: kv[1], reverse=True))

In [None]:
# for all doc
statementFreqDict = {}

# Add up each word's occurrences over all statements.
for doc_counts in bag_of_words.values():
    for word, count in doc_counts.items():
        statementFreqDict[word] = statementFreqDict.get(word, 0) + count

# Re-order by descending total.
statementFreqDict = dict(sorted(statementFreqDict.items(), key=lambda kv: kv[1], reverse=True))


## Calculating TF-IDF

In [None]:
# This helper is deliberately NOT named `tf`: that name is the alias of
# `import tensorflow as tf`, and rebinding it to a function broke the later
# `tf.logging` / `tf.Session` calls.
def term_frequency(ngram, doc_id, ngram_count, ngramDocDict):
    """Occurrences of ``ngram`` in document ``doc_id`` divided by that document's total.

    Args:
        ngram: the n-gram to look up.
        doc_id: key of the document in both dicts.
        ngram_count: ``{doc_id: total number of n-grams in the document}``.
        ngramDocDict: ``{doc_id: {ngram: occurrences}}``.

    Raises:
        KeyError: if the document or the n-gram is missing.
    """
    return ngramDocDict[doc_id][ngram] / ngram_count[doc_id]

def n_containing(ngram, ngramDocDict):
    """Number of documents whose n-gram dict contains ``ngram``."""
    return sum(1 for doc_id in ngramDocDict if ngram in ngramDocDict[doc_id])

def idf(ngram, ngramDocDict):
    """Natural log of (number of documents / documents containing ``ngram``).

    Raises:
        ZeroDivisionError: if no document contains ``ngram``.
    """
    return math.log(len(ngramDocDict) / (n_containing(ngram, ngramDocDict)))

def tfidf(ngram, doc_id, ngram_count, ngramDocDict):
    """TF-IDF weight of ``ngram`` in document ``doc_id``: term frequency times idf."""
    return term_frequency(ngram, doc_id, ngram_count, ngramDocDict) * idf(ngram, ngramDocDict)

In [None]:
def cal_tfidf_doc(ngramDocDict, ngram_count=None):
    """Compute a TF-IDF weight for every n-gram of every document.

    Args:
        ngramDocDict: ``{doc_id: {ngram: occurrences}}``.
        ngram_count: optional ``{doc_id: total number of n-grams in the document}``.
            When omitted, each total is the sum of that document's occurrences,
            so the function no longer depends on a notebook-level variable.

    Returns:
        ``{doc_id: {ngram: weight}}`` where
        ``weight = (occurrences / total) * log(n_docs / docs containing the n-gram)``.
    """
    if ngram_count is None:
        ngram_count = {doc_id: sum(bag.values()) for doc_id, bag in ngramDocDict.items()}

    # Count the documents containing each n-gram once, up front, instead of
    # rescanning every document for every (document, n-gram) pair.
    doc_freq = {}
    for bag in ngramDocDict.values():
        for ngram in bag:
            doc_freq[ngram] = doc_freq.get(ngram, 0) + 1

    n_docs = len(ngramDocDict)
    ngram_statement_tfidf = {}
    for s_id, bag in ngramDocDict.items():
        total = ngram_count[s_id]
        ngram_statement_tfidf[s_id] = {
            ngram: (occurrences / total) * math.log(n_docs / doc_freq[ngram])
            for ngram, occurrences in bag.items()
        }

    return ngram_statement_tfidf

In [None]:
def tfidf_encoding(ls, use_idf=False):
    """Encode texts as a sparse matrix of normalized term weights.

    Args:
        ls: iterable of raw text documents.
        use_idf: False (the default, matching the previous behavior) gives
            normalized term frequencies only; True adds inverse-document-
            frequency weighting, i.e. actual TF-IDF.

    Returns:
        Sparse matrix with one row per document and one column per vocabulary term.
    """
    X_train_counts = CountVectorizer().fit_transform(ls)
    return TfidfTransformer(use_idf=use_idf).fit_transform(X_train_counts)

In [None]:
# TF-IDF weight of every word in every statement: {statement id: {word: weight}}.
statement_tfidf = cal_tfidf_doc(bag_of_words)

## Get Top 10 TF-IDF

In [None]:
heap = []
for doc_scores in statement_tfidf.values():
    for word, score in doc_scores.items():
        heappush(heap, (score, word))
        # Evict the smallest entry as soon as more than ten are held.
        if len(heap) > 10:
            heappop(heap)

In [None]:
# Drain the heap: the retained pairs are printed in ascending order.
while heap:
    score, word = heappop(heap)
    print(word, score)

# Word Embedding

In [None]:
# Texts used by the embedding and clustering cells below; switch to the
# cleaned-only variant by swapping the two lines.
death_notes = death_notes_lemma
#death_notes = death_notes_clean

## Using TFIDF 

In [None]:
# death_notes is a list of plain strings, so it is vectorized as-is;
# indexing the entries with ['death_note'] raised TypeError.
last_words = list(death_notes)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(last_words)

# Normalized term frequencies (no IDF weighting).
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
# toarray() yields a regular ndarray. todense() returns np.matrix, which
# recent scikit-learn estimators refuse as input.
X_train_tf = X_train_tf.toarray()

# Full TF-IDF weighting.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf = X_train_tfidf.toarray()

## Google W2V
* trained on Google News corpus
* model download link: https://code.google.com/archive/p/word2vec/

In [None]:
# loading Google News model
# Load the pre-trained Google News vectors (word2vec binary format).
filename = 'embedding/GoogleNews-vectors-negative300.bin'
google_model = KeyedVectors.load_word2vec_format(filename, binary=True)
# NOTE(review): `.vocab` is the gensim 3.x attribute; gensim 4 is understood to
# expose `key_to_index` instead - confirm against the installed version.
vocab = google_model.vocab.keys()

In [None]:
google_embedding_avg = []    # one mean word vector per statement
google_embedding_tfidf = []  # one TF-IDF-weighted mean word vector per statement
word_vect_list = [] # list storing only vectors of the words
word_not_vocab = [] # list storing words which are not in the Google News vocab

for s_id in range(len(death_notes_lemma)):
    # death_notes_lemma holds plain strings; indexing with ['death_note'] raised TypeError.
    words_list = death_notes_lemma[s_id].split()
    vec = 0
    count = 0
    vec_tfidf = 0
    tfidf_total = 0
    for word in words_list:
        if word in vocab:
            vec = vec + google_model[word]
            vec_tfidf += google_model[word] * statement_tfidf[s_id][word]
            tfidf_total += statement_tfidf[s_id][word]
            count += 1
        else:
            word_not_vocab.append(word)
    if count == 0:
        # No word of this statement is in the vocabulary: use a zero vector so
        # both lists stay aligned with the statements (this was a ZeroDivisionError).
        vec_avg = np.zeros(google_model.vector_size, dtype=np.float32)
    else:
        vec_avg = vec / count
    google_embedding_avg.append(vec_avg)
    # Every weight is zero when all known words occur in every statement
    # (idf = 0); fall back to the unweighted mean instead of dividing by zero.
    google_embedding_tfidf.append(vec_tfidf / tfidf_total if tfidf_total else vec_avg)

In [None]:
# Persist the unweighted Google News statement embeddings.
embed_file = "data/google_avg_embedding.pickle"
with open(embed_file, 'wb') as handle:
    pickle.dump(google_embedding_avg, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

In [None]:
# Persist the TF-IDF-weighted Google News statement embeddings.
embed_file = "data/google_weighted_embedding.pickle"
with open(embed_file, 'wb') as handle:
    pickle.dump(google_embedding_tfidf, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SpaCy W2V
* trained on written web text (blogs, news, comments)
* model link: https://spacy.io/models/

In [None]:
nlp_md = spacy.load('en_core_web_md')
spacy_embedding = []
for s_id in range(len(death_notes_lemma)):
    # death_notes_lemma holds plain strings; indexing with ['death_note'] raised TypeError.
    doc = nlp_md(death_notes_lemma[s_id])
    # doc.vector is spaCy's vector for the whole statement.
    spacy_embedding.append(doc.vector)

In [None]:
# Persist the spaCy statement vectors.
embed_file = "data/spacy_avg_embedding.pickle"
with open(embed_file, 'wb') as handle:
    pickle.dump(spacy_embedding, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# weighted average
spacy_embedding_tfidf = []  # one TF-IDF-weighted mean word vector per statement
for s_id in range(len(death_notes_lemma)):
    # death_notes_lemma holds plain strings; indexing with ['death_note'] raised TypeError.
    words_list = death_notes_lemma[s_id].split()
    vec_tfidf_spacy = 0
    vec_sum_spacy = 0
    tfidf_total = 0
    for word in words_list:
        doc = nlp_md(word)
        vec_tfidf_spacy += doc.vector * statement_tfidf[s_id][word]
        vec_sum_spacy = vec_sum_spacy + doc.vector
        tfidf_total += statement_tfidf[s_id][word]
    # Append exactly once per statement. The append used to sit inside the
    # word loop, which stored one running average per word.
    if tfidf_total:
        spacy_embedding_tfidf.append(vec_tfidf_spacy / tfidf_total)
    elif words_list:
        # All weights are zero (idf = 0 for every word): use the plain mean.
        spacy_embedding_tfidf.append(vec_sum_spacy / len(words_list))
    else:
        # Empty statement: zero vector keeps the list aligned with the statements.
        spacy_embedding_tfidf.append(np.zeros(nlp_md.vocab.vectors_length, dtype=np.float32))

In [None]:
# Persist the TF-IDF-weighted spaCy embeddings.
embed_file = "data/spacy_weighted_embedding.pickle"
with open(embed_file, 'wb') as handle:
    pickle.dump(spacy_embedding_tfidf, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Glove Embedding
* trained on Wikipedia 2014 + English Gigaword 5th
* model download link: http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
# load the Stanford GloVe model
# The GloVe vectors are read from a file in word2vec text format.
filename = 'embedding/glove.6B.300d.txt.word2vec'
glove_model = KeyedVectors.load_word2vec_format(filename, binary=False)
# NOTE(review): `.vocab` is the gensim 3.x attribute; gensim 4 is understood to
# expose `key_to_index` instead - confirm against the installed version.
glove_vocab = glove_model.vocab.keys()

In [None]:
glove_embedding_avg = []    # one mean word vector per statement
glove_embedding_tfidf = []  # one TF-IDF-weighted mean word vector per statement
word_not_vocab_glove = [] # list storing words which are not in the GloVe vocab

for s_id in range(len(death_notes_lemma)):
    # death_notes_lemma holds plain strings; indexing with ['death_note'] raised TypeError.
    words_list = death_notes_lemma[s_id].split()
    vec = 0
    count = 0
    vec_tfidf = 0
    tfidf_total = 0
    for word in words_list:
        if word in glove_vocab:
            vec = vec + glove_model[word]
            vec_tfidf += glove_model[word] * statement_tfidf[s_id][word]
            tfidf_total += statement_tfidf[s_id][word]
            count += 1
        else:
            word_not_vocab_glove.append(word)
    if count == 0:
        # No word of this statement is in the vocabulary: use a zero vector so
        # both lists stay aligned with the statements (this was a ZeroDivisionError).
        vec_avg = np.zeros(glove_model.vector_size, dtype=np.float32)
    else:
        vec_avg = vec / count
    glove_embedding_avg.append(vec_avg)
    # Every weight is zero when all known words occur in every statement
    # (idf = 0); fall back to the unweighted mean instead of dividing by zero.
    glove_embedding_tfidf.append(vec_tfidf / tfidf_total if tfidf_total else vec_avg)

In [None]:
# Persist the unweighted GloVe statement embeddings.
embed_file = "data/glove_avg_embedding.pickle"
with open(embed_file, 'wb') as handle:
    pickle.dump(glove_embedding_avg, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Persist the TF-IDF-weighted GloVe statement embeddings.
embed_file = "data/glove_weighted_embedding.pickle"
with open(embed_file, 'wb') as handle:
    pickle.dump(glove_embedding_tfidf, handle, protocol=pickle.HIGHEST_PROTOCOL)

## FastText
* trained on Common Crawl (600B tokens)
* model download link: https://fasttext.cc/docs/en/english-vectors.html

In [None]:
# load model
# Binary fastText model file; expected under embedding/.
fasttext_model = fastText.load_model('embedding/crawl-300d-2M-subword.bin')

In [None]:
# sentence embedding
fasttext_embedding = []
for s_id in range(len(death_notes_lemma)):
    # death_notes_lemma holds plain strings; indexing with ['death_note'] raised TypeError.
    fasttext_embedding.append(fasttext_model.get_sentence_vector(death_notes_lemma[s_id]))

In [None]:
# Persist the fastText sentence vectors.
embed_file = "data/fastText_embedding.pickle"
with open(embed_file, 'wb') as handle:
    pickle.dump(fasttext_embedding, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
ft_embedding_avg = []    # one mean word vector per statement
ft_embedding_tfidf = []  # one TF-IDF-weighted mean per statement (a list: a dict has no append())

for s_id in range(len(death_notes_lemma)):
    vec = 0
    c = 0
    vec_tfidf = 0
    tfidf_total = 0
    # death_notes_lemma holds plain strings. Split on whitespace, as the
    # bag-of-words cell does, so every word has a statement_tfidf entry;
    # word_tokenize can cut a word into pieces that have none (KeyError).
    for word in death_notes_lemma[s_id].split():
        word_vec = fasttext_model.get_word_vector(word)
        weight = statement_tfidf[s_id][word]
        vec = vec + word_vec
        vec_tfidf = vec_tfidf + word_vec * weight
        tfidf_total += weight
        c = c + 1
    if c == 0:
        # Empty statement: zero vector keeps both lists aligned with the statements.
        vec_avg = np.zeros(fasttext_model.get_dimension(), dtype=np.float32)
    else:
        vec_avg = vec / c
    ft_embedding_avg.append(vec_avg)
    # All weights are zero when every word occurs in every statement (idf = 0);
    # fall back to the unweighted mean instead of dividing by zero.
    ft_embedding_tfidf.append(vec_tfidf / tfidf_total if tfidf_total else vec_avg)


In [None]:
# Persist the unweighted fastText word-vector averages.
embed_file = "data/ft_avg_embedding.pickle"
with open(embed_file, 'wb') as handle:
    pickle.dump(ft_embedding_avg, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Persist the TF-IDF-weighted fastText averages.
embed_file = "data/ft_weighted_embedding.pickle"
with open(embed_file, 'wb') as handle:
    pickle.dump(ft_embedding_tfidf, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Universal Sentence Encoder
* trained on Stanford Natural Language Inference (SNLI) corpus (570k human-written English sentence pairs)
* model link: https://tfhub.dev/google/universal-sentence-encoder/2

In [None]:
# Load the Universal Sentence Encoder module from TF-Hub.
use_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
use_model = hub.Module(use_url)

In [None]:
# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

# Initialize variables and lookup tables, then embed all statements in one run() call.
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    use_embedding = session.run(use_model(death_notes))

In [None]:
# Persist the Universal Sentence Encoder embeddings.
embed_file = "data/use_embedding.pickle"
with open(embed_file, 'wb') as handle:
    pickle.dump(use_embedding, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Elmo
* trained on 1 Billion Word Benchmark
* model link: https://tfhub.dev/google/elmo/2

In [None]:
# Load the ELMo module from TF-Hub.
elmo_url = "https://tfhub.dev/google/elmo/2"
elmo_model = hub.Module(elmo_url)

In [None]:
# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

# Initialize variables and lookup tables, then embed all statements in one run() call.
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    elmo_embedding = session.run(elmo_model(death_notes))

In [None]:
# Persist the ELMo embeddings.
embed_file = "data/elmo_embedding.pickle"
with open(embed_file, 'wb') as handle:
    pickle.dump(elmo_embedding, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Clustering

In [None]:
# Representation to cluster: keep exactly one assignment uncommented.
embedding = X_train_tfidf
#embedding = google_embedding_avg
# embedding = google_embedding_tfidf
# embedding = glove_embedding_avg
# embedding = glove_embedding_tfidf
# embedding = spacy_embedding
# embedding = spacy_embedding_tfidf
# embedding = fasttext_embedding
# embedding = ft_embedding_avg
# embedding = ft_embedding_tfidf
# embedding = use_embedding
# embedding = elmo_embedding

In [None]:
# Sanity check; expected to be (number of statements, embedding dimension).
print(np.array(embedding).shape)

## Elbow Method

In [None]:
# NOTE(review): `metrics` and `cdist` are not used in this cell, and
# `plt` / `np` are already imported by the import cell at the top.
from sklearn import metrics
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
import numpy as np

# Largest number of clusters to try.
maximum = 25

_ = plt.plot()
distortions = []
K = range(2, maximum+1)
# Fit one model per k and record its inertia (within-cluster sum of squared distances).
for k in K:
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=2018)
    _ = kmeans.fit(embedding)
    _ = distortions.append(kmeans.inertia_)

# Plot the elbow
_ = plt.plot(K, distortions, 'bx-')
_ = plt.xlabel('k')
_ = plt.ylabel('Distortion')
_ = plt.title('The Elbow Method showing the optimal k')
plt.show()

## K-Means

In [None]:
# k is set by hand (presumably read off the elbow plot - confirm); the seed is fixed.
k = 5
kmeans = KMeans(n_clusters=k, init='k-means++', random_state=2018).fit(embedding)
# Cluster id of every statement, in the same order as `embedding`.
clusters = kmeans.predict(embedding)

## Get notes in each cluster

In [None]:
# Group the statement texts by assigned cluster id: {cluster id: [text, ...]}.
note_clusters = {
    c: [txt for i, txt in enumerate(death_notes) if clusters[i] == c]
    for c in set(clusters)
}

# Results Analysis

## Clustering Visualization

In [None]:
# Project the embedding to two dimensions with t-SNE (fixed seed).
arr = np.array(embedding)
tsne = TSNE(n_components=2, random_state=2018)
reduced = tsne.fit_transform(arr)       
# After the transpose, t[0] holds the first coordinate of every point and t[1] the second.
t = np.array(reduced).transpose()

In [None]:
# One marker and one color per cluster id. Both lists have 16 entries,
# so a cluster id of 16 or more would raise IndexError below.
markers = ["x", "v", "o", "s", "*", ">", "<", "P", 
           '1', '2', '3', '4', 'h', "d", "|", "+"]
colors = ['darkorange', 'steelblue', 'limegreen',  'salmon', 'y',  'violet', 'c', 'tomato', 
          'rosybrown', 'brown', 'darkmagenta', 'pink', 'gold', "orange", "skyblue", "seagreen"]

fig, ax = plt.subplots(figsize=(10, 10))

# One scatter call per point, styled by the point's cluster.
for x, y, c in zip(t[0], t[1], clusters):
    _ = ax.scatter(x, y, c=colors[c], marker=markers[c])

# Empty Line2D objects serve only as legend entries, one per cluster.
types = []
for c in set(clusters):
    types.append(Line2D([], [], color=colors[c], marker=markers[c], label=c))

# for i, c in enumerate(clusters):
#     if c == 5:
#         _ = ax.annotate(doc_txt[i], (t[0][i], t[1][i]), fontsize=18)

_ = plt.legend(handles=types, loc='lower left')
plt.show()

## Top 10 words in each cluster

In [None]:
#per doc within cluster
bag_of_words = {}  # {cluster id: {statement index: {word: occurrences}}}

for c_id in range(len(note_clusters)):
    bag_of_words[c_id] = {}

    for s_id in range(len(note_clusters[c_id])):
        # note_clusters holds plain statement strings, so each entry is split
        # directly; indexing it with ['death_note'] raised TypeError.
        words_list = note_clusters[c_id][s_id].split()
        counts = {}
        for word in words_list:
            counts[word] = counts.get(word, 0) + 1
        # Descending by count; the stable sort keeps first-appearance order for ties.
        bag_of_words[c_id][s_id] = dict(sorted(counts.items(), key=lambda kv: kv[1], reverse=True))

In [None]:
# for all doc within each cluster

statementFreqDict = {}
for c_id, cluster_bags in bag_of_words.items():
    # Add up each word's occurrences over all statements of this cluster.
    totals = {}
    for doc_counts in cluster_bags.values():
        for word, count in doc_counts.items():
            totals[word] = totals.get(word, 0) + count
    # Most frequent words of the cluster first.
    statementFreqDict[c_id] = dict(sorted(totals.items(), key=lambda kv: kv[1], reverse=True))

In [None]:
# Display the per-cluster word totals (every word of every cluster).
statementFreqDict

# Topic Modeling

In [38]:
n_features = 2600  # max_features passed to the TF-IDF vectorizer
n_topics = 5       # number of NMF components
n_top_words = 20   # words printed per topic

In [20]:
def print_top_words(model, feature_names, n_top_words):
    """Print one line per topic listing its highest-weighted features.

    Each row of ``model.components_`` is one topic. Its ``n_top_words`` largest
    entries are printed as ``name(weight)``, largest first, and a single blank
    line is printed after the last topic.
    """
    for topic_no, topic in enumerate(model.components_, start=1):
        ranked = topic.argsort()[::-1][:n_top_words]
        entries = []
        for i in ranked:
            entries.append(feature_names[i] + "(" + str(topic[i]) + ")")
        print("Topic #%d:" % topic_no, ", ".join(entries))
    print()

In [36]:
def print_top_words_f(model, feature_names, n_top_words):
    """Print each topic as a block of ``name<TAB> weight*100`` lines.

    The ``n_top_words`` largest entries of every row of ``model.components_``
    are printed, largest first, with the weight multiplied by 100 and truncated
    to an int. A blank line follows every topic and one more ends the output.
    """
    for topic_no, topic in enumerate(model.components_, start=1):
        print("Topic #%d:" % topic_no)
        ranked = topic.argsort()[::-1][:n_top_words]
        for i in ranked:
            scaled = int(topic[i]*100)
            print(feature_names[i] + "\t", scaled)
        print()
    print()

In [22]:
def get_top_words(model, feature_names, n_top_words):
    """Return, for every row of ``model.components_``, its top feature names.

    The result is a list with one inner list per topic, holding the names of
    the ``n_top_words`` largest entries, largest first.
    """
    return [
        [feature_names[i] for i in topic.argsort()[::-1][:n_top_words]]
        for topic in model.components_
    ]

In [39]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
# max_df=0.95 / min_df=2: skip terms found in more than 95% of the documents
# or in fewer than two; scikit-learn's English stop words are removed as well.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
t0 = time()
# death_notes_lemma is passed as a list of plain strings, one per statement.
tfidf = tfidf_vectorizer.fit_transform(death_notes_lemma)
print("done in %0.3fs." % (time() - t0))

Extracting tf-idf features for NMF...
done in 0.021s.


In [40]:
# Fit the NMF model
print("Fitting the NMF model with tf-idf features, n_features=%d..." % n_features)
t0 = time()
# NOTE(review): later scikit-learn releases are understood to have replaced
# NMF's `alpha` argument and `get_feature_names()` - confirm against the
# installed version before re-running.
nmf_model = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

# print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words_f(nmf_model, tfidf_feature_names, n_top_words)

Fitting the NMF model with tf-idf features, n_features=2600...
done in 0.055s.
Topic #1:
love	 188
know	 76
tell	 70
ya	 65
want	 62
strong	 40
everybody	 38
care	 32
say	 31
kill	 28
stay	 27
mom	 25
kid	 23
good	 22
family	 20
yes	 20
life	 20
man	 18
let	 18
appreciate	 17

Topic #2:
sorry	 165
family	 62
hope	 48
pain	 47
cause	 45
say	 39
like	 35
bring	 34
wish	 25
hurt	 25
know	 25
victim	 24
change	 21
year	 21
apologize	 20
forgive	 20
closure	 19
mr	 16
happen	 15
truly	 14

Topic #3:
statement	 141
decline	 85
offender	 83
make	 54
write	 19
final	 9
speak	 7
point	 2
innocent	 2
say	 1
hold	 1
bobby	 0
capital	 0
prepare	 0
warden	 0
talk	 0
portion	 0
omit	 0
girl	 0
gladly	 0

Topic #4:
thank	 171
family	 80
like	 68
friend	 50
warden	 48
love	 48
support	 43
yes	 39
ready	 24
jesus	 22
sir	 21
help	 19
sister	 17
apologize	 15
strong	 15
hope	 13
victim	 13
year	 13
wife	 12
brother	 11

Topic #5:
forgive	 95
god	 88
lord	 73
peace	 65
ask	 47
pray	 44
jesus	 42
heart	 3