# Install Dependencies

In [1]:
!pip install sklearn
!pip install matplotlib
!pip install scrapy

!pip install fastText
!pip install spacy
!pip install nltk
!pip install tensorflow
!pip install tensorflow_hub

Collecting pickle
[31m  Could not find a version that satisfies the requirement pickle (from versions: )[0m
[31mNo matching distribution found for pickle[0m
Collecting sklearn
  Downloading https://files.pythonhosted.org/packages/1e/7a/dbb3be0ce9bd5c8b7e3d87328e79063f8b263b2b1bfa4774cb1147bfcd3f/sklearn-0.0.tar.gz
Building wheels for collected packages: sklearn
  Running setup.py bdist_wheel for sklearn ... [?25ldone
[?25h  Stored in directory: /Users/lucmeng/Library/Caches/pip/wheels/76/03/bb/589d421d27431bcd2c6da284d5f2286c8e3b2ea3cf1594c074
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0


# Import Dependencies

In [1]:
import numpy as np
import pickle
import csv
import json
import re
from pprint import pprint

from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

import nltk
from nltk.corpus import stopwords
import fastText
import spacy
import tensorflow as tf
import tensorflow_hub as hub

import logging
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
from urllib.parse import urljoin

In [2]:
# Settings for notebook
# With "all", IPython displays the value of every top-level expression in a
# cell rather than only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Show Python version
# Recorded so readers can see which interpreter produced the outputs below.
import platform
platform.python_version()

'3.6.6'

# Crawling Data

## Get Offenders Info

### Setup a pipeline

In [3]:
class OffenderInfoWriterPipeline(object):
    """Item pipeline that stores every scraped item as one JSON line.

    The output goes to ``data/offender_info_results.json``; the file is
    opened in ``open_spider`` and closed in ``close_spider``.
    """

    def open_spider(self, spider):
        """Open (and truncate) the output file and keep the handle on ``self.file``."""
        self.file = open('data/offender_info_results.json', 'w+')

    def close_spider(self, spider):
        """Close the output file handle held on ``self.file``."""
        self.file.close()

    def process_item(self, item, spider):
        """Write ``item`` as a single JSON line and return it unchanged."""
        serialized = json.dumps(dict(item))
        self.file.write(serialized + "\n")
        return item

### Define the spider

In [4]:
class OffenderInfoSpider(scrapy.Spider):
    """Spider that scrapes the executed-offenders index table.

    Yields one dict per table row (after the header row) containing the
    offender's basic fields plus absolute links to the info page and the
    last-statement page. Items are written by ``OffenderInfoWriterPipeline``.
    """
    name = "OffenderInfo"
    start_urls = [
        'http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html'
    ]
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.OffenderInfoWriterPipeline': 1}
    }

    def parse(self, response):
        """Extract one item per row of the ``tdcj_table indent`` table.

        Args:
            response: the downloaded index page.

        Yields:
            dict with keys ``first_name``, ``last_name``, ``age``, ``date``,
            ``race``, ``country``, ``info_link`` and ``death_note_link``.
        """
        # Bug fix: print() does not perform %-interpolation on extra
        # arguments, so the original printed a literal "%s" followed by the URL.
        print('A response from %s just arrived!' % response.url)
        sel = Selector(response)

        table = sel.xpath('//table[@class="tdcj_table indent"]/tr')
        # Skip the first <tr>, which is the header row.
        for tr in table[1:]:
            # hrefs are relative; resolve them against the page URL.
            # NOTE(review): a missing href becomes the string 'None' and is
            # joined into a bogus URL -- confirm every row carries both links.
            url_info = urljoin(response.url, str(tr.xpath('td[2]/a/@href').extract_first()))
            url_stmt = urljoin(response.url, str(tr.xpath('td[3]/a/@href').extract_first()))

            yield {
                'first_name': tr.xpath('td[5]/text()').extract_first(),
                'last_name': tr.xpath('td[4]/text()').extract_first(),
                'age': tr.xpath('td[7]/text()').extract_first(),
                'date': tr.xpath('td[8]/text()').extract_first(),
                'race': tr.xpath('td[9]/text()').extract_first(),
                # NOTE(review): key may have been meant as 'county'; it is left
                # unchanged because the saved JSON files already use it.
                'country': tr.xpath('td[10]/text()').extract_first(),
                'info_link': url_info,
                'death_note_link': url_stmt
            }
        

### Start the crawler

In [5]:
# Create a crawler process, schedule OffenderInfoSpider on it, and start the crawl.
# The spider's own custom_settings route items to OffenderInfoWriterPipeline.
process = CrawlerProcess()
process.crawl(OffenderInfoSpider)
process.start()

2018-11-25 17:02:51 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: scrapybot)
2018-11-25 17:02:51 [scrapy.utils.log] INFO: Versions: lxml 4.2.5.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.1, w3lib 1.19.0, Twisted 17.5.0, Python 3.6.6 |Anaconda custom (64-bit)| (default, Jun 28 2018, 11:07:29) - [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 18.0.0 (OpenSSL 1.0.2p  14 Aug 2018), cryptography 2.3.1, Platform Darwin-17.7.0-x86_64-i386-64bit
2018-11-25 17:02:51 [scrapy.crawler] INFO: Overridden settings: {'LOG_LEVEL': 30}


<Deferred at 0x1a2e33cfd0>

A response from %s just arrived! http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html


In [6]:
# Peek at the first scraped record to confirm which fields were written.
# The `with` block closes the file handle, which the original left open.
with open('data/offender_info_results.json', 'r') as file:
    lines = file.readlines()
line = lines[0]
obj = json.loads(line)
pprint(list(obj))

['first_name',
 'last_name',
 'age',
 'date',
 'race',
 'country',
 'info_link',
 'death_note_link']


## Extract Death Note

In [3]:
urls = []
objs = []

# Load every offender record (one JSON object per line) and collect the
# last-statement URL of each; DeathNoteSpider uses both lists.
# The `with` block closes the file handle, which the original left open.
with open('data/offender_info_results.json', 'r') as file:
    lines = file.readlines()
for line in lines:
    objs.append(json.loads(line))
    urls.append(objs[-1]['death_note_link'])

### Setup a pipeline

In [4]:
class DeathNoteWriterPipeline(object):
    """Item pipeline that stores every scraped item as one JSON line.

    The output goes to ``data/death_note_results.json``; the file is opened
    in ``open_spider`` and closed in ``close_spider``.
    """

    def open_spider(self, spider):
        """Open (and truncate) the output file and keep the handle on ``self.file``."""
        self.file = open('data/death_note_results.json', 'w+')

    def close_spider(self, spider):
        """Close the output file handle held on ``self.file``."""
        self.file.close()

    # Bug fix: the method name had stray markdown pasted into it
    # ("process_it### Define the spiderem"), which is a syntax error and left
    # the pipeline without the process_item hook.
    def process_item(self, item, spider):
        """Write ``item`` as a single JSON line and return it unchanged."""
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

### Define the spider

In [5]:
import logging
from scrapy.selector import Selector

class DeathNoteSpider(scrapy.Spider):
    """Spider that fetches each last-statement page and attaches its text.

    Relies on the notebook globals ``urls`` (pages to fetch) and ``objs``
    (offender records keyed by their ``death_note_link``). Each yielded item
    is the matching record with a ``death_note`` field added; items are
    written by ``DeathNoteWriterPipeline``.
    """
    name = "DeathNote"
    start_urls = urls

    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.DeathNoteWriterPipeline': 1}
    }

    def parse(self, response):
        """Extract the statement text and yield the enriched offender record.

        Args:
            response: a downloaded last-statement page.

        Yields:
            The record from ``objs`` whose ``death_note_link`` matches this
            page, with ``death_note`` set (empty string when no text found).
        """
        sel = Selector(response)

        # str() turns a missing node (None) into the string 'None', which is
        # filtered out below together with the 'Last Statement:' label.
        first = str(sel.xpath('//div[@id="content_right"]/p[6]/text()').extract_first()).strip()
        second = str(sel.xpath('//div[@id="content_right"]/p[7]/text()').extract_first()).strip()

        parts = []
        if first and first != 'Last Statement:' and first != 'None':
            parts.append(first)
        if second and second != 'None':
            parts.append(second)
        # Join with a space so the last word of one paragraph is not fused
        # with the first word of the next.
        death_note = ' '.join(parts)

        # After a redirect response.url no longer equals the stored link;
        # the redirect middleware keeps the originally requested URL in
        # meta['redirect_urls'].
        requested_url = response.meta.get('redirect_urls', [response.url])[0]
        candidates = (requested_url, response.url)
        obj = next((o for o in objs if o['death_note_link'] in candidates), None)
        if obj is None:
            # The original indexed [0] here and raised IndexError on no match.
            self.logger.warning('No offender record matches %s', response.url)
            return
        obj['death_note'] = death_note

        yield obj

### Start the crawler

In [6]:
# Create a crawler process with a browser-like user agent, schedule
# DeathNoteSpider on it, and start the crawl.
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
# Pass the spider class, not an instance: crawl() builds the spider itself,
# and newer Scrapy releases reject spider objects here.
process.crawl(DeathNoteSpider)
process.start()

2018-11-25 17:03:21 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: scrapybot)
2018-11-25 17:03:21 [scrapy.utils.log] INFO: Versions: lxml 4.2.5.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.1, w3lib 1.19.0, Twisted 17.5.0, Python 3.6.6 |Anaconda custom (64-bit)| (default, Jun 28 2018, 11:07:29) - [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 18.0.0 (OpenSSL 1.0.2p  14 Aug 2018), cryptography 2.3.1, Platform Darwin-17.7.0-x86_64-i386-64bit
2018-11-25 17:03:21 [scrapy.crawler] INFO: Overridden settings: {'LOG_LEVEL': 30, 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}


<Deferred at 0x1a2e821320>

# Data Cleaning

In [None]:
# Load the records that carry a 'death_note' field. The original read
# data/offender_info_results.json, whose records have no 'death_note' key,
# so the cleaning cells below would fail with KeyError. The death-note
# crawler writes its enriched records to data/death_note_results.json.
# The `with` block also closes the file handle, which the original left open.
with open('data/death_note_results.json', 'r') as file:
    lines = file.readlines()
objs = []
for line in lines:
    objs.append(json.loads(line))

## String Format Converting

In [None]:
# Drop every non-ASCII character from each non-blank statement, in place.
for record in objs:
    text = record['death_note']
    if text.strip():
        ascii_bytes = text.encode('ascii', 'ignore')
        record['death_note'] = ascii_bytes.decode("utf-8")

## Remove Punctuation and Stopwords

In [None]:
# English stopwords, keeping "not" because it carries meaning.
# Bug fix: set('not') is the set of characters {'n', 'o', 't'}, so the
# original never removed the word "not" from the stopword set.
stop_words = set(stopwords.words('english')) - {'not'}
# Matches every character that is not an ASCII letter.
rm_punc = re.compile('[^a-zA-Z]')

In [None]:
# Build cleaned copies of the records: non-letters replaced by spaces,
# stopwords and the placeholder word "none" removed. Records whose
# statement ends up empty are dropped.
objs_clean = []
death_notes_clean = []
for record in objs:
    letters_only = rm_punc.sub(' ', record['death_note'])
    words = [
        token for token in letters_only.split()
        if token not in stop_words
        and token.lower() not in stop_words
        and token.lower() != 'none'
    ]

    if words:
        note = ' '.join(words)
        death_notes_clean.append(note)
        # Bug fix: the original indexed `obj[i]`, a stale single record left
        # over from an earlier cell, instead of `objs[i]`. A copy is stored
        # so `objs` keeps the raw text and this cell can be re-run safely.
        objs_clean.append(dict(record, death_note=note))

In [None]:
# Serialize objs_clean to data/death_note_clean.pickle with the highest
# pickle protocol this interpreter supports.
with open('data/death_note_clean.pickle', 'wb') as handle:
    pickle.dump(objs_clean, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Number of statements that still contain words after cleaning.
len(death_notes_clean)

## Lemmatization

In [None]:
# Load the spaCy model 'en_core_web_lg'; clean() reads this global `nlp` object.
nlp = spacy.load('en_core_web_lg')

In [None]:
def clean(text):
    """Return ``text`` as a space-joined string of lower-cased lemmas.

    The text is parsed with the global ``nlp`` pipeline; tokens whose lemma
    is the ``'-PRON-'`` placeholder are skipped.
    """
    lemmas = []
    for tok in nlp(text):
        if tok.lemma_ == '-PRON-':
            continue
        lemmas.append(tok.lemma_.lower().strip())
    return ' '.join(lemmas)

In [None]:
# Lemmatize every cleaned statement and collect both the bare texts and
# full records carrying the lemmatized text.
objs_lemma = []
death_notes_lemma = []
for record in objs_clean:
    note = clean(record['death_note'])
    death_notes_lemma.append(note)
    # Store a copy: the original overwrote 'death_note' on the dicts held in
    # objs_clean, so objs_clean and objs_lemma ended up sharing the same
    # (lemmatized) objects and re-running the cell re-lemmatized its output.
    objs_lemma.append(dict(record, death_note=note))

In [None]:
# Serialize objs_lemma to data/death_notes_lemma.pickle with the highest
# pickle protocol this interpreter supports.
with open('data/death_notes_lemma.pickle', 'wb') as handle:
    pickle.dump(objs_lemma, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Bag-of-Words

In [None]:
# Per-statement word counts.
#   bag_of_words[s_id]         -> {word: count}, ordered by descending count
#   statement_word_count[s_id] -> total number of words in the statement
# The original read `last_words.iloc[s_id]['LastStatement']`, but no
# `last_words` frame is defined anywhere in this notebook, so the cell failed
# on a fresh kernel. It now reads the lemmatized notes built above.
# NOTE(review): confirm the lemmatized notes are the intended input.
bag_of_words = {}
statement_word_count = {}
for s_id, statement in enumerate(death_notes_lemma):
    words_list = statement.split()
    statement_word_count[s_id] = len(words_list)
    counts = {}
    for word in words_list:
        counts[word] = counts.get(word, 0) + 1
    # sorted() is stable, so words with equal counts keep first-seen order.
    bag_of_words[s_id] = dict(sorted(counts.items(), key=lambda kv: kv[1], reverse=True))

# Word Embedding

# Clustering

In [None]:
# Choose which version of the notes the following cells work on: the
# lemmatized notes (active) or the stopword-filtered notes (commented out).
death_notes = death_notes_lemma
# death_notes = death_notes_clean

## Get notes in each cluster

In [None]:
# Map each distinct cluster label to the list of notes assigned to it.
# `clusters[i]` is taken as the label of `death_notes[i]`.
note_clusters = {
    label: [txt for idx, txt in enumerate(death_notes) if clusters[idx] == label]
    for label in set(clusters)
}

# Results Analysis

## Clustering Visualization

# Top K Words