In [1]:
import utils
# Setup: silence warnings, switch to the course directory, and load the corpus.
# NOTE(review): the 'ignore' filter suppresses every warning for the rest of
# the session, so real problems reported as warnings will not be shown.
import warnings; warnings.simplefilter('ignore')
# Set this to your working directory. The value below is an absolute,
# machine-specific path and has to be edited before running elsewhere.
WORKING_DIR = '/home/elliott/Dropbox/_Ash_Teaching/2018-09 - Bocconi - Text Data and ML/code'
import os
os.chdir(WORKING_DIR)
import pandas as pd
# Relative file name: it is looked up in WORKING_DIR because of the chdir above.
df1 = pd.read_csv('death-penalty-cases.csv')
%matplotlib notebook

In [2]:
###################################
# Screen Scraping
###################################

# `import urllib` alone does not load the `request` submodule, so import it
# explicitly; otherwise `urllib.request` can fail on a fresh kernel.
import urllib.request # Python's module for accessing web pages
url = 'https://goo.gl/VRF8Xs' # shortened URL for court case

# Open the page inside a `with` block so the HTTP response is closed as soon
# as its contents have been read.
with urllib.request.urlopen(url) as page:
    html = page.read() # read web page contents as raw bytes (not decoded)

print(html[:400])  # print first 400 bytes
print(html[-400:]) # print last 400 bytes
print(len(html))   # print length in bytes

b'<!DOCTYPE html>\n<html lang="en">\n<head>\n  <meta charset="utf-8"/>\n  <meta http-equiv="Content-Language" content="en"/>\n  <meta name="language" content="en_us"/>\n  <meta name="viewport" content="width=device-width,initial-scale=1"/>\n\n  \n  <meta name="description" content="Opinion for People v. Germany, 674 P.2d 345 \xe2\x80\x94 Brought to you by Free Law Project, a non-profit dedicated to creating high qual'
b'ik.php?idsite=1"\n                    style="border:0;" alt=""/></p></noscript>\n  <!-- End Piwik Code -->\n\n<!--[if lt IE 10 ]>\n<script src="//ajax.googleapis.com/ajax/libs/chrome-frame/1.0.3/CFInstall.min.js"></script>\n<script>window.attachEvent(\'onload\',function(){CFInstall.check({mode:\'inline\', url:\'/bad-browser/\', cssText: \'width: 100%; height: 200px;\' })})</script>\n<![endif]-->\n</body>\n</html>\n'
77663


In [3]:
#############
# Translation
#############

from googletrans import Translator

translator = Translator()  # translation client, reused by the next cell
# Detect the language of a sample sentence and keep only its language code.
detection = translator.detect('이 문장은 한글로 쓰여졌습니다.')
lang = detection.lang
lang

'ko'

In [4]:
# Translate the sample sentence into English, passing the previously
# detected language code as the source language.
korean_sentence = '이 문장은 한글로 쓰여졌습니다.'
eng = translator.translate(korean_sentence, src=lang, dest='en')
eng.text

'This sentence was written in Korean.'

In [5]:
###################################
# HTML parsing
###################################

# Parse raw HTML
from bs4 import BeautifulSoup # package for parsing HTML

# Build a parse tree from the downloaded page using the lxml parser.
soup = BeautifulSoup(markup=html, features='lxml')
page_title = soup.title # the document's <title> element
print(page_title)

<title>People v. Germany, 674 P.2d 345 – CourtListener.com</title>


In [6]:
# Strip the HTML markup, keeping only the text content of the page.
text = soup.get_text()
# Break the text into a list with one entry per line.
lines = text.splitlines()
n_lines = len(lines)
print(n_lines) # number of lines, empty ones included

534


In [7]:
# Keep only the non-empty strings (whitespace-only lines are retained).
lines = [ln for ln in lines if ln]
print(len(lines)) # number of lines that remain
print(lines[:20]) # preview the first 20 lines

199
['People v. Germany, 674 P.2d 345 – CourtListener.com', 'Toggle navigation', 'About', 'FAQ', 'Tour', 'Donate', 'Sign in / Register', 'From Free Law Project, a 501(c)(3) non-profit.', 'Opinions', 'RECAP Archive', 'Oral Arguments', 'Judges', 'Visualizations\xa0', 'Gallery', 'SCOTUS Networks', 'New Network', 'Donate', 'Your Notes', '                    (edit)', '                     ']


In [8]:
###################################
# Transliterating non-ASCII characters
###################################

from unidecode import unidecode # converts non-ASCII characters to ASCII text
# Example input: a scraped menu label ending in a non-breaking space (\xa0).
sample = 'Visualizations\xa0'
fixed = unidecode(sample)
print(fixed) # print cleaned string

Visualizations 


In [9]:
##########
# Exploring a Corpus
##########
# Keep only the label and text columns. The explicit .copy() makes df1 an
# independent frame, so adding columns to it later does not raise pandas'
# SettingWithCopyWarning (which the global warnings filter would hide).
df1 = df1[['state', 'snippet']].copy()
# Number of documents
len(df1)

32567

In [10]:
# Summary of the label column; `unique` is the number of label categories
# (e.g. states).
state_labels = df1['state']
state_labels.describe()

count     32567
unique       55
top          TX
freq       5256
Name: state, dtype: object

In [11]:
# Number of samples per class, most frequent class first.
# NOTE(review): the labels include codes such as 'ON' and 'KA' that do not
# look like US state/territory abbreviations — verify against the data source.
samples_per_class = df1['state'].value_counts()
samples_per_class

TX    5256
FL    2405
CA    2337
IL    2187
PA    1445
GA    1395
TN    1183
AL    1156
MS    1079
NY     958
NC     921
OH     857
OK     815
LA     799
MO     749
AZ     699
IN     570
WV     564
AR     530
WA     433
NJ     427
VA     427
MD     418
MA     394
SC     382
KY     376
NV     316
MI     301
DC     273
OR     263
ID     262
DE     257
CO     229
KS     212
UT     174
NE     162
CT     151
NM     138
IA     136
MT     129
ON     122
WY     114
WI     111
SD      87
ME      75
PR      51
MN      48
RI      43
HI      43
AK      32
VT      29
NH      28
ND      13
VI       5
KA       1
Name: state, dtype: int64

In [12]:
# Words per sample
def get_words_per_sample(txt):
    """Return the number of whitespace-separated tokens in `txt`."""
    tokens = txt.split()
    return len(tokens)
# Count the words of every snippet and store the counts as a new column.
word_counts = df1['snippet'].apply(get_words_per_sample)
df1['num_words'] = word_counts
# Distribution of snippet lengths (in words)
df1['num_words'].describe()

count    32567.000000
mean        45.305923
std         30.635400
min          5.000000
25%         17.000000
50%         34.000000
75%         82.000000
max        121.000000
Name: num_words, dtype: float64

In [13]:
# Frequency distribution over words
from collections import Counter

freqs = Counter()
# Iterate over the text column directly: iterrows() builds a Series for every
# row, which is needlessly slow when only one column is read.
for snippet in df1['snippet']:
    # Lower-case, then split on whitespace; punctuation stays attached to
    # tokens (e.g. 'penalty.' and 'penalty,' are counted separately).
    freqs.update(snippet.lower().split())
# Ask for the top 20 directly instead of sorting the whole vocabulary
# and slicing the resulting list.
freqs.most_common(20)

[('the', 141618),
 ('death', 90159),
 ('penalty', 60719),
 ('of', 41969),
 ('to', 34120),
 ('in', 28431),
 ('a', 24591),
 ('and', 24492),
 ('that', 23523),
 ('for', 15641),
 ('penalty.', 15626),
 ('not', 14041),
 ('is', 13386),
 ('was', 12941),
 (';', 10853),
 ('&', 10576),
 (',', 9571),
 ('penalty,', 9136),
 ('court', 8926),
 ('be', 8333)]

In [14]:
# (Number of samples) / (mean number of words per sample)
# If this ratio is above 1500, we will use the sequence representation
# recommended by Google (see lecture 16).
# NOTE(review): Google's text-classification guide appears to define
# "words per sample" as the median rather than the mean — confirm which
# statistic is intended here.
n_samples = len(df1)
n_samples / df1['num_words'].mean()

718.8243328602663