## Building a Robot Judge
### 3. Text Data Essentials

In [1]:
# Setup: silence warnings, then load the case metadata.
import warnings
# Install a blanket 'ignore' filter before pandas is imported.
warnings.simplefilter('ignore')
import pandas as pd
# The CSV path is relative: start the notebook from the directory that holds the file.
df1 = pd.read_csv('death-penalty-cases.csv')

In [2]:
# Summarise the frame: number of rows, column names, non-null counts and dtypes.
df1.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32567 entries, 0 to 32566
Data columns (total 7 columns):
court_id     32567 non-null object
author_id    18215 non-null float64
state        32567 non-null object
year         32567 non-null int64
dateFiled    32567 non-null object
citeCount    32567 non-null int64
snippet      32567 non-null object
dtypes: float64(1), int64(2), object(4)
memory usage: 1.7+ MB


In [3]:
# Count how many rows carry each court_id value.
df1['court_id'].value_counts()

texapp             2577
texcrimapp         2380
fla                1927
cal                1310
ga                 1104
illappct           1077
pa                  930
miss                925
ill                 895
oklacrimapp         760
nc                  731
calctapp            723
tenncrimapp         723
alacrimapp          711
ohioctapp           611
ariz                552
wva                 545
ind                 515
la                  512
mo                  507
ark                 459
nysd                390
ala                 379
tenn                368
fladistctapp        359
nyed                325
sc                  323
nev                 294
pasuperct           291
wash                272
                   ... 
iasd                  7
akd                   6
wyd                   6
connsuperct           6
lamd                  6
vid                   5
oked                  5
oklaag                5
wiwd                  4
ilsd                  4
oknd            

In [4]:
# Draw one histogram per selected column (`year` and `citeCount`).
df1[['year','citeCount']].hist()

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f9b29f9ae10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f9b29cd4080>]],
      dtype=object)

In [5]:
###################################
# Iterating over documents in a dataframe
###################################
from txt_utils import process_document

# Build the processed-data dictionary in one pass: each row's index label is
# the document identifier, and the value is whatever process_document returns
# for that row's text snippet.
processed = {
    docid: process_document(snippet)
    for docid, snippet in zip(df1.index, df1['snippet'])
}

In [6]:
###################################
# Iterating over documents in text files
###################################
import os
from glob import glob

# Select every file in contracts/ whose name ends in "txt" (wildcard match).
fnames = glob('contracts/*txt')

# Iterate over the files, adding each one to the `processed` dictionary.
for fname in fnames:
    # Document id = file name without its directory and without its last four
    # characters (the ".txt" suffix). os.path.basename copes with whichever
    # path separator glob returns on this platform, unlike splitting on '/'.
    docid = os.path.basename(fname)[:-4]
    # Read the whole file as one string. The context manager closes the handle
    # right away instead of leaving it open until garbage collection.
    with open(fname) as infile:
        text = infile.read()
    document = process_document(text) # get sentences/tokens
    processed[docid] = document # add to dictionary

In [7]:
###################################
# Saving data in python
###################################
# Serialize the `processed` dictionary to a pickle file in the working directory.
# Pickle files can execute code when loaded, so only unpickle files you trust.
pd.to_pickle(processed, 'processed_corpus.pkl')

In [8]:
# Merging Data-frames
#df_merged = pd.merge(df1,df2,on='id', how='left')

In [9]:
###################################
# Screen Scraping
###################################

# `import urllib` on its own does not load the `request` submodule; it only
# appears to work when another package has already imported it. Import the
# submodule explicitly so this cell runs on a fresh kernel.
import urllib.request # Python's module for accessing web pages
url = 'https://goo.gl/VRF8Xs' # shortened URL for court case

# Open the web page and read it inside a context manager so the HTTP
# connection is closed as soon as the body has been fetched.
with urllib.request.urlopen(url) as page:
    html = page.read() # raw page contents as a bytes object (not decoded)

print(html[:400])  # print first 400 bytes
print(html[-400:]) # print last 400 bytes
print(len(html))   # print length in bytes

b'<!DOCTYPE html>\n<html lang="en">\n<head>\n  <meta charset="utf-8"/>\n  <meta http-equiv="Content-Language" content="en"/>\n  <meta name="language" content="en_us"/>\n  <meta name="viewport" content="width=device-width,initial-scale=1"/>\n\n  \n  <meta name="description" content="Opinion for People v. Germany, 674 P.2d 345 \xe2\x80\x94 Brought to you by Free Law Project, a non-profit dedicated to creating high qual'
b'ik.php?idsite=1"\n                    style="border:0;" alt=""/></p></noscript>\n  <!-- End Piwik Code -->\n\n<!--[if lt IE 10 ]>\n<script src="//ajax.googleapis.com/ajax/libs/chrome-frame/1.0.3/CFInstall.min.js"></script>\n<script>window.attachEvent(\'onload\',function(){CFInstall.check({mode:\'inline\', url:\'/bad-browser/\', cssText: \'width: 100%; height: 200px;\' })})</script>\n<![endif]-->\n</body>\n</html>\n'
78389


In [10]:
###################################
# HTML parsing
###################################

# Parse raw HTML
from bs4 import BeautifulSoup # package for parsing HTML
soup = BeautifulSoup(html, 'lxml') # parse the downloaded page with the lxml parser
print(soup.title) # example usage: print the page's <title> element

<title>People v. Germany, 674 P.2d 345 – CourtListener.com</title>


In [11]:
# extract text
# NOTE: `text` is read again by the sentiment-analysis cell further down.
text = soup.get_text() # get text (remove HTML markup)
lines = text.splitlines() # split string into a list of lines
print(len(lines)) # print number of lines

546


In [12]:
# Keep only non-empty strings; lines containing just whitespace are retained.
lines = [entry for entry in lines if entry]
print(len(lines)) # how many lines remain
print(lines[:20]) # the first 20 remaining lines

201
['People v. Germany, 674 P.2d 345 – CourtListener.com', 'Toggle navigation', 'About', 'FAQ', 'Tour', 'Donate', 'Sign in / Register', 'From Free Law Project, a 501(c)(3) non-profit.', 'Opinions\xa0', 'Advanced Search', 'Citation Look Up', 'RECAP Archive', 'Oral Arguments', 'Judges', 'Visualizations\xa0', 'Gallery', 'SCOTUS Networks', 'New Network', '\xa0Donate', 'Your Notes']


In [13]:
###################################
# Removing unicode characters
###################################

from unidecode import unidecode # package for transliterating Unicode text to plain ASCII
fixed = unidecode('Visualizations\xa0') # example usage: the non-breaking space \xa0 comes out as a regular space
print(fixed) # print cleaned string

Visualizations 


In [14]:
#############
# Translation
#############

from googletrans import Translator
translator = Translator()
# Detect the language of a Korean example sentence and keep its language code.
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: 'NoneType' object has no attribute 'group'", so `lang` was
# never assigned and the translate cell that follows could not run. This looks
# like a googletrans version/API problem rather than a mistake in this cell --
# TODO confirm and pin a googletrans release that works.
lang = translator.detect('이 문장은 한글로 쓰여졌습니다.').lang
lang

AttributeError: 'NoneType' object has no attribute 'group'

In [None]:
# Translate the same sentence into English, passing the previously detected
# language code as the source language, then show the translated text.
eng = translator.translate('이 문장은 한글로 쓰여졌습니다.',
                           src=lang,
                           dest='en')
eng.text

In [None]:
##########
# Exploring a Corpus
##########
# Keep only the label and text columns. The explicit .copy() makes df1 an
# independent frame, so adding columns to it later (e.g. num_words) does not
# trigger pandas' SettingWithCopy check -- a warning the blanket 'ignore'
# filter from the setup cell would otherwise hide.
df1 = df1[['state','snippet']].copy()
# Number of documents
len(df1['snippet'])

In [None]:
# Number of label categories (e.g. states): read it from the `unique` entry
# of the summary below.
df1['state'].describe()

In [None]:
# Number of samples per class (how many rows each state has)
df1['state'].value_counts()

In [None]:
# Words per sample
def get_words_per_sample(txt):
    """Return how many whitespace-separated tokens ``txt`` contains."""
    tokens = txt.split()
    return len(tokens)
# Store each snippet's word count in a new `num_words` column, then summarise
# the distribution of those counts.
df1['num_words'] = df1['snippet'].apply(get_words_per_sample)
df1['num_words'].describe()

In [None]:
# Frequency distribution over words
from collections import Counter
freqs = Counter()
# Walk the snippet column directly; every snippet is lower-cased and split on
# whitespace before its tokens are tallied.
for snippet in df1['snippet']:
    freqs.update(snippet.lower().split())
# The 20 most frequent tokens with their counts.
freqs.most_common(20)

In [None]:
# (Number of samples) / (median number of words per sample)
# The 1500 threshold mentioned below comes from Google's text classification
# guide, which defines "words per sample" as the median, so the median is used
# here instead of the mean.
len(df1['snippet']) / df1['num_words'].median()
# if this is above 1500, we will use the sequence representation recommended by Google
In [None]:
# Sentiment Analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
# Score `text`. In top-to-bottom execution this is the plain text extracted
# from the scraped web page in the HTML-parsing cells, not a snippet from df1.
# NOTE(review): presumably the VADER lexicon has to be downloaded already for
# the analyzer to load -- confirm in the target environment.
polarity = sid.polarity_scores(text)
polarity

In [None]:
# Score a 20% random sample of the snippets and show the two lowest-scoring ones.
# The seed is fixed so that re-running the notebook selects the same rows.
dfs = df1.sample(frac=.2, random_state=42)

def get_sentiment(snippet):
    """Return the 'compound' entry of VADER's polarity scores for ``snippet``."""
    return sid.polarity_scores(snippet)['compound']

dfs['sentiment'] = dfs['snippet'].apply(get_sentiment)
# Sort ascending by sentiment; reassign instead of sorting in place so the
# cell gives the same result when re-run.
dfs = dfs.sort_values('sentiment')
dfs['snippet'].head(2).tolist()