# Data compiling

This notebook lives in the folder where all the data is stored.  It takes the html files and parses them looking for plaintiff, defendant, and location data.  When it's done, it stores all the given information in a pandas dataframe.

In [31]:
import os
import pickle as pkl
import re
import sys
import time
from collections import defaultdict

import bs4
import pandas as pd

In [4]:
os.listdir()

['.DS_Store',
 '.ipynb_checkpoints',
 'census_data.csv',
 'Data prep (revised for long data).ipynb',
 'Housing',
 'Housing - Copy.zip',
 'housing downloader',
 "Processing Bill's data.ipynb",
 'Richards thesis.pdf',
 'thesis',
 'Thesis work']

In [8]:
os.getcwd()

'/Users/josephgpalin/Desktop/transfer/thesis'

In [10]:
# Move into the "Housing" data folder unless we are already inside it.
# Compare the last path component instead of searching the whole path, so
# that an ancestor directory whose name merely contains "Housing" does not
# silently suppress the chdir.
if os.path.basename(os.getcwd()) != "Housing":
    os.chdir("Housing")

# peek at the first few entries of the data folder
os.listdir()[:20]

['.DS_Store',
 '01H77SP000080.html',
 '01H77SP002057.html',
 '01H77SP003928.html',
 '01H79SP000314.html',
 '01H79SP001985.html',
 '01H79SP002790.html',
 '01H79SP003298.html',
 '01H79SP003684.html',
 '01H83SP00148.html',
 '01H83SP02521.html',
 '01H83SP03328.html',
 '01H84SP000363.html',
 '01H84SP000684.html',
 '01H84SP000700.html',
 '01H84SP003575.html',
 '01H84SP003637.html',
 '01H84SP003898.html',
 '01H85SP002199.html',
 '01H85SP003707.html']

In [11]:
# get the first html file name for parsing.  Filter by extension rather than
# taking a fixed position: os.listdir() returns entries in arbitrary order and
# non-html entries (e.g. .DS_Store) may or may not come first.
first = next(name for name in sorted(os.listdir()) if name.endswith(".html"))

In [12]:
first

'01H77SP000080.html'

In [13]:
with open(first,"r") as f:
    source = f.read()

In [14]:
# the source for the page.
source

'<!DOCTYPE html>\n<html>\n<!-- $Id: EServiceBaseLayoutPage.html,v 1.11 2015/07/17 21:22:42 zcarter Exp $ -->\n<head>\n    <meta http-equiv="X-UA-Compatible" content="IE=edge"/>\n\t<meta http-equiv="Content-Type" content="text/html;charset=UTF-8"/>\n\t<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;">\n\t<title>Massachusetts Trial Court</title>\n\t<script type="text/javascript">window.cjsBrowserRenderStartTime = new Date();</script>\n\t<link rel="icon" type="image/png" href="../images/favicon.ico"/>\n\t<link rel="stylesheet" type="text/css" href="css/eaccess.css?v=1.18.05" media="screen"/>\n\t<link rel="stylesheet" type="text/css" href="css/initialize.css?v=1.18.05" media="screen"/>\n\t<link rel="stylesheet" type="text/css" href="css/eServ_fonts.css?v=1.18.05" media="screen"/>\n\t<link rel="stylesheet" type="text/css" href="css/eServ_header.css?v=1.18.05" media="screen"/>\n\t<link rel="stylesheet" type="text/css" href="css/eServ_application.css?v=

I need to extract the actual data.  Beautiful soup?

In [15]:
"Northeast Housing" in source

True

In [16]:
# Import beautiful soup
import bs4

# turn the html into soup we can parse
test = bs4.BeautifulSoup(source,"html.parser")

In [17]:
# Take a look at the soup.  It's not very useful
print(test.prettify())

<!DOCTYPE html>
<html>
 <!-- $Id: EServiceBaseLayoutPage.html,v 1.11 2015/07/17 21:22:42 zcarter Exp $ -->
 <head>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="text/html;charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" name="viewport"/>
  <title>
   Massachusetts Trial Court
  </title>
  <script type="text/javascript">
   window.cjsBrowserRenderStartTime = new Date();
  </script>
  <link href="../images/favicon.ico" rel="icon" type="image/png"/>
  <link href="css/eaccess.css?v=1.18.05" media="screen" rel="stylesheet" type="text/css"/>
  <link href="css/initialize.css?v=1.18.05" media="screen" rel="stylesheet" type="text/css"/>
  <link href="css/eServ_fonts.css?v=1.18.05" media="screen" rel="stylesheet" type="text/css"/>
  <link href="css/eServ_header.css?v=1.18.05" media="screen" rel="stylesheet" type="text/css"/>
  <link href="css/eServ_application.css?v=1.18.05" media="screen" rel="styl

# Find the defendants

In [18]:
# Find our defendants
# Party roles sit in elements with class "ptyType"; for every role that
# mentions "Defendant", the sibling immediately before it is stripped and
# printed.
role_tags = test.find_all(attrs={"class": "ptyType"})
for role_tag in role_tags:
    if "Defendant" not in role_tag.text:
        continue
    print("Defendant: ", role_tag.previous_sibling.strip())
    

Defendant:  Duong Au, Phat


# Find the house address

In [19]:
# Find the property address
# Start from an empty list so this cell still works when the page has no
# "additionalCaseInfoPanel" element, and never displays a value left over
# from an earlier run of the notebook.
data = []
location = test.find_all(attrs={"class": "additionalCaseInfoPanel"})
for element in location:
    # keep the stripped text of every non-empty <span>; if several panels
    # exist, the last one wins
    data = [thing.get_text().strip() for thing in element.find_all("span") if thing.get_text() != ""]

tuple(data)

('Property Address', '279', '2nd Floor', 'Lawrence', 'MA', '01843')

# Find the court

In [20]:
# Use the text of the first <span> that ends with "Court" as the court name.
for element in test.find_all("span"):
    if element.get_text().endswith("Court"):
        court = element.get_text()
        break
else:
    # no span named a court: fall back to the same label the batch loops
    # below use (the original literal here was misspelled "Proccess")
    court = "Summary Process"

print(court)

Northeast Housing Court


# Putting it together:

Depending on whether you're doing a test run or a real run, use an abbreviated piece of the data.  I meant to bring the 24MB condensed data with me, but unfortunately left that on my drive in Boston, so I'm going to work with a small subset.

In [21]:
%%time

collection = []

begin = 125000
end = 125100

for i,file in enumerate(os.listdir()[begin:end]):
    
    # only open html files
    if file.endswith(".html"):

        with open(file,"r") as f:
            source = f.read()
            soup = bs4.BeautifulSoup(source,"html.parser")

            # a list for tracking the defendants
            defs = []
            
            # store all the defendants
            for element in soup.find_all(attrs = {"class": "ptyType"}):
                if "Defendant" in element.text:
                    defs.append(element.previous_sibling.strip())

            # Find the property address
            location_data = soup.find_all(attrs = {"class": "additionalCaseInfoPanel"})
            for element in location_data:
                
                # extract the lines of the address
                location = tuple([thing.get_text().strip() for thing in element.find_all("span") if thing.get_text() != ""]  )

            # find the court district if available
            for element in soup.find_all("span"):
                if element.get_text().endswith("Court"):
                    court = element.get_text()
                    break
            # list "summary process if court not formally named"
            else:
                court = "Summary Process"

        # at the end, store the results
        to_add = [(i, defendant, location, court) for defendant in defs]
        collection.extend(to_add)

CPU times: user 6.41 s, sys: 173 ms, total: 6.58 s
Wall time: 7.06 s


7 seconds for 100 data points on my macbook air.

In [22]:
collection

[(0,
  'Mercurio, Danniel',
  ('Property Address',
   '73',
   'Riverlin',
   'Street',
   '2B',
   'Millbury',
   'MA',
   '01527'),
  'Worcester Housing Court'),
 (0,
  'Manna, Michael',
  ('Property Address',
   '73',
   'Riverlin',
   'Street',
   '2B',
   'Millbury',
   'MA',
   '01527'),
  'Worcester Housing Court'),
 (1,
  'Grandinetti, Anthony',
  ('Property Address',
   '73',
   'Riverlin',
   'Street',
   '2A',
   'Millbury',
   'MA',
   '01527'),
  'Worcester Housing Court'),
 (2,
  'Walker, Ricky Allen',
  ('Property Address',
   '66',
   'Laurel',
   'Street',
   '2nd floor',
   'Worcester',
   'MA',
   '01605'),
  'Worcester Housing Court'),
 (2,
  'Walker, Alfreda Frazier',
  ('Property Address',
   '66',
   'Laurel',
   'Street',
   '2nd floor',
   'Worcester',
   'MA',
   '01605'),
  'Worcester Housing Court'),
 (2,
  'Frazier, Shemeka',
  ('Property Address',
   '66',
   'Laurel',
   'Street',
   '2nd floor',
   'Worcester',
   'MA',
   '01605'),
  'Worcester Housing 

Note, late in the data set, we have proper addresses.  Early in the data set, we have street numbers and apartment numbers, but not full addresses.

Something I'm curious about: how many unique addresses do we have?  The address lookup for census geocoding is slow, so we should store any data we have along the way so we don't need to find it again.

Keep a dictionary with street addresses, and match them with census tracts.

In [29]:
# How many unique addresses in 100:
# element 2 of each record is the address tuple; drop its first entry
# (the "Property Address" label) and keep the remaining address parts
addresses = [record[2][1:] for record in collection]

In [33]:
# tally how many times each address tuple occurs
unique_addresses = defaultdict(int)
for address in addresses:
    unique_addresses[address] = unique_addresses[address] + 1

# the number of keys is the number of distinct addresses
len(unique_addresses)

100

Not going to see a huge reduction in lookups with 100 addresses, but maybe with thousands?

# How much data do we have to process?

In [34]:
# If we process all the data:
len(os.listdir())

130409

# Run this on the full data

In [81]:
%%time

collection = []

for i,file in enumerate(os.listdir()):
    
    # only open html files
    if file.endswith(".html"):

        with open(file,"r") as f:
            source = f.read()
            soup = bs4.BeautifulSoup(source,"html.parser")

            # a list for tracking the defendants
            defs = []
            
            # store all the defendants
            for element in soup.find_all(attrs = {"class": "ptyType"}):
                if "Defendant" in element.text:
                    defs.append(element.previous_sibling.strip())

            # Find the property address
            location_data = soup.find_all(attrs = {"class": "additionalCaseInfoPanel"})
            for element in location_data:
                
                # extract the lines of the address
                location = tuple([thing.get_text().strip() for thing in element.find_all("span") if thing.get_text() != ""]  )

            # find the court district if available
            for element in soup.find_all("span"):
                if element.get_text().endswith("Court"):
                    court = element.get_text()
                    break
            # list "summary process if court not formally named"
            else:
                court = "Summary Process"

        # at the end, store the results
        to_add = [(i, defendant, location, court) for defendant in defs]
        collection.extend(to_add)
        
        
        if i %1000 == 0:
            print(i)
            
# Started at 1:38 PM.  Should finish around 2:10

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
Wall time: 49min 23s


## Lesson learned

If I process all the data in one large list, I see a huge hit on run time.  I should have split this up over multiple lists, and then combined them, or simply read them in chunks into a data frame.

In [82]:
len(collection)

169629

Expected that we should have more than 130k entries, as lots of residences will have multiple tenants.  Kinda surprised there weren't more people named on more eviction records.

In [83]:
collection[-10:]

[(130402,
  'Mason, Tracey',
  ('Property Address',
   '54',
   'Greendale',
   'Road',
   '2L',
   'Mattapan',
   'MA',
   '02126'),
  'Summary Process'),
 (130403,
  'Monteiro, Lanier',
  ('Property Address',
   '97',
   'Devon',
   'Street',
   '2',
   'Dorchester',
   'MA',
   '02121'),
  'Summary Process'),
 (130404,
  'Castiglioni, Jillian',
  ('Property Address', '25', 'Park', 'Street', 'Blackstone', 'MA', '01504'),
  'Worcester Housing Court'),
 (130404,
  'Beane, Carl',
  ('Property Address', '25', 'Park', 'Street', 'Blackstone', 'MA', '01504'),
  'Worcester Housing Court'),
 (130405,
  'Council, Letica',
  ('Property Address',
   '1301',
   'Main',
   'Street',
   '2',
   'Fitchburg',
   'MA',
   '01420'),
  'Summary Process'),
 (130406,
  'English, Gloria',
  ('Property Address', '251', 'Tacoma', 'Street', 'Worcester', 'MA', '01605'),
  'Worcester Housing Court'),
 (130406,
  'Mangabang, David',
  ('Property Address', '251', 'Tacoma', 'Street', 'Worcester', 'MA', '01605'),
 

In [86]:
import pickle as pkl

In [88]:
# Persist the parsed records.  The module is imported under the alias `pkl`,
# so it must be referenced by that name; the bare name `pickle` is not bound
# by any import shown in this notebook.
with open("compressed_records.pickle", "wb") as f:
    pkl.dump(collection, f)

That should have pickled our data.  Let's test that we saved it by reloading it:

In [89]:
# Reload the records to confirm the round trip.  The module is imported under
# the alias `pkl`, so use that name.
# NOTE: only unpickle files you created yourself; loading an untrusted pickle
# can execute arbitrary code.
with open("compressed_records.pickle", "rb") as f:
    new_collection = pkl.load(f)

# should match len(collection)
len(new_collection)

169629

In [90]:
import sys

In [93]:
sys.getsizeof(new_collection)

1386112

In [94]:
new_collection[-10:]

[(130402,
  'Mason, Tracey',
  ('Property Address',
   '54',
   'Greendale',
   'Road',
   '2L',
   'Mattapan',
   'MA',
   '02126'),
  'Summary Process'),
 (130403,
  'Monteiro, Lanier',
  ('Property Address',
   '97',
   'Devon',
   'Street',
   '2',
   'Dorchester',
   'MA',
   '02121'),
  'Summary Process'),
 (130404,
  'Castiglioni, Jillian',
  ('Property Address', '25', 'Park', 'Street', 'Blackstone', 'MA', '01504'),
  'Worcester Housing Court'),
 (130404,
  'Beane, Carl',
  ('Property Address', '25', 'Park', 'Street', 'Blackstone', 'MA', '01504'),
  'Worcester Housing Court'),
 (130405,
  'Council, Letica',
  ('Property Address',
   '1301',
   'Main',
   'Street',
   '2',
   'Fitchburg',
   'MA',
   '01420'),
  'Summary Process'),
 (130406,
  'English, Gloria',
  ('Property Address', '251', 'Tacoma', 'Street', 'Worcester', 'MA', '01605'),
  'Worcester Housing Court'),
 (130406,
  'Mangabang, David',
  ('Property Address', '251', 'Tacoma', 'Street', 'Worcester', 'MA', '01605'),
 

Also save with pandas:

In [95]:
import pandas as pd

In [96]:
# Each record is a 4-tuple (file index, defendant, address tuple, court), so
# the frame gets four positional columns labelled 0-3.
df = pd.DataFrame(new_collection)

In [97]:
df

Unnamed: 0,0,1,2,3
0,1,"Duong Au, Phat","(Property Address, 279, 2nd Floor, Lawrence, M...",Northeast Housing Court
1,2,"Jolly, Kerri","(Property Address, 34, #1, 1st Fl, Salem, MA, ...",Northeast Housing Court
2,2,"Staples, Richard","(Property Address, 34, #1, 1st Fl, Salem, MA, ...",Northeast Housing Court
3,3,"Milord, Luna","(Property Address, 3, #4, Lynn, MA, 01902)",Northeast Housing Court
4,4,"Figueroa, Alicia","(Property Address, 64, Apt. 1L, Springfield, M...",Western Housing Court
5,5,"Sanchez, Santos","(Property Address, 78-T, Springfield, MA, 01108)",Western Housing Court
6,6,"Rojas, Glicelia","(Property Address, 302, #2A, Holyoke, MA, 01040)",Western Housing Court
7,7,"Montanez, Marty","(Property Address, 25, Springfield, MA, 01107)",Western Housing Court
8,7,"Sieger, Ossana","(Property Address, 25, Springfield, MA, 01107)",Western Housing Court
9,8,"Auston, Sharon","(Property Address, 8, #1, Springfield, MA, 01109)",Western Housing Court


In [98]:
df.to_csv("housing_dataframe")

# Reloading the data

In [None]:
# The file was written by df.to_csv("housing_dataframe") with the default
# index=True, so its first column is the saved row index.  Read it back as the
# index instead of letting it show up as a stray "Unnamed: 0" data column.
df = pd.read_csv("housing_dataframe", index_col=0)

In [399]:
%%time

# dictionary for saving data
my_dict = {}

# initialize a counter to see how many files were processed
count = 0

for name in os.listdir():
    
    # only process the html files.  Don't process
    # hidden files, .ipynb files, directories, et cetera
    if name.endswith(".html"):
        
        # in case you want to see how many files were processed
        count += 1
        
        # open and read the html
        with open(name,"r") as f:
            text = f.read()
            
        # turn the text into beautiful soup for data extraction
        soup = bs4.BeautifulSoup(text,"html.parser")
        
        # grab the fields we're currently interested in
        data = get_people_class_address(soup)
        
        # store the data by case number in our dictionary
        my_dict[data["case"]] = data
        
        new_file_name = data["case"] + ".html"
        
        # try to rename the file to match the case ID
        try:
            # rename if possible
            os.rename(name,new_file_name)
            
        except:
            # if the above errors, it's because the
            # case was processed in another file.
            
            # delete redundant files
            os.remove(name)

Wall time: 2.28 s


In [395]:
with open("test.html","w") as f:
    f.write("\n")

In [397]:
os.remove("test.html")

In [398]:
os.listdir()

['.ipynb_checkpoints',
 '18H84CV000528.html',
 '18H84CV000529.html',
 '18H84CV000530.html',
 '18H84CV000531.html',
 '18H84CV000532.html',
 '18H84CV000534.html',
 '18H84CV000535.html',
 '18H84CV000536.html',
 '18H84CV000537.html',
 '18H84CV000547.html',
 '18H84CV000548.html',
 '18H84CV000550.html',
 '18H84CV000551.html',
 '18H84CV000552.html',
 '18H84CV000553.html',
 '18H84CV000554.html',
 '18H84CV000559.html',
 '18H84CV000566.html',
 '18H84CV000567.html',
 '18H84CV000568.html',
 '18H84CV000570.html',
 '18H84CV000574.html',
 '18H84SP004018.html',
 '18H84SP004019.html',
 '18H84SP004020.html',
 '18H84SP004021.html',
 '18H84SP004022.html',
 '18H84SP004024.html',
 '18H84SP004026.html',
 '18H84SP004027.html',
 '18H84SP004028.html',
 '18H84SP004029.html',
 '18H84SP004030.html',
 '18H84SP004031.html',
 '18H84SP004032.html',
 '18H84SP004033.html',
 '18H84SP004035.html',
 '18H84SP004036.html',
 '18H84SP004037.html',
 '18H84SP004038.html',
 '18H84SP004040.html',
 '18H84SP004042.html',
 '18H84SP00

In [384]:
file_name

'18H84CV000530.html'

In [378]:
my_dict

{'18H84CV000528': {'Defendant': [('Forde', 'Lotlene')],
  'Plaintiff': [('Clarke', 'Sonia')],
  'case': '18H84CV000528',
  'city': 'Dorchester Center, MA, 02124',
  'close date': '09/14/2018',
  'file date': '09/04/2018',
  'street': '44 Stockton Street, 3'},
 '18H84CV000529': {'Defendant': [('Ramos Rodriguez', 'Maria')],
  'Plaintiff': [('Ramos', 'Angel'), ('Jin', 'Xiaxia')],
  'case': '18H84CV000529',
  'city': 'Roxbury Crossing, MA, 02120',
  'close date': '09/24/2018',
  'file date': '09/04/2018',
  'street': '22 Saint Alphonsus Street, A'},
 '18H84CV000530': {'Defendant': [('Zhong Xing Long USA Investments',)],
  'Plaintiff': [('Baez', 'Carmen')],
  'case': '18H84CV000530',
  'city': 'Chelsea, MA, 02150',
  'close date': '09/14/2018',
  'file date': '09/05/2018',
  'street': '944 Broadway, 3'},
 '18H84CV000531': {'Defendant': [('Boston Housing Authority',)],
  'Plaintiff': [('Allison', 'Rita')],
  'case': '18H84CV000531',
  'city': 'Dorchester Center, MA, 02124',
  'close date': '

# Pandas!

Now that we have data, we need to turn it into a dataframe:

In [379]:
import pandas as pd

In [380]:
# Build one row per case from the nested dict, then replace the case-number
# index with a plain 0..n-1 range in a single chained expression.
df = pd.DataFrame.from_dict(my_dict, orient="index").reset_index(drop=True)

In [381]:
df

Unnamed: 0,case,street,city,file date,close date,Defendant,Plaintiff
0,18H84CV000528,"44 Stockton Street, 3","Dorchester Center, MA, 02124",09/04/2018,09/14/2018,"[(Forde, Lotlene)]","[(Clarke, Sonia)]"
1,18H84CV000529,"22 Saint Alphonsus Street, A","Roxbury Crossing, MA, 02120",09/04/2018,09/24/2018,"[(Ramos Rodriguez, Maria)]","[(Ramos, Angel), (Jin, Xiaxia)]"
2,18H84CV000530,"944 Broadway, 3","Chelsea, MA, 02150",09/05/2018,09/14/2018,"[(Zhong Xing Long USA Investments,)]","[(Baez, Carmen)]"
3,18H84CV000531,"91 Ames Street, Box C 237","Dorchester Center, MA, 02124",09/05/2018,09/12/2018,"[(Boston Housing Authority,)]","[(Allison, Rita)]"
4,18H84CV000532,"1 Nazing Court, 14","Dorchester, MA, 02121",09/05/2018,09/18/2018,"[(Villalobos, Property Manager, Carmen)]","[(Deas, Cullen)]"
5,18H84CV000534,"9 Kenberma Road, 2 & 3","Dorchester Center, MA, 02124",09/05/2018,09/18/2018,"[(Lopes, Jose), (Lopes, Ricardo), (Lopes, Fran...","[(McDonald, Deneine)]"
6,18H84CV000535,"1049 Tremont Street, 42","Roxbury Crossing, MA, 02120",09/06/2018,09/18/2018,"[(Ayala, Angel)]","[(Arch Development LP,)]"
7,18H84CV000536,"38 Howard, Street","Cambridge, MA, 02139",09/06/2018,09/18/2018,"[(Murriel Toussaint, Rogera C.)]","[(Greene, Nieta M.)]"
8,18H84CV000537,"1 Dove, Street","Dorchester, MA, 02125",09/06/2018,09/07/2018,"[(Waite, Austin)]","[(Thompson, Samira)]"
9,18H84CV000547,"56 Charles Street, 3","Dorchester, MA, 02122",09/12/2018,09/20/2018,"[(Blaisdell, Stephanie)]","[(Sheals, Sandra)]"


In [400]:
df[df.case == "18H84SP004084"]

Unnamed: 0,case,street,city,file date,close date,Defendant,Plaintiff
68,18H84SP004084,"508 Massachusetts Avenue, 4","Boston, MA, 02118",09/05/2018,09/18/2018,"[(Monroig, Angel), (Salerno, Lisa)]","[(Tenants' Development Corporation,)]"


In [369]:
# select the Defendant entry for one case with a single .loc lookup
df.loc[df.case == "18H84SP004084", "Defendant"]

18H84SP004084    [Monroig, Angel, Salerno, Lisa]
Name: Defendant, dtype: object

In [402]:
# saving our processed data to file
# NOTE(review): the Defendant and Plaintiff columns hold Python lists of
# tuples; CSV will presumably store them as their text representation, so a
# reader of court_data.csv may need to parse them back — confirm.
df.to_csv("court_data.csv")