## Web Scraping - Individual Compass Listings

Objective: write a script that scrapes relevant information from **rental property** listings.

Features to scrape:
* Bedrooms ✓
* Bathrooms ✓
* Location ✓
* Rent Price ✓
* Square footage ✓
* Year built ✓
* MLS # ✓
* MLS Type ✓
* Furnished ✓
* Laundry ✓
* Parking Spaces ✓

_Note: walk score information will need to be appended separately since Compass does not list this information_

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import pickle
import time
import random
#import rental_listing

---

**To Do:**
- organize github project repo ☺
- make functions pull info directly from soup ☺
- set up looping functionality to address section and beyond ☺
- figure out how to store functions in .py file ☹

---


In [17]:
def get_location_vars(soup):
    location_soup = soup.find('div', class_='app__StyledLocation-sc-1qqu9tk-26 ljAjTR section-padding')
    location = [i.text.split(', ') for i in location_soup.find_all('li')]
    if len(location[-1]) == 2:
        location.append(location[-1])
        location[-2] = [location[-2][0]]
        location[-1] = [location[-1][1]]
    else:
        location.append(None)
    location = [i[0] for i in location]
    return location

def get_price_bed_bath(soup):
    
    price_bed_bath = [i.find('div', class_ = 'textIntent-title2').text.lower()
                       for i in soup.find_all(class_='summary__StyledSummaryDetailUnit-e4c4ok-13 dsPYTb')]
    if price_bed_bath[1] == 'studio':
        price_bed_bath[1] = 0
    price_bed_bath[0] = price_bed_bath[0][1:].replace(',','')
    price_bed_bath = [float(i) for i in price_bed_bath]
    #some properties list half baths, which appears as a third integer in the price_bed_bath list
    if len(price_bed_bath) < 4:
        while len(price_bed_bath) < 4:
            price_bed_bath.append(0)
    return price_bed_bath

def get_sqft(soup):
    sqft_raw = soup.find('div', class_ = 'sc-fzqKVi custom-ranges-hide__CustomRangesHide-sc-19a3hp9-0 fEUcgh u-flexContainer--row')\
    .find('div', class_ = "textIntent-title2").text
    try:
        sqft = int(sqft_raw.replace(',', ''))
    except ValueError:
        sqft = 'NaN'
    return sqft

def parse_main_table(soup): 
    main_table_td = [i.text for i in soup.find('table').find_all('td')]
    main_table_th = [i.text for i in soup.find('table').find_all('th')]
    m_f_mt_y_c = []
    global m_f_mt_y_c_headers
    m_f_mt_y_c_headers = ['MLS #', 'Furnished', 'MLS Type', 'Year Built', 'County']
    for i in zip(main_table_th, main_table_td):
        if i[0] in m_f_mt_y_c_headers:
            m_f_mt_y_c.append(i[1])
    return m_f_mt_y_c

def get_laundry_type(soup):
    desc_puller = soup.text.lower()
    in_unit_terms = ['in-unit', 'in unit', 'laundry: yes', 'laundry: washer/dryer', 'laundry: dryer, washer', 'laundry: washer, dryer']
    laundry_snippet = 'none'
    for term in in_unit_terms:
        if term in desc_puller:
            in_unit = 1
            laundry_snippet = desc_puller[desc_puller.find(term)-25:desc_puller.find(term)+25]
            break
        else:
            in_unit = 0
    return [in_unit, laundry_snippet]

def get_parking(soup):
    all_text = soup.text.lower()
    parking = ''
    #check if the number of parking spaces is listed in standard field
    parking_index_standard_field = all_text.find('num of parking spaces')
    idx_num_parking_spaces = parking_index_standard_field + len('num of parking spaces')
    
    parking = 'parking_unknown'
    
    if parking_index_standard_field >= 0:
        try:
            if int(all_text[idx_num_parking_spaces]) > 0:
                parking = 'parking'
            else:
                parking = 'no_parking'
        except ValueError:
            pass
            parking = 'parking_unknown'
    
    snippet = all_text[all_text.find('parking')-10:
                                   all_text.find('parking')+30]
    
    return [parking, snippet]

---

load pickles

---

In [28]:
with open('alameda_county_listings_200114.pickle','rb') as read_file:
    listings = pickle.load(read_file)

In [29]:
listings[0]



In [12]:
#use the requests library to grab information from the web

# perkins = '314-perkins-street-unit-210-oakland-ca-94610/675388719410670665/'
# caldecott = '240-caldecott-lane-unit-314-oakland-ca-94618/691931505828679169/'
# manila = '5241-manila-avenue-oakland-ca-94618/692224378130178033/'
# ninth_st = '585-9th-street-unit-319-oakland-ca-94607/670361183768239601/'

rental_url_base = 'https://www.compass.com'
urls = [rental_url_base + details for details in url_details]

#responses = [requests.get(url) for url in urls]

responses = []

for url in urls:
    response = requests.get(url)
    responses.append(response)
    print(len(responses))
    time.sleep(random.randint(1,3))
    

# 200 = success
# [response.status_code for response in responses]


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [13]:
# compile text from each listing into a list
listings = [response.text for response in responses]

In [14]:
with open('san_francisco_county_listings_200115.pickle', 'wb') as to_write:
    pickle.dump(listings, to_write)

In [15]:
# use beautiful soup to parse the page
rentals_soup = [BeautifulSoup(listing, "lxml") for listing in listings]

In [16]:
get_location_vars(rentals_soup[2])

['San Francisco', '94131', 'Noe Valley', '1647 Sanchez Street', 'N/A']

---
---

**address information**

---
---

![image.png](attachment:023d9371-3699-4583-aaf3-0631ac0463a8.png)

---

In [332]:
locations = []
for rental in rentals_soup:
    try:
        location = get_location_vars(rental)
        print(location)
        locations.append(location)
    except AttributeError:
        location = ['unknown']
        print(location)
        locations.append(location)
        
#locations = [get_location_vars(rental) for rental in rentals_soup]
#locations

['Oakland', '94618', 'Parkwoods', '240 Caldecott Lane', 'Unit 314']
['Albany', '94706', 'Hillside', '625 Jackson Street', 'Unit 2']
['Emeryville', '94608', 'North Hollis', '1254 65th Street', 'N/A']
['Berkeley', '94702', "Poet's Corner", '1338 Addison Street', 'N/A']
['Emeryville', '94608', 'Watergate', '4 Anchor Drive', 'Unit F241']
['San Leandro', '94578', 'East Oakland Terrace', '1985 170th Avenue', 'N/A']
['Oakland', '94618', 'Upper Rockridge', '5844 Margarido Drive', 'N/A']
['Oakland', '94610', 'Adams Point', '306 Lee Street', 'Unit 304']
['Berkeley', '94709', 'North Berkeley', '1609 Bonita Avenue', 'Unit 1']
['Oakland', '94607', 'Lower Bottoms', '1810 14th Street', 'N/A']
['Oakland', '94607', 'Chinatown', '801 Franklin Street', 'Unit 722']
['Oakland', '94610', 'Rose Garden', '551 Jean Street', 'N/A']
['Oakland', '94611', 'Harrison St-Oakland Ave', '3273 Kempton Avenue', 'N/A']
['Oakland', '94611', 'Piedmont Avenue', '11 Monte Vista Avenue', 'N/A']
['Emeryville', '94608', 'North H

In [333]:
address_cols = ['City', 'Zip', 'Neighborhood', 'Address_1', 'Address_2']
data_so_far = pd.DataFrame(columns = address_cols,
                            data = locations)
data_so_far.shape

(190, 5)

---
---

**price, beds, baths, sq footage**

---
---

![image.png](attachment:0a08e293-3532-4f7c-911d-d062e8feee91.png)

---

In [340]:
beds_baths = [get_price_bed_bath(rental) for rental in rentals_soup]
beds_baths

[[2500.0, 1.0, 1.0, 0],
 [2850.0, 2.0, 2.0, 0],
 [4250.0, 2.0, 2.0, 1.0],
 [2375.0, 2.0, 1.0, 0],
 [2275.0, 1.0, 1.0, 0],
 [3900.0, 3.0, 3.0, 0],
 [8995.0, 5.0, 4.0, 1.0],
 [2595.0, 1.0, 1.0, 0],
 [3100.0, 2.0, 1.0, 0],
 [3000.0, 2.0, 2.0, 1.0],
 [3000.0, 2.0, 1.0, 1.0],
 [1995.0, 1.0, 1.0, 0],
 [3750.0, 3.0, 2.0, 0],
 [1650.0, 1.0, 1.0, 0],
 [2325.0, 1.0, 1.0, 0],
 [2500.0, 2.0, 1.0, 0],
 [2500.0, 2.0, 1.0, 0],
 [2495.0, 2.0, 1.0, 1.0],
 [2950.0, 2.0, 2.0, 0],
 [3950.0, 3.0, 2.0, 0],
 [1998.0, 0.0, 1.0, 0],
 [2650.0, 2.0, 2.0, 0],
 [3700.0, 2.0, 1.0, 1.0],
 [3300.0, 3.0, 1.0, 0],
 [4000.0, 2.0, 2.5, 0],
 [2975.0, 3.0, 1.0, 0],
 [2200.0, 2.0, 1.0, 0],
 [6000.0, 3.0, 3.0, 0],
 [3000.0, 3.0, 2.0, 0],
 [2650.0, 2.0, 1.0, 0],
 [3150.0, 3.0, 2.0, 0],
 [2800.0, 3.0, 2.0, 0],
 [2895.0, 2.0, 2.0, 0],
 [2700.0, 3.0, 2.0, 0],
 [1750.0, 1.0, 1.0, 0],
 [2500.0, 1.0, 1.0, 0],
 [2450.0, 2.0, 2.0, 0],
 [4100.0, 4.0, 3.0, 0],
 [2880.0, 2.0, 1.0, 0],
 [2300.0, 2.0, 1.0, 0],
 [2300.0, 2.0, 1.0, 1.0],
 [

In [342]:
price_bed_bath_cols = ['Price', 'Beds', 'Baths', 'Half_Baths']
pbb_df = pd.DataFrame(columns = price_bed_bath_cols, data = beds_baths)
data_so_far = pd.concat([data_so_far, pbb_df], axis = 1)
data_so_far['Sq_Footage'] = [get_sqft(rental) for rental in rentals_soup]
data_so_far.shape

(190, 14)

---
---

**MLS#, furnished, MLS type, year built, county**

---
---

![image.png](attachment:5ac2f10e-8130-4a29-ae01-7b05c983ce3c.png)

---

In [343]:
ms_fs_mts_ys_cs = [parse_main_table(rental) for rental in rentals_soup]

In [344]:
main_table_df = pd.DataFrame(columns = m_f_mt_y_c_headers, data = ms_fs_mts_ys_cs)

In [345]:
data_so_far = pd.concat([data_so_far, main_table_df], axis = 1)
data_so_far.head()

Unnamed: 0,City,Zip,Neighborhood,Address_1,Address_2,Price,Beds,Baths,Half_Baths,Sq_Footage,Price.1,Beds.1,Baths.1,Half_Baths.1,MLS #,Furnished,MLS Type,Year Built,County
0,Oakland,94618,Parkwoods,240 Caldecott Lane,Unit 314,2500.0,1.0,1.0,0.0,773,2500.0,1.0,1.0,0.0,511967,-,Apartment Lease / Apartment,1996,Alameda County
1,Albany,94706,Hillside,625 Jackson Street,Unit 2,2850.0,2.0,2.0,0.0,1112,2850.0,2.0,2.0,0.0,40933893BR,-,Residential Lease,-,Alameda County
2,Emeryville,94608,North Hollis,1254 65th Street,,4250.0,2.0,2.0,1.0,1296,4250.0,2.0,2.0,1.0,40932990,-,LEASE RENTAL / Apartment/Condo for Rent,2003,Alameda County
3,Berkeley,94702,Poet's Corner,1338 Addison Street,,2375.0,2.0,1.0,0.0,600,2375.0,2.0,1.0,0.0,40933389BR,-,Residential Lease,-,Alameda County
4,Emeryville,94608,Watergate,4 Anchor Drive,Unit F241,2275.0,1.0,1.0,0.0,660,2275.0,1.0,1.0,0.0,40933210,-,LEASE RENTAL / Apartment/Condo for Rent,1973,Alameda County


---
---

**washer/dryer in-unit**

---
---

**the issue**: there are several possible locations on the rental listing home page for laundry information to be listed and the inputs are not standardized. 

**working solution**: using the observation that properties that have laundry available tend to mention this in the property description, find the term ```'laundry'``` or ```'washer'``` and store a few words on either side in a dataframe. From this text snippet, find a way to categorize listings as ```'no laundry'```, ```'laundry in building'```, or ```'laundry in-unit'```.

In [346]:
laundries = [get_laundry_type(rental)[0] for rental in rentals_soup]
laundry_snippets = [get_laundry_type(rental)[1] for rental in rentals_soup]

In [347]:
data_so_far['laundry_type'] = laundries
data_so_far['laundry_snippet'] = laundry_snippets
data_so_far.shape

(190, 21)

---
---

**parking spaces**

---
---

In [348]:
parking_spots = [get_parking(rental)[0] for rental in rentals_soup]
parking_snippet = [get_parking(rental)[1] for rental in rentals_soup]

In [349]:
parking_snippet

['raditionalparking typedesignated on site',
 'esignated parking space in the carport. ',
 'es2garage/parkingspace per unit - 2, tan',
 'nstate: caparkingcovered parking: 0parki',
 'es1garage/parkingspaces - assigned, spac',
 'es1garage/parkingattached garage, off st',
 'es2garage/parkingcarport - 1viewbridges,',
 'es0garage/parkingattached garage, off st',
 'agegarage parking features: assigned spa',
 'entgarage/parkingside-by-sideproperty de',
 'es1garage/parkingspace per unit - 1, bel',
 ' assigned parking spot in garage, pool a',
 ' driveway parking or street parking - in',
 'es0garage/parkingno parking on site, on ',
 'esignated parking space with plenty of g',
 'hstate: caparkingcovered parking: 0parki',
 'ne windowsparking infoattached garage: n',
 'one gated parking space included. on-sit',
 '1 secured parking space lease terms : - ',
 'es0garage/parkingnoneproperty details fo',
 'panish/medparking typenonenum of parking',
 'esignated parking space with plenty of g',
 'es1garag

In [350]:
data_so_far['parking_spots'] = parking_spots
data_so_far['parking_snippet'] = parking_snippet

In [351]:
data_so_far.head()

Unnamed: 0,City,Zip,Neighborhood,Address_1,Address_2,Price,Beds,Baths,Half_Baths,Sq_Footage,...,Half_Baths.1,MLS #,Furnished,MLS Type,Year Built,County,laundry_type,laundry_snippet,parking_spots,parking_snippet
0,Oakland,94618,Parkwoods,240 Caldecott Lane,Unit 314,2500.0,1.0,1.0,0.0,773,...,0.0,511967,-,Apartment Lease / Apartment,1996,Alameda County,1,ppliances plus full size in-unit washer dryer....,parking,raditionalparking typedesignated on site
1,Albany,94706,Hillside,625 Jackson Street,Unit 2,2850.0,2.0,2.0,0.0,1112,...,0.0,40933893BR,-,Residential Lease,-,Alameda County,0,none,parking_unknown,esignated parking space in the carport.
2,Emeryville,94608,North Hollis,1254 65th Street,,4250.0,2.0,2.0,1.0,1296,...,1.0,40932990,-,LEASE RENTAL / Apartment/Condo for Rent,2003,Alameda County,1,"woodlaundry: in closet, in unitpool type: non...",parking_unknown,"es2garage/parkingspace per unit - 2, tan"
3,Berkeley,94702,Poet's Corner,1338 Addison Street,,2375.0,2.0,1.0,0.0,600,...,0.0,40933389BR,-,Residential Lease,-,Alameda County,0,none,parking_unknown,nstate: caparkingcovered parking: 0parki
4,Emeryville,94608,Watergate,4 Anchor Drive,Unit F241,2275.0,1.0,1.0,0.0,660,...,0.0,40933210,-,LEASE RENTAL / Apartment/Condo for Rent,1973,Alameda County,0,none,parking_unknown,"es1garage/parkingspaces - assigned, spac"


In [353]:
data_so_far.shape

(190, 23)

In [354]:
with open('alameda_county_data_200114.pickle', 'wb') as to_write:
    pickle.dump(data_so_far, to_write)