In [1]:
import os
import re
from dataclasses import dataclass
from datetime import date
from time import sleep

import altair as alt
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By

In [5]:
def get_page_range(element):
    '''
    Look for a single digit at the very start of an element's text.

    NOTE: a later cell defines another ``get_page_range`` that takes the
    text itself; once that cell runs it replaces this definition.

    args:   element  object exposing a ``text`` attribute (str)

    returns: the ``re.Match`` for the leading digit, or None when the
             text does not begin with a digit.
    '''
    leading_digit = re.compile(r'[0-9]')
    return leading_digit.match(element.text)

In [13]:
# functions
def parse_price(element, expected_length, is_range=True):
    '''
    Parse scraped rent strings into parallel lists of low and high prices.

    Dollar signs and thousands separators are stripped from each element's
    text. Prices are returned as strings (e.g. '1255'), not numbers.

    args:   element         (list) objects exposing a ``text`` attribute
                                   that holds the scraped rent string
            expected_length (int)  number of listings on the page; both
                                   result lists are padded with None up
                                   to this length
            is_range        (bool) True when the text looks like
                                   '$1,255 - $2,210' and must be split on
                                   ' - '; False when the text is a single
                                   price, which is then used as both the
                                   low and the high value

    returns: (lows, highs) two lists of str/None

    raises:  AssertionError if ``expected_length`` is not an int or
             ``element`` is not a list
    '''
    assert isinstance(expected_length, int), 'expected_length must be type(int)'
    assert isinstance(element, list), 'Rent must enter function as type(list)'

    # Compile both patterns once instead of once per listing.
    strip_symbols = re.compile(r'[\$\,]')
    range_separator = re.compile(r'\s\-\s')

    lows, highs = [], []
    for r in element:
        try:
            cleaned = strip_symbols.sub('', r.text)
            if is_range:
                bounds = range_separator.split(cleaned)
                # A string without ' - ' has no second bound; the resulting
                # IndexError is handled below and recorded as missing.
                low, high = bounds[0], bounds[1]
            else:
                # Previously low/high were never assigned on this path, so
                # every single-price listing silently became None.
                low = high = cleaned
        except Exception:
            # Best effort: an unparsable listing is recorded as missing
            # rather than aborting the whole page.
            low = high = None
        lows.append(low)
        highs.append(high)

    lows = fill_missing(lows, expected_length)
    highs = fill_missing(highs, expected_length)

    return lows, highs


def get_page_range(element_text):
    '''
    Pull every run of digits out of a pagination label.

    For a label such as 'Page 1 of 28' this yields [1, 28], i.e.
    [first_page, last_page]; a label without digits yields [].

    args:   element_text (str) webdriver element.text

    returns: list of int, in order of appearance

    raises:  AssertionError if ``element_text`` is not a str
    '''
    assert isinstance(element_text, str), '''Function can only parse type(str)'''

    page_numbers = []
    for digits in re.findall(r'[0-9]+', element_text):
        page_numbers.append(int(digits))
    return page_numbers

def get_amenities(amen_element, expected_length):
    '''
    Collect the amenity titles listed under each amenities element.

    args:   amen_element    (list) webdriver elements, one per listing,
                                   each containing <li> children whose
                                   ``title`` attribute names an amenity
            expected_length (int)  number of listings on the page; the
                                   result is padded with None up to this
                                   length

    returns: list with one entry per listing: a list of title strings,
             or None when the listing could not be read
    '''
    if amen_element == []:
        return [None] * expected_length

    # Resolved outside the try block so a missing import surfaces loudly
    # instead of being recorded as "no amenities".
    # find_elements(by, value) replaces find_elements_by_tag_name, which
    # was removed in Selenium 4.3; with the old call and a catch-all handler
    # every listing would silently come back as None on current Selenium.
    li_locator = (By.TAG_NAME, 'li')

    all_amens = []
    for a in amen_element:
        try:
            items = a.find_elements(*li_locator)
            all_amens.append([item.get_attribute('title') for item in items])
        except Exception:
            # Best effort: a stale or unreadable element is recorded as
            # missing rather than aborting the page.
            all_amens.append(None)

    return fill_missing(all_amens, expected_length)
        

def get_geoloc(loc_element, agent='googlev3', check_against=(), dist_tolerance=25.0):
    '''
    Geocode the street address carried by each location element.

    Each element's ``title`` attribute is read as the address and looked
    up with geopy's Nominatim geocoder.

    args:   >loc_element (list) webdriver elements whose ``title``
            attribute holds a street address
            >agent (str) value passed to Nominatim as ``user_agent``
            >check_against (tuple) reference (lat, long) coordinates.
            Only type-checked; no distance check is implemented yet.
            >dist_tolerance (float): intended maximum distance from
            ``check_against``. An int is converted to float. Only
            type-checked; it is not otherwise used.

    returns: (addresses, lats, longs) three parallel lists; lats/longs
             hold None for any address that could not be geocoded

    raises:  AssertionError when an argument has the wrong type
    '''

    # The last two messages previously named loc_element by mistake.
    assert isinstance(loc_element, list), 'argument loc_element must be type(list)'
    assert isinstance(agent, str), 'argument agent must be type(str)'
    assert isinstance(check_against, tuple), 'argument check_against must be type(tuple)'
    if isinstance(dist_tolerance, int):
        dist_tolerance = float(dist_tolerance)
    assert isinstance(dist_tolerance, float), 'argument dist_tolerance must be type(float)'

    geolocator = Nominatim(user_agent=agent)
    addresses = []
    lats, longs = [], []
    for loc in loc_element:
        address = loc.get_attribute('title')
        addresses.append(address)
        try:
            location = geolocator.geocode(address)
        except Exception:
            # Best effort: a failed lookup (timeout, service error, ...)
            # is recorded as missing rather than aborting the page.
            location = None

        if location is None:
            # geocode() found no match, or the lookup failed above.
            lats.append(None)
            longs.append(None)
        else:
            lats.append(location.latitude)
            longs.append(location.longitude)
    return addresses, lats, longs

def construct_frame(street_adds, lows, highs, amens, lats, longs):
    '''
    Assemble one page of scraped listings into a DataFrame.

    The frame has one row per street address with today's date, the low
    and high rent and the coordinates. Unless every amenity entry is
    None, the amenity columns built by ``amens_to_dict`` are outer-merged
    in on the address.

    When the inputs differ in length, each length is printed as a
    diagnostic before the frame is built.

    args:   street_adds, lows, highs, amens, lats, longs
            parallel lists, one entry per listing

    returns: pandas.DataFrame
    '''
    sizes = {
        'street_adds': len(street_adds),
        'lows': len(lows),
        'highs': len(highs),
        'amens': len(amens),
        'lats': len(lats),
        'longs': len(longs),
    }
    if len(set(sizes.values())) > 1:
        for label, size in sizes.items():
            print(f'len of {label} = {size}')

    n_rows = sizes['street_adds']
    listings = pd.DataFrame({
        'query_date': [str(date.today())] * n_rows,
        'street_address': street_adds,
        'rent_low': lows,
        'rent_high': highs,
        'latitude': lats,
        'longitude': longs,
    })

    # No amenity information at all: nothing to merge.
    if amens == [None] * n_rows:
        return pd.DataFrame(listings)

    amenity_flags = amens_to_dict(street_adds, amens)
    merged = listings.merge(
        amenity_flags,
        how='outer',
        left_on='street_address',
        right_on='address',
        right_index=False,
    )
    return merged.drop(columns='address')

def amens_to_dict(street_adds, amenities):
    '''
    Turn per-listing amenity lists into a wide frame of 'Y' flags.

    args:   street_adds (list) street address of each listing
            amenities   (list) for each listing, a list of amenity
                               names, or None when unknown

    returns: pandas.DataFrame with an 'address' column plus one column
             per amenity name; a cell holds 'Y' when that listing has the
             amenity and is missing otherwise. Listings whose amenities
             are None are left out. When nothing remains, an empty frame
             that still has the 'address' column is returned.
    '''
    records = []
    # Pair each listing with its own amenities. This previously read a
    # global named `addresses` instead of the street_adds argument.
    for addr, amns in zip(street_adds, amenities):
        if amns is None:
            continue
        # A fresh dict per listing: reusing one dict across listings made
        # every later row inherit the amenities of the rows before it.
        record = {'address': addr}
        for a in amns:
            if a:
                # Skip missing/blank titles; setdefault keeps 'address'
                # intact should an amenity happen to carry that name.
                record.setdefault(a, 'Y')
        records.append(record)

    if not records:
        return pd.DataFrame(columns=['address'])
    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(records)

def fill_missing(data_list, expected):
    '''
    Pad ``data_list`` with None until it holds ``expected`` items.

    The list is extended in place and also returned. A list that is
    already long enough is returned unchanged.
    '''
    data_list.extend(None for _ in range(expected - len(data_list)))
    return data_list

def combine_dicts(a_dict, b_dict):
    '''
    Copy every key/value pair of ``b_dict`` into ``a_dict``.

    ``a_dict`` is modified in place (existing keys are overwritten);
    nothing is returned.
    '''
    for key, value in b_dict.items():
        a_dict[key] = value

def pull_data(file_to_match='MadisonLiving.csv', directory='.'):
    '''
    Search a directory tree for an existing data file and load it.

    args:   file_to_match   (str)   Filename to look for.
            directory       (str)   Directory to search (recursively).

    returns: pandas.DataFrame read from the first matching CSV found.
             When no file matches, the user is asked whether to start
             from an empty frame; answering 'y' returns an empty
             DataFrame.

    raises:  FileNotFoundError when no file matches and the user answers
             'n'.
    '''
    for root, _dirs, files in os.walk(directory):
        if file_to_match in files:
            return pd.read_csv(os.path.abspath(os.path.join(root, file_to_match)))

    searched = os.path.abspath(directory)
    text = [f'No existing data file found in {searched}',
            'Initialize new dataframe? (y/n)']
    # Ask again on anything other than y/n; an unrecognised answer used
    # to fall through and return None, which broke the caller later on.
    while True:
        resp = input('\n'.join(text)).strip().lower()
        if resp == 'y':
            return pd.DataFrame()
        if resp == 'n':
            raise FileNotFoundError(
                f'{file_to_match!r} was not found under {searched} '
                'and a new dataframe was declined'
            )

In [14]:
# Scrape every results page for Madison, WI and append the listings to the
# CSV file named by `store_under`.
url = r'https://www.apartments.com/madison-wi/?bb=wxr38t32gKv6mm81F'

store_under = 'MadisonLiving.csv'     # filename to be used for data storage/retrieval

# Load previously stored data before opening the browser, so that declining
# the "initialize new dataframe" prompt cannot leave a browser window behind.
df = pull_data(file_to_match=store_under)

# Collect one frame per page and concatenate once at the end.
page_frames = [] if df.empty else [df]

driver = Chrome()
try:
    driver.get(url)
    sleep(5)    # give the page time to render

    # find_element(By.CLASS_NAME, ...) replaces find_element_by_class_name,
    # which was removed in Selenium 4.3.
    num_pages = driver.find_element(By.CLASS_NAME, 'pageRange')
    _, last_page = get_page_range(num_pages.text)

    for i in range(last_page):
        print(f'on page {i+1}')
        locs = driver.find_elements(By.CLASS_NAME, 'location')
        addresses, lats, longs = get_geoloc(locs)
        rents = driver.find_elements(By.CLASS_NAME, 'altRentDisplay')
        lows, highs = parse_price(rents, len(locs))
        amens = driver.find_elements(By.CLASS_NAME, 'amenities')
        amenities = get_amenities(amens, len(locs))

        page_frames.append(
            construct_frame(addresses, lows, highs, amenities, lats, longs)
        )

        if i == last_page - 1:      # no more pages can be found
            break
        # Look the button up only when another page is expected; it was
        # previously looked up before the last-page check.
        driver.find_element(By.CLASS_NAME, 'next ').click()
        sleep(5)
finally:
    # Always shut the browser down, even when scraping fails part-way.
    driver.quit()

if page_frames:
    df = pd.concat(page_frames, ignore_index=True)
display(df)
# index=False stops each save/load cycle from adding another
# 'Unnamed: 0' column to the stored data.
df.to_csv(store_under, index=False)



on page 1
on page 2
on page 3
on page 4
on page 5
on page 6
on page 7
on page 8
on page 9
on page 10
on page 11
on page 12
on page 13
on page 14
on page 15
on page 16
on page 17
on page 18
on page 19
on page 20
on page 21
on page 22
on page 23
on page 24
on page 25
on page 26
on page 27
on page 28


Unnamed: 0.1,Unnamed: 0,query_date,street_address,rent_low,rent_high,latitude,longitude,Dog Friendly,Cat Friendly,Air Conditioning,Washer/Dryer - In Unit,Dishwasher,Fitness Center,Wheelchair Access,Parking
0,0.0,2020-08-30,"1827 E Washington Ave, Madison, WI 53704",1255,2210,43.090496,-89.360433,Y,Y,Y,Y,Y,,,
1,1.0,2020-08-30,"622 W Wilson St, Madison, WI 53703",1200,2295,43.066273,-89.390348,Y,Y,Y,Y,Y,,,
2,2.0,2020-08-30,"309 W Johnson St, Madison, WI 53703",1550,4630,43.074263,-89.390136,Y,Y,Y,Y,Y,,,
3,3.0,2020-08-30,"1008 S Park St, Madison, WI 53715",1399,3599,43.056685,-89.398746,Y,Y,Y,Y,Y,,,
4,4.0,2020-08-30,"5123 Central Park Pl, Fitchburg, WI 53711",1035,1695,43.012518,-89.394039,Y,Y,Y,Y,Y,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,,2020-09-06,"Madison, WI 53719",,,43.074761,-89.383761,,,,,,,,
1396,,2020-09-06,"Madison, WI 53716",,,43.074761,-89.383761,,,,,,,,
1397,,2020-09-06,"4702 Dutch Mill Rd, Madison, WI 53716",,,43.049971,-89.301034,,,,,,,,
1398,,2020-09-06,"Madison, WI 53716",,,43.074761,-89.383761,,,,,,,,


In [26]:
# df = pd.read_csv('MadisonLiving.csv')
# df = df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
# df.to_csv('MadisonLiving.csv', index=False)



In [38]:
# Reload the listings from 'MadisonLiving.csv'; this rebinds `df` to the file's contents.
df = pd.read_csv('MadisonLiving.csv')

In [42]:
# Split 'street, city, ST 12345' (or just 'city, ST 12345') into separate
# street_address / city / state / area_code columns.
#
# A single anchored pattern replaces the comma split plus row-by-row fix-up:
#   * the street part is optional, so city-only addresses need no shifting
#     (the old one-word test misread multi-word cities as streets);
#   * whitespace around each part is not captured (city used to keep the
#     space that follows the comma);
#   * commas inside the street part (e.g. a unit number) no longer break the
#     fixed three-column assumption.
# Addresses that do not match the pattern yield missing values in all four
# columns.
ADDRESS_PATTERN = (
    r'^\s*(?:(?P<street_address>.+),\s*)?'
    r'(?P<city>[^,]+?)\s*,\s*'
    r'(?P<state>[A-Za-z]{2})\s+'
    r'(?P<area_code>\d{5}(?:-\d{4})?)\s*$'
)

address_parts = df['street_address'].str.extract(ADDRESS_PATTERN)
address_parts['street_address'] = address_parts['street_address'].str.strip()

# Same intermediate names as before, for the cells that display them.
df1 = address_parts[['street_address', 'city']]
df2 = address_parts[['state', 'area_code']]
small = df1.join(df2, how='outer')

# The four address columns first, then every other original column.
test_df = pd.concat([small, df.drop('street_address', axis=1)], axis=1)
test_df



# for i, addr in enumerate(df.street_address):
#     # if i == 5:
#     #     break
#     s = addr.split(',')
#     # print(s[-1].strip())
#     if len(s) == 2:
#         print(s)

Unnamed: 0,street_address,city,state,area_code,query_date,rent_low,rent_high,latitude,longitude,Dog Friendly,Cat Friendly,Air Conditioning,Washer/Dryer - In Unit,Dishwasher,Fitness Center,Wheelchair Access,Parking
0,1827 E Washington Ave,Madison,WI,53704,2020-08-30,1255.0,2210.0,43.090496,-89.360433,Y,Y,Y,Y,Y,,,
1,622 W Wilson St,Madison,WI,53703,2020-08-30,1200.0,2295.0,43.066273,-89.390348,Y,Y,Y,Y,Y,,,
2,309 W Johnson St,Madison,WI,53703,2020-08-30,1550.0,4630.0,43.074263,-89.390136,Y,Y,Y,Y,Y,,,
3,1008 S Park St,Madison,WI,53715,2020-08-30,1399.0,3599.0,43.056685,-89.398746,Y,Y,Y,Y,Y,,,
4,5123 Central Park Pl,Fitchburg,WI,53711,2020-08-30,1035.0,1695.0,43.012518,-89.394039,Y,Y,Y,Y,Y,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,,Madison,WI,53719,2020-09-06,,,43.074761,-89.383761,,,,,,,,
1396,,Madison,WI,53716,2020-09-06,,,43.074761,-89.383761,,,,,,,,
1397,4702 Dutch Mill Rd,Madison,WI,53716,2020-09-06,,,43.049971,-89.301034,,,,,,,,
1398,,Madison,WI,53716,2020-09-06,,,43.074761,-89.383761,,,,,,,,


In [55]:
# Show the rows from position 1398 onward with missing values displayed as 0.
# fillna returns a new frame, so neither `d` nor `test_df` is modified.
d = test_df.iloc[1398:]
d.fillna(value=0)

Unnamed: 0,street_address,city,state,area_code,query_date,rent_low,rent_high,latitude,longitude,Dog Friendly,Cat Friendly,Air Conditioning,Washer/Dryer - In Unit,Dishwasher,Fitness Center,Wheelchair Access,Parking
1398,0,Madison,WI,53716,2020-09-06,0.0,0.0,43.074761,-89.383761,0,0,0,0,0,0,0,0
1399,0,Madison,WI,53704,2020-09-06,0.0,0.0,43.074761,-89.383761,0,0,0,0,0,0,0,0


In [112]:
# Display df1, the frame built by splitting `street_address` in the address-parsing cell.
# NOTE(review): the execution count is out of sequence, so the stored output may not match the current code — re-run to confirm.
df1

Unnamed: 0,street_address,city,area_code
0,1827 E Washington Ave,Madison,WI 53704
1,622 W Wilson St,Madison,WI 53703
2,309 W Johnson St,Madison,WI 53703
3,1008 S Park St,Madison,WI 53715
4,5123 Central Park Pl,Fitchburg,WI 53711
...,...,...,...
695,,Madison,WI 53719
696,,Madison,WI 53716
697,4702 Dutch Mill Rd,Madison,WI 53716
698,,Madison,WI 53716


In [95]:
# Display df2, the state / area_code frame built in the address-parsing cell.
# NOTE(review): the execution count is out of sequence, so the stored output may not match the current code — re-run to confirm.
df2

Unnamed: 0,blank,state,area code
0,,WI,53704
1,,WI,53703
2,,WI,53703
3,,WI,53715
4,,WI,53711
...,...,...,...
695,,,
696,,,
697,,WI,53716
698,,,


In [56]:
# Altair demo: plot a short list of latitude/longitude points with a Mercator projection.
# NOTE(review): `data` is not imported or defined in any visible cell, so this line
# fails on a fresh kernel — presumably it is `vega_datasets.data`; confirm and import it.
source = alt.topo_feature(data.world_110m.url, 'countries')

# Country outlines. `base` is built here but never displayed by this cell;
# only `gps` is rendered below.
base = alt.Chart(source).mark_geoshape(
    fill='#666666',
    stroke='white'
).properties(
    width=300,
    height=180
)

# Hard-coded sample coordinates, one latitude per longitude below.
latitude = [55.864691552,
 55.866051551999995,
 55.867071552,
 55.867751552,
 55.869451552,
 55.871151552,
 55.873361552,
 55.875401552,
 55.876931551999995,
 55.877611552]
longitude = [-4.435331875,
 -4.435841875,
 -4.433801875,
 -4.429891875,
 -4.428361875,
 -4.427171875,
 -4.425301875,
 -4.421561875,
 -4.418841875,
 -4.418501875]

gps_data = pd.DataFrame({"latitude": latitude, "longitude": longitude})

# One small circle per coordinate pair, positioned through the geographic
# longitude/latitude encoding channels.
gps = alt.Chart(gps_data).mark_circle(size=3).encode(
    longitude='longitude:Q',
    latitude='latitude:Q'
).project(
    type='mercator'
)

# Last expression of the cell: renders the points chart.
gps

In [118]:
import googlemaps
from datetime import datetime
# Import the one name this notebook uses instead of `import *`, so it is
# clear where GOOGLE_API_KEY comes from and nothing else leaks into the
# notebook namespace.
from KEYS import GOOGLE_API_KEY

# Client for the Google Maps web services, authenticated with the key kept
# outside the notebook in KEYS.py.
gmaps = googlemaps.Client(key=GOOGLE_API_KEY)

# Geocoding an address
geocode_result = gmaps.geocode('1600 Amphitheatre Parkway, Mountain View, CA')

# Look up an address with reverse geocoding
reverse_geocode_result = gmaps.reverse_geocode((40.714224, -73.961452))

# Request directions via public transit
now = datetime.now()
directions_result = gmaps.directions("Sydney Town Hall",
                                     "Parramatta, NSW",
                                     mode="transit",
                                     departure_time=now)

In [32]:
import googlemaps
from KEYS import GOOGLE_API_KEY

# SECURITY: an API key used to be written directly in this cell. Keys must
# not live in the notebook; the key is read from KEYS.py like the other
# googlemaps cell does. The key that was committed here should be treated
# as leaked and revoked/rotated.
gmaps = googlemaps.Client(key=GOOGLE_API_KEY)

# Request a static map image centred on Dehradun. The client method is the
# same call as googlemaps.maps.static_map(gmaps, 100, ...) and matches how
# the next cell requests its map.
r = gmaps.static_map(size=100, center='Dehradun', zoom=10)

In [55]:
# Download a static map image and stream it to disk chunk by chunk.
# The `with` block closes the file even if the download fails part-way.
with open('Test2.png', 'wb') as f:
    # format now matches the .png file name; it previously requested a GIF
    # and stored it under a .png extension.
    for chunk in gmaps.static_map(size=(800, 800),
                                  center=(52.520103, 13.404871),
                                  zoom=15, format='png',
                                  markers=[],
                                  maptype='roadmap'):
        if chunk:
            f.write(chunk)