In [1]:
import requests
from bs4 import BeautifulSoup
from time import sleep
from tqdm import tqdm
import pandas as pd
import numpy as np
import re
import json
from gensim.utils import deaccent
import random
import os
import math

In [2]:
#geocoding
from bokeh.io import output_notebook, show, output_file
from bokeh.plotting import figure
from bokeh.models import GeoJSONDataSource, LinearColorMapper, ColorBar, NumeralTickFormatter
from bokeh.palettes import brewer

from bokeh.io.doc import curdoc
from bokeh.models import Slider, HoverTool, Select
from bokeh.layouts import widgetbox, row, column
import geopy
import geopandas as gpd
from geopy.extra.rate_limiter import RateLimiter

In [3]:
#visualisation
import folium
import folium.plugins as plugins
from folium.plugins import MarkerCluster
#from ipywidgets import interact

## Establishing functions

In [None]:
 def get_soups(links, name, timeout=30):
     '''
     Download every search page in `links` and store the HTML in a JSON file.

     Each page is requested after a random pause of 0.5-2 seconds, parsed with
     BeautifulSoup (lxml), deaccented and stored under an integer key
     (0, 1, 2, ...) that distinguishes the pages from each other.

     Note on the stored format: every value is the string representation of a
     bytes object (it starts with b' and non-ASCII characters and newlines
     appear as escape sequences). This is kept on purpose, because the parsing
     code later in this notebook strips such literal escape sequences.

     Parameters:
         links:   iterable of page URLs to download.
         name:    path of the JSON file to write.
         timeout: seconds to wait for each HTTP response before requests
                  raises an exception (prevents one stalled server from
                  blocking the whole run).
     '''
     pages = {}
     for index, link in enumerate(links):
         sleep(random.uniform(0.5, 2)) #be polite to the server
         response = requests.get(link, timeout=timeout)
         response.encoding = 'UTF-8'
         soup = BeautifulSoup(response.text, 'lxml')
         pages[index] = str(deaccent(soup).encode("utf-8"))
     with open(name, 'w') as write_file:
         json.dump(pages, write_file, indent = 4)

## Getting the data from websites

In [5]:
# Directory the notebook is run from. '__file__' is a quoted string literal here, not the
# __file__ variable, so realpath() resolves that name against the current working directory
# and dirname() strips it off again.
fileDir = os.path.dirname(os.path.realpath('__file__'))
# Folder "Data" in the current working directory stores the data files which are produced
# and read back throughout this notebook.
newfolder = r'Data'
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(newfolder, exist_ok=True)

In [None]:
# Running index used as the key of every apartment record; each downloader copies its current value when instantiated.
counter = 0


In [None]:
class DownloaderBezRealitky(): #error prone, need to correct
    '''
    Collects the search-page URLs of bezrealitky.cz (apartment rentals in Prague) and parses
    previously downloaded search pages into apartment records.
    '''
    # Search URL without the page number; the page number is appended to it.
    BASE_URL = 'https://www.bezrealitky.cz/vypis/nabidka-pronajem/byt/praha?_token=pr1lf-vKwDFfmFbICiz2PfC-Zdwq-2JolXi4MeMHsrw&page='

    def __init__(self):
        '''
        For the bezrealitky search, you need to iterate over search pages. The first search page is
        requested, self.page_bezrealitky stores the number of the last page read from the pagination
        links, and self.hrefs_bezrealitky lists the URL of every search page.
        self.counter starts at the current value of the module-level `counter` and is used as the
        key of the records created by get_data().
        '''
        self.link = self.BASE_URL + '1'
        self.request = requests.get(self.link)
        self.request.encoding='UTF-8'
        self.soup = BeautifulSoup(self.request.text,'lxml')
        self.page_bezrealitky = int(self.soup.findAll('a',{'class':'page-link pagination__page'})[-2].text)
        # range() excludes its stop value, so + 1 is required to include the last page as well
        self.hrefs_bezrealitky = [self.BASE_URL + str(i) for i in range(1, self.page_bezrealitky + 1)]
        self.soups = []
        self.counter = counter

    @staticmethod
    def _parse_alt(alt):
        '''
        Turn the alt text of a listing picture into the list [size text, m2, street, district].
        Returns None for pictures that are not the first picture of a rental listing.
        '''
        # Both markers are required; the info is present at all pictures, so only the first one is taken.
        if 'Pronajem' not in alt or 'obr. c. 1' not in alt:
            return None
        info = alt.split(',')[0:4] #info separated by comma, split into a list
        if info[-1].strip() == 'Praha': #if street non present, insert a NaN instead
            info.insert(2, 'NaN')
            del info[-1]
        if len(info) > 1:
            tokens = info[1].split(' ') #e.g. ' 50 m2' -> ['', '50', 'm2']
            if len(tokens) > 1:
                info[1] = tokens[1]
        return info

    @staticmethod
    def _split_price(price_text):
        '''
        Split a price text into (base price, utilities price), both as strings of digits.
        Prices are often written as '19000 Kc + 4000 Kc'; when no '+' is present the utilities
        price is '0'.
        '''
        if '+' in price_text:
            parts = price_text.split('+')
            base, utilities = parts[0], parts[1]
        else:
            base, utilities = price_text, '0'
        return re.sub("[^0-9]", "", base), re.sub("[^0-9]", "", utilities) #keep only numeric characters

    def get_data(self):
        '''
        Main method to obtain and transform the data. HTMLs are read from the JSON file written by
        get_soups. Each page is converted into a BeautifulSoup object and parsed for prices
        ('product__value' elements) and apartment info (alt text of the pictures). Prices and infos
        are paired by their position on the page. At the end, a nested dictionary (dicts) keyed by
        self.counter is stored as a json file outside of this script.

        Listings whose info is incomplete or whose price contains no digits are skipped.
        NOTE: both file paths use Windows path separators.
        '''
        with open(fileDir + '\\Data\\bezrealitky_links.json', 'r', encoding='utf-8') as f:
            content = json.load(f)
        dicts = {}
        for page_html in content.values():
            page = BeautifulSoup(page_html,'lxml') #parse every page only once
            prices = [tag.text.strip() for tag in page.findAll('strong', {'class':'product__value'})]
            descrips = []
            for img in page.findAll('img'):
                info = self._parse_alt(img.get('alt', '')) #pictures without alt text are ignored
                if info is not None:
                    descrips.append(info)
            # zip pairs the n-th info with the n-th price and ignores whatever is left over
            for info, price_text in zip(descrips, prices):
                if len(info) != 4: #alt text did not contain all four fields
                    continue
                base_price, utilities_price = self._split_price(price_text)
                try:
                    record = {}
                    record['Size'] = info[0][-4:].strip() #last characters of e.g. 'Pronajem bytu 2+kk'
                    record['m2'] = re.sub("[^0-9]", "", info[1]) #keep only size, i.e. numeric characters
                    record['Street'] = deaccent(info[2]) #deaccent to prevent potential errors
                    record['District'] = deaccent(info[3])
                    record['Base Price'] = int(base_price)
                    record['Utilities Price'] = int(utilities_price)
                    record['Total Price'] = int(base_price) + int(utilities_price)
                    record['Source'] = 'bezrealitky.cz'
                except ValueError: #price without any digit
                    continue
                dicts[self.counter] = record
                self.counter += 1
            print('Done loop number ' + str(self.counter) + '. Printing descrips.')
        with open(fileDir + '\\Data\\bezrealitky.json', 'w') as write_file: #store data into a json file
            json.dump(dicts, write_file, indent = 4)

        
            
        

In [None]:
# Instantiating requests the first bezrealitky.cz search page to find out how many pages exist.
a = DownloaderBezRealitky()

In [None]:
# Download the listed search pages and cache their HTML in bezrealitky_links.json inside the Data folder.
get_soups(a.hrefs_bezrealitky, fileDir + '\\Data\\bezrealitky_links.json')

In [None]:
# Parse the cached pages into apartment records and write them to bezrealitky.json.
a.get_data()

In [None]:
# Carry the running record index over, so that the next downloader continues numbering where this one stopped.
counter = a.counter

In [None]:
class DownloaderReality():
    '''
    Collects the search-page URLs of reality.idnes.cz (apartment rentals in Prague) and parses
    previously downloaded search pages into apartment records.
    '''
    def __init__(self):
        '''
        For the reality search, you need to iterate over search pages. The first search page is
        requested, self.page_reality is derived from the text of the last pagination link, and
        self.hrefs_reality lists the search-page URLs.
        self.counter starts at the current value of the module-level `counter` and is used as the
        key of the records created by get_data().
        '''
        self.link = 'https://reality.idnes.cz/s/pronajem/byty/praha/?page=1'
        self.request = requests.get(self.link)
        self.request.encoding='UTF-8'
        self.soup = BeautifulSoup(self.request.text,'lxml')
        self.page_reality = int(self.soup.findAll('a',{'class':'btn btn--border paging__item'})[-1].text) - 1
        # NOTE(review): range() excludes its stop value, so together with the "- 1" above the
        # highest page numbers are not requested - confirm that this is intended.
        self.hrefs_reality = ['https://reality.idnes.cz/s/pronajem/byty/praha/?page='
                                  + str(i) for i in range(1,self.page_reality)]
        self.soups = []
        self.counter = counter

    @staticmethod
    def _parse_location(text):
        '''
        Turn the text of a 'c-list-products__info' element into [street, district].
        The stored HTML contains newlines as the two literal characters backslash + n; leading
        ones are removed by the regular expression and the last two characters of the stripped
        text are cut off (presumably a trailing one - TODO confirm).
        '''
        cleaned = re.sub(r'^(?:\\n)+','', text).strip()[:-2]
        parts = [part.strip() for part in cleaned.strip().split(',')]
        if len(parts) == 1: #only a district given, mark the street as missing
            parts = ['NaN', parts[0]]
        if len(parts) == 3: #drop the third field
            del parts[2]
        return parts

    @staticmethod
    def _parse_title(text):
        '''
        Turn the text of a 'c-list-products__title' element into [size, m2].
        Returns None when the title does not have the expected '... bytu <size>, <m2> ...' form.
        '''
        try:
            item = text.split('bytu')[1].strip()[:-2]
            parts = item.split(',')
            # cut the last 10 characters - presumably the escaped ' m2' unit suffix (TODO confirm)
            parts[1] = parts[1][:-10].strip()
        except IndexError:
            return None
        return parts

    def get_data(self):
        '''
        Main method to obtain and transform the data. HTMLs are read from the JSON file written by
        get_soups. Each page is converted into a BeautifulSoup object and parsed for prices,
        locations and titles (size and m2), which are combined by their position on the page.
        At the end, a nested dictionary (dicts) keyed by self.counter is stored as a json file
        outside of this script.

        Listings that cannot be converted into a complete record are skipped.
        NOTE: both file paths use Windows path separators.
        '''
        with open(fileDir + '\\Data\\reality_idnes_links.json', 'r', encoding='utf-8') as f:
            content = json.load(f)
        dicts = {}
        for page_html in content.values():
            page = BeautifulSoup(page_html,'lxml') #parse every page only once

            values = [] #apartment prices, digits only
            for price_tag in page.findAll('p', {'class':'c-list-products__price'}):
                strong = price_tag.find('strong')
                # keep an empty placeholder when there is no <strong>, so the positions stay aligned
                values.append(re.sub("[^0-9]", "", strong.text) if strong is not None else '')

            descrips = [] #[street, district] of every listing
            for info_tag in page.findAll('p', {'class':'c-list-products__info'}):
                if 'Komercni sdeleni' in info_tag.text:
                    continue
                descrips.append(self._parse_location(str(info_tag.text)))

            info_size = [] #[size, m2] of every listing
            for title_tag in page.findAll('h2', {'class':'c-list-products__title'}):
                size_info = self._parse_title(title_tag.text)
                if size_info is not None:
                    info_size.append(size_info)

            # zip stops at the shortest list instead of raising when the three lists differ in length
            apartments = [size_info + location + [price]
                          for size_info, location, price in zip(info_size, descrips, values)]

            for item in apartments: #store apartment info, price into a dictionary and index by counter
                try:
                    record = {}
                    record['Size'] = item[0]
                    record['m2'] = item[1]
                    record['Street'] = deaccent(item[2]) #deaccent to prevent potential errors
                    record['District'] = deaccent(item[3])
                    record['Base Price'] = int(item[4])
                    record['Utilities Price'] = 0
                    record['Total Price'] = int(item[4])
                    record['Source'] = 'reality.idnes.cz'
                except (IndexError, ValueError): #incomplete listing or non-numeric price
                    continue
                dicts[self.counter] = record
                self.counter +=1
            print('Done loop number ' + str(self.counter) + '. Printing apartments.')
        with open(fileDir + '\\Data\\idnes_reality.json', 'w') as write_file: #store data into a json file
            json.dump(dicts, write_file, indent = 4)

In [None]:
# Instantiating requests the first reality.idnes.cz search page to find out how many pages exist.
b = DownloaderReality()

In [None]:
# Download the listed search pages and cache their HTML in reality_idnes_links.json inside the Data folder.
get_soups(b.hrefs_reality, fileDir + '\\Data\\reality_idnes_links.json')

In [None]:
# Parse the cached pages into apartment records and write them to idnes_reality.json.
b.get_data()

In [None]:
class DownloaderCeskeReality():
    '''
    Work-in-progress downloader for ceskereality.cz (apartment rentals in Prague).
    So far it stores a few search pages in a text file and prints the parsed listing blocks.
    '''
    def __init__(self):
        '''
        Requests one search page, derives self.page_ceskereality from the second-to-last entry of
        the 'pages' list on that page and builds the search-page URLs in self.hrefs_reality.
        '''
        self.link = 'https://www.ceskereality.cz/pronajem/byty/praha/?strana=2'

        self.request = requests.get(self.link)
        self.request.encoding='UTF-8'
        self.soup = BeautifulSoup(self.request.text, 'html.parser')

        pager = self.soup.findAll('ul',{'class':'pages'})[0]
        pager_texts = [entry.text for entry in pager]
        self.page_ceskereality = int(pager_texts[-2]) - 1
        url_prefix = 'https://www.ceskereality.cz/pronajem/byty/praha/?strana='
        self.hrefs_reality = [url_prefix + str(page_no) for page_no in range(1,self.page_ceskereality)]
        self.soups = []

    def get_soups(self):
        '''
        Downloads the first three search pages, converts them into BeautifulSoup objects and stores
        them as strings in the text file 'ceske_reality_links.txt'. Every page is followed by the
        marker BREAKHERE so that the pages can be told apart when the file is read back.
        '''
        for page_url in self.hrefs_reality[0:3]:
            sleep(random.uniform(0.5, 2))
            self.link = page_url
            self.request = requests.get(self.link)
            self.request.encoding='utf-8'
            self.soups.append(BeautifulSoup(self.request.text,'html.parser'))
            print('Page saved.')
            print(self.soups)
        with open('ceske_reality_links.txt', 'w') as f:
            f.writelines(str(deaccent(page).encode("utf-8")) + 'BREAKHERE' for page in self.soups)

    def get_data(self):
        '''
        Reads the text file written by get_soups, splits it at the BREAKHERE markers and parses the
        first page only: the prices ('cena' elements) are extracted and the text of every
        'div_nemovitost suda' element is printed.
        TODO: building apartment records and storing them as JSON is not implemented yet.
        '''
        with open('ceske_reality_links.txt', 'r') as f:
            stored_pages = f.read().split('BREAKHERE')
        for page_html in stored_pages[0:1]:
            page = BeautifulSoup(page_html,'lxml')
            # prices reduced to digits (text before the first comma); not used any further yet
            values = [re.sub("[^0-9]", "", price_tag.text.split(',')[0])
                      for price_tag in page.findAll('div', {'class':'cena'})]
            for listing in page.findAll('div', {'class':'div_nemovitost suda'}):
                print(listing.text)

In [None]:
# Instantiate the ceskereality.cz downloader and store its first search pages in ceske_reality_links.txt.
c = DownloaderCeskeReality()
c.get_soups()

In [None]:
# Print the listing blocks found on the first stored page (parsing for this site is still work in progress).
c.get_data()

In [None]:
# Scratch call: stores the first two ceskereality.cz search pages in 'blah.json' in the working directory.
get_soups(c.hrefs_reality[0:2], 'blah.json')

## Combining the fetched data into one file

In [None]:
big_dict = [] # content of every JSON file loaded so far, in load order
data = {} # all apartment records merged into a single dictionary

def data_combine(*args):
    '''
    Load the given JSON files from the Data folder and merge their records into the
    module-level dictionary `data`.

    Every loaded file is appended to the module-level list `big_dict`; afterwards `data` is
    updated with each entry of `big_dict` in order, so for equal keys the later file wins.

    Parameters:
        *args: file names inside the Data folder, e.g. 'idnes_reality.json', 'bezrealitky.json'
    '''
    for file_name in args:
        json_path = fileDir + '\\Data\\' + file_name
        with open(json_path) as json_file:
            big_dict.append(json.load(json_file))
    for loaded in big_dict:
        data.update(loaded)

data_combine('bezrealitky.json', 'idnes_reality.json')

In [None]:
def clean_dataframe(data_file):
    '''
    Convert the dictionary of apartment records into a cleaned pandas dataframe.

    Steps:
    1. One row per record (the record keys become the columns).
    2. Every occurrence of the text 'NaN' is replaced by an empty string.
    3. Leading and trailing white space is removed from 'Size', 'Street' and 'District'
       (important for the duplicate search).
    4. Rows with identical 'Size', 'm2', 'Street' and 'Total Price' are treated as the same flat;
       only the first one is kept, the number of duplicates is printed and the index is renumbered.
    5. A new column 'Address' ('<Street>,<District>,Praha') is created, which is needed for geocoding.
    '''
    duplicate_keys = ['Size', 'm2', 'Street', 'Total Price']
    frame = pd.DataFrame(data_file).T.replace('NaN', '', regex=True)
    for text_column in ('Size', 'Street', 'District'):
        frame[text_column] = frame[text_column].str.strip()
    duplicate_count = frame.duplicated(duplicate_keys).sum()
    print('Number of (removed) duplicates: ' + str(duplicate_count))
    frame = frame.drop_duplicates(duplicate_keys, ignore_index=True)
    frame['Address'] = frame['Street'] + ',' + frame['District'] + ',' + 'Praha'
    return frame

# Build the cleaned dataframe from the merged records and display it as the cell output.
dataframe = clean_dataframe(data)
dataframe

## Geocoding

In [None]:
# Nominatim geocoder; user_agent is the application name sent along with every request.
locator = geopy.Nominatim(user_agent='myGeocoder')
# Alternative provider - read the API key from the environment, never commit a key to the notebook:
#locator = geopy.GoogleV3(api_key=os.environ['GOOGLE_API_KEY'])

In [None]:
# Getting the GPS coordinates of the addresses.
# RateLimiter wraps the geocoder so that consecutive calls are at least one second apart.
geocode = RateLimiter(locator.geocode, min_delay_seconds=1) #this process takes about 2,5 hours

'''We use Nominatim, an open source geocoding provider to retrive the locations (latitude, longitude, altitude) for our apartments. 
For this we provide Nominatim with the Addresses of the aparments.'''

# One geocoding request per row; the result is stored in the new column 'location'.
dataframe['location'] = dataframe['Address'].apply(geocode)

# Falsy results (e.g. address not found) become None instead of a coordinate tuple.
dataframe['point'] = dataframe['location'].apply(lambda loc: tuple(loc.point) if loc else None)

# Split the coordinate tuples into three separate columns, aligned on the dataframe index.
dataframe[['latitude', 'longitude', 'altitude']] = pd.DataFrame(dataframe['point'].tolist(), index=dataframe.index)

# Drop every row that contains a missing value (this includes rows without coordinates)
# and cache the result, so that the geocoding does not have to be repeated.
dataframe = dataframe.dropna()
dataframe.to_pickle(fileDir + '\\Data\\' +'geo_df.pkl', protocol = 4)

In [6]:
# Entry point when the geocoded data already exists: load the cached dataframe instead of geocoding again.
# Only load pickle files you created yourself - unpickling untrusted files can execute arbitrary code.
dataframe = pd.read_pickle(fileDir + '\\Data\\' +'geo_df.pkl') #load from here


## Data Visualisation - Apartment locations

In [None]:
def FoliumMap(df_):
    '''
    Create a folium map with one clustered marker per flat.

    The map is initialized at the mean latitude & longitude of the dataset. For each observation
    (row) the latitude & longitude are read to create a 'home' icon, and a pop up text with basic
    information (size, m2, base price, utilities, total price) is attached to it. The markers are
    collected in a MarkerCluster to achieve better visualisation.

    Parameters:
        df_: dataframe with the columns 'latitude', 'longitude', 'Size', 'm2', 'Base Price',
             'Utilities Price' and 'Total Price'.

    Returns:
        the folium map including the marker cluster.
    '''
    new_map = folium.Map(location=[df_['latitude'].mean(), df_['longitude'].mean()],
                         zoom_start=12,
                         tiles='cartodbpositron')
    popup_template = '''Size: {Size} <br>
                  m2: {m2} <br>
                  Base Price: {bp} <br>
                  Utilities: {up} <br> 
                  Total Price: {TotalPrice}'''
    # Select the columns by name, so that the popup does not depend on the column order of df_.
    popup_columns = ['latitude', 'longitude', 'Size', 'm2', 'Base Price', 'Utilities Price', 'Total Price']
    mc = MarkerCluster() # create empty cluster object
    for lat, lon, size, m2, base, utilities, total in df_[popup_columns].itertuples(index=False, name=None):
        html = popup_template.format(Size=size, m2=m2, bp=base, up=utilities, TotalPrice=total)
        popup = folium.Popup(folium.IFrame(html, width=200, height=100))
        marker = folium.Marker(location=[lat, lon], popup=popup,
                               icon=folium.Icon(icon='home')) #define icon symbol
        mc.add_child(marker) #add marker to cluster
    new_map.add_child(mc)
    return new_map

# Render the apartment map as the cell output.
FoliumMap(dataframe)

## Data Visualisations - Neighborhoods

In [292]:
class NeighborhoodsVisuals(): 
    def __init__(self, geo_filename='Praha.json', df = dataframe):
        '''
        NeighborhoodsVisuals takes a dataframe containing information about apartments and their geographic location and data (as a JSON file) 
        about the geographic location of the Neighborhoods of Prague as input and its methods combine this data, compute average values 
        for the neighborhoods and store them as a JSON file. Furthermore plots an interactive map of Prague if make_plot is called. 

        Parameters:
            geo_filename: name of the GeoJSON file with the neighborhoods inside the Data folder.
            df: apartment dataframe with 'longitude' and 'latitude' columns. The default is the
                module-level `dataframe` as it exists when the class is defined.
        '''
        with open(fileDir + '\\Data\\' + geo_filename, encoding="utf8") as data: 
            hoods = json.loads(data.read()) 
        self.gdf = gpd.GeoDataFrame.from_features(hoods["features"])
        self.gdf_indv = gpd.GeoDataFrame(df, geometry = gpd.points_from_xy(df.longitude, df.latitude))

        # Formatting of the plottable fields: (field, min_range, max_range, format, verbage)
        format_data = [('Price', 10000, 25000,'0,0', 'Price'),
                      ('Median_price', 10000, 25000,'0,0', 'Median Price'),
                      ('Avg_m2', 180, 350,'0,0', 'Price per Square Metre')] #more options to be added later

        # Kept on the instance, because make_plot needs them and must not depend on global variables
        self.format_df = pd.DataFrame(format_data, columns = ['field' , 'min_range', 'max_range' , 'format', 'verbage'])

        # Hover tool, which is used in the interactive neighborhood map
        self.hover = HoverTool(tooltips = [ ('Neighborhood','@NAZEV_MC'),
                                           ('Average Price', '@Price'),
                                           ('Median Price', '@Median_price'),
                                           ('Average m2', '@m2'),
                                           ('Price per Square Metre', '@Avg_m2')])

        # store_merged() computes the neighborhood statistics itself, so avg_prices() is not called separately here
        self.store_merged()
        
    def avg_prices(self):
        '''
        Joins individual apartment geographic-dataframe with geographic-dataframe of neighborhoods and then calculates summary statistics for neighborhoods.

        Returns:
            dataframe indexed by 'NAZEV_MC' with the columns 'Price' (mean base price), 'm2' (mean size),
            'Median_price' (median base price) and 'Avg_m2' (mean base price divided by mean size), all rounded.
        '''
        # NOTE(review): newer geopandas releases name this argument `predicate` instead of `op` - confirm against the installed version.
        df_ = gpd.sjoin(self.gdf_indv, self.gdf, op='within') 
        df_ = df_.loc(axis=1)['Size', 'm2', 'Street', 'District', 'Base Price', 'Address', 'location', 'latitude',
            'longitude', 'geometry', 'index_right', 'OBJECTID', 'PLOCHA', 'ID', 'NAZEV_MC',
            'KOD_MO', 'TID_TMMESTSKECASTI_P', 'NAZEV_1', 'Shape_Length', 'Shape_Area']
        df_["m2"] = df_['m2'].apply(pd.to_numeric)
        avg_price = df_.loc(axis=1)['NAZEV_MC','Base Price','m2'].groupby(['NAZEV_MC']).mean().round()
        avg_price['Median Price'] = df_.loc(axis=1)['NAZEV_MC','Base Price'].groupby(['NAZEV_MC']).median().round()
        avg_price['Crown per m2'] = (avg_price['Base Price'] / avg_price['m2']).round()
        avg_price.columns = ['Price', 'm2', 'Median_price', 'Avg_m2']
        return avg_price
    
    def store_merged(self):
        '''
        Merges the neighborhood shapes with the neighborhood statistics of avg_prices() and stores the result
        in merged_visuals_data.json inside the Data folder. Neighborhoods without apartments get 0 for 'Price' and 'm2'.
        '''
        # Merge the GeoDataframe object with the neighborhood summary data (neighborhood)
        merged = pd.merge(self.gdf, self.avg_prices(), on='NAZEV_MC', how='left')
        merged = merged.fillna(value={'Price': 0, 'm2': 0})
        # Convert to json preferred string-like object 
        json_data = json.dumps(json.loads(merged.to_json()))
        # json_data is already a JSON string, so the file holds one JSON-encoded string;
        # make_plot gets that string back from json.load and passes it on as GeoJSON text.
        with open(fileDir + '\\Data\\merged_visuals_data.json', 'w') as write_file: #store data into a json file
            json.dump(json_data, write_file, indent = 4)
 
    def make_plot(self, field_name='Price', color='Reds'):
        '''
        If called, plots map of Prague neighborhoods ranking in color the expensivness: options for field_name are {'Price': Mean prices of apartments, 
        'Median_price': Median prices of apartments, 'Avg_m2': Mean price per m2}. The color can be changed with the color argument (e.g. 'Reds', 'Greens', 'Blues').
        The plot is shown in the notebook; nothing is returned.
        '''
        with open(fileDir + '\\Data\\merged_visuals_data.json', 'r', encoding='utf-8') as f:
            json_data = json.load(f)
      
        geosource = GeoJSONDataSource(geojson = json_data)

        # Formatting row of the requested field (IndexError for an unknown field_name)
        settings = self.format_df.loc[self.format_df['field'] == field_name].iloc[0]

        # Sequential color palette with 8 shades, reversed so that the darkest shade marks the highest value.
        palette = brewer[color][8][::-1]

        # Instantiate LinearColorMapper that linearly maps numbers in a range, into a sequence of colors.
        color_mapper = LinearColorMapper(palette = palette, low = settings['min_range'], high = settings['max_range'])

        # Create color bar.
        format_tick = NumeralTickFormatter(format=settings['format'])
        color_bar = ColorBar(color_mapper=color_mapper, label_standoff=18, formatter=format_tick,
                             border_line_color=None, location = (0, 0))

        # Create figure object.
        p = figure(title = 'Apartment Rental ' + settings['verbage'] + ' by City Parts in Prague', 
                 plot_height = 650, plot_width = 850,
                 toolbar_location = None)
        p.xgrid.grid_line_color = None
        p.ygrid.grid_line_color = None
        p.axis.visible = False

        # Add patch renderer to figure. 
        p.patches('xs','ys', source = geosource, fill_color = {'field' : field_name, 'transform' : color_mapper},
              line_color = 'black', line_width = 0.25, fill_alpha = 1)

        # Specify color bar layout.
        p.add_layout(color_bar, 'right')

        # Add the hover tool to the graph
        p.add_tools(self.hover)
      
        output_notebook()
        show(p)

        return 

In [293]:
# Keep a reference to the instance: the following cell calls make_plot on `e`.
e = NeighborhoodsVisuals()

<__main__.NeighborhoodsVisuals at 0x2dc70386970>

In [297]:
# Draw the neighborhood map colored by median price; `e` has to be bound to a NeighborhoodsVisuals instance.
e.make_plot('Median_price', 'Reds')

In [270]:
# Formatting of the data in the plots, one tuple per plottable field:
# (field, lower end of the color range, upper end of the color range, tick format, wording used in titles)
format_data = [
    ('Price', 10000, 25000,'0,0', 'Price'),
    ('Median_price', 10000, 25000,'0,0', 'Median Price'),
    ('Avg_m2', 180, 350,'0,0', 'Price per Square Metre'), #more options to be added later
]

# The same information as a DataFrame, so that rows can be looked up by field or by wording
format_df = pd.DataFrame.from_records(
    format_data,
    columns = ['field' , 'min_range', 'max_range' , 'format', 'verbage'],
)

# Hover tool of the interactive neighborhood map: (label, data source column) pairs
hover = HoverTool(
    tooltips = [
        ('Neighborhood','@NAZEV_MC'),
        ('Average Price', '@Price'),
        ('Median Price', '@Median_price'),
        ('Average m2', '@m2'),
        ('Price per Square Metre', '@Avg_m2'),
    ]
)

In [275]:
# Create a plotting function
def make_plot(field_name, color='Reds'):
    '''
    Plot the map of Prague neighborhoods colored by the values of `field_name`.

    The merged neighborhood data is read from merged_visuals_data.json inside the Data folder;
    the color range, tick format and title wording come from the module-level `format_df`,
    and the module-level `hover` tool is attached to the figure.

    Parameters:
        field_name: one of the values in format_df['field'] ('Price', 'Median_price', 'Avg_m2').
        color: name of a brewer palette, e.g. 'Reds', 'Greens', 'Blues'.

    Returns:
        the bokeh figure, so that callers can place it in a layout or show it again.
    '''
    with open(fileDir + '\\Data\\merged_visuals_data.json', 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    geosource = GeoJSONDataSource(geojson = json_data)

    # Formatting row of the requested field (IndexError for an unknown field_name)
    settings = format_df.loc[format_df['field'] == field_name].iloc[0]

    # Sequential color palette with 8 shades, reversed so that the darkest shade marks the highest value.
    palette = brewer[color][8][::-1]

    # Instantiate LinearColorMapper that linearly maps numbers in a range, into a sequence of colors.
    color_mapper = LinearColorMapper(palette = palette, low = settings['min_range'], high = settings['max_range'])

    # Create color bar.
    format_tick = NumeralTickFormatter(format=settings['format'])
    color_bar = ColorBar(color_mapper=color_mapper, label_standoff=18, formatter=format_tick,
                         border_line_color=None, location = (0, 0))

    # Create figure object.
    p = figure(title = 'Apartment Rental ' + settings['verbage'] + ' by City Parts in Prague', 
               plot_height = 650, plot_width = 850,
               toolbar_location = None)
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None
    p.axis.visible = False

    # Add patch renderer to figure. 
    p.patches('xs','ys', source = geosource, fill_color = {'field' : field_name, 'transform' : color_mapper},
              line_color = 'black', line_width = 0.25, fill_alpha = 1)

    # Specify color bar layout.
    p.add_layout(color_bar, 'right')

    # Add the hover tool to the graph
    p.add_tools(hover)

    output_notebook()
    show(p)

    # Return the figure: callers assign the result and put it into a layout.
    return p

In [276]:
# Draw the neighborhood map colored by the average price.
make_plot('Price')

In [272]:
# Callback of the select box: redraw the plot for the newly selected criteria
def update_plot(attr, old, new):
    '''
    Bokeh on_change callback (attr, old and new are supplied by bokeh and not used here).
    Reads the selected criteria from the module-level `select` widget, redraws the plot for the
    matching field, replaces the content of the current document and finally assigns the
    module-level `json_data` to the module-level `geosource`.
    '''
    refreshed_geojson = json_data

    # Translate the wording shown in the select box back into the name of the data field
    criteria = select.value
    matching_fields = format_df.loc[format_df['verbage'] == criteria, 'field']
    field_to_plot = matching_fields.iloc[0]

    # Redraw the plot for that field
    p = make_plot(field_to_plot)

    # Replace the old document content with the new plot and the select box
    new_layout = column(p, widgetbox(select))
    curdoc().clear()
    curdoc().add_root(new_layout)

    # Update the data
    geosource.geojson = refreshed_geojson


In [273]:
# Load the merged neighborhood data that was stored in the Data folder.
with open(fileDir + '\\Data\\merged_visuals_data.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)
# Module-level data source; update_plot assigns to its geojson attribute.
geosource = GeoJSONDataSource(geojson = json_data)
input_field = 'Price'

# Call the plotting function
# NOTE(review): the layout and show() calls below need make_plot to return the figure - verify.
p = make_plot(input_field)

# Make a selection object: select (the options are the 'verbage' wordings of format_df)
select = Select(title='Select Criteria:', value='Price', options=['Price', 'Median Price',
                                                                               'Price per Square Metre'])
# Redraw the plot whenever another criteria is selected
select.on_change('value', update_plot)

# Make a column layout of the plot and widgetbox(select), and add it to the current document
layout = column(p, widgetbox(select))

curdoc().add_root(layout)

# Display the plot inside the notebook
output_notebook()
show(p)

#bokeh serve --show Downloader.ipynb -after streamlining the code for full functionality



## What needs to be done
1. Rerun geocoding on the de-duplicated dataframe (so basically rerun the whole script)
2. Fix the relative paths for files - working on a Mac broke the file storage locations. This will be more complicated - we need to slice the code into multiple .py files and then import the functions/files from outside, either as files or as libraries. Generally, we should have one .py file for the Downloader, one for Geocoding, and one for the Visualizer. All data (downloaded or created) should be stored within the project repository in a folder called Data. This worked on my PC, but the relative path storage does not work on my Mac.
3. Fix str(item) to r''
4. Streamline the code - what can be written as a function, should be a function
5. Maybe improve Class syntax - not necessary
6. Figure out a way how to store and load data consecutively - so we can introduce a slider into the graph where a person could see average prices across times of his choosing (not a priority)
    - for this, maybe look into SQL databases lecture
    - main idea - download data everyday, save them then based on the date of download. Right now the code is only a snapshot of any given time
9. Other data included in the interactive graph? Currently only price, median price, price/m2
10. Streamline/write better code anywhere where legit
11. Put the Visualizer.py on a web. It's quite easy to put it on a local server, simply by running bokeh serve --show Visualizer.ipynb. This is also a reason for slicing the code. 
12. Why does one get_data function print "Printing descrips" and the other one "Printing apartments"?
13. delete the comments in the visualisation part
14. How exactly was the Praha.json data obtained - could it be downloaded from @PrahaOpenData?


Data output:
 - source_links.txt - file with htmls from real estate webs
 - source.json - parsed apartment data from htmls
 - geo_df.pkl - geocoded addresses
