### <font color='green'> **1. Analyze the Primary Data**</font>


##### <br> The purpose of this analysis is to explore the data and identify which variables can be turned into features for price prediction

In [1]:
import pandas as pd
import numpy as np
import altair as alt
import requests
import math

import geopandas as gpd

import json

In [2]:
# Select Altair's 'default' renderer so charts render correctly in the notebook.
alt.renderers.enable('default')

# Switch to the 'json' data transformer (original note: uses intermediate json
# files to speed things up). This call is the last expression of the cell, so
# its return value is what the cell shows as output.
alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

In [3]:
### Load the cleaned listings table (created by Michael) from the project assets
cleaned_data = pd.read_csv(
    filepath_or_buffer='./assets/cleaned_assets/df_cleaned.csv'
)

In [4]:
# Preview the first five rows of the cleaned data
display(cleaned_data.iloc[:5])

Unnamed: 0.1,Unnamed: 0,Borough,total_crimes,Population per square kilometre,Census population,Property price,id,neighbourhood_cleansed,latitude,longitude,...,minimum_nights,maximum_nights,number_of_reviews,price,amenities_count,closest_station_dist,within_1k_station,closest_POI_dist,within_5k_POI,dist_from_center
0,0,Barking and Dagenham,38988.0,6214.914626,185900,335683.0,198258,Barking and Dagenham,51.5343,0.08178,...,2,180,41,69.0,32,0.591797,1,7.895814,0,14.796863
1,1,Barking and Dagenham,38988.0,6214.914626,185900,335683.0,306405,Barking and Dagenham,51.54072,0.15246,...,4,365,3,41.0,6,0.399661,2,12.688768,0,19.730923
2,2,Barking and Dagenham,38988.0,6214.914626,185900,335683.0,2398015,Barking and Dagenham,51.5261,0.11898,...,3,1125,39,42.0,47,1.673583,0,9.872346,0,17.19108
3,3,Barking and Dagenham,38988.0,6214.914626,185900,335683.0,3322644,Barking and Dagenham,51.55499,0.15927,...,1,1125,2,48.0,8,1.264322,0,13.726785,0,20.542917
4,4,Barking and Dagenham,38988.0,6214.914626,185900,335683.0,3588725,Barking and Dagenham,51.54485,0.16265,...,2,1125,259,55.0,6,0.206356,1,13.52889,0,20.511536


In [5]:
# (number of rows, number of columns) of the cleaned table
cleaned_data.shape

(71912, 28)

In [6]:
# Per-column non-null counts and dtypes, to spot missing values and mistyped columns
cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71912 entries, 0 to 71911
Data columns (total 28 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Unnamed: 0                       71912 non-null  int64  
 1   Borough                          71912 non-null  object 
 2   total_crimes                     71912 non-null  float64
 3   Population per square kilometre  71912 non-null  float64
 4   Census population                71912 non-null  int64  
 5   Property price                   71912 non-null  float64
 6   id                               71912 non-null  int64  
 7   neighbourhood_cleansed           71912 non-null  object 
 8   latitude                         71912 non-null  float64
 9   longitude                        71912 non-null  float64
 10  host_since                       71912 non-null  object 
 11  host_is_superhost                71912 non-null  object 
 12  host_has_profile_p

#### <font color='green'> **1.1 Distribution of property price**</font>

In [7]:
# Low and high percentiles of the listing price (1 = maximum) to gauge the tails
price_probs = [0.01, 0.02, 0.95, 0.98, 0.99, 1]
cleaned_data['price'].quantile(price_probs)

0.01       20.0
0.02       24.0
0.95      489.0
0.98     1000.0
0.99     1570.0
1.00    53588.0
Name: price, dtype: float64

In [8]:
#### Distribution of the price. #1. Bar chart of listing counts per price + median rule
price_axis = alt.X('price:Q', title='Property Price', scale=alt.Scale(domain=[-5, 20000]))
count_axis = alt.Y('count(id)', title='Number of properties')

hist1 = (
    alt.Chart(cleaned_data)
    .mark_bar(color='#ff5a5f')
    .encode(x=price_axis, y=count_axis, tooltip='count(price)')
    .properties(width=300, height=300)
)

# Vertical rule drawn at the median price
median1 = (
    alt.Chart(cleaned_data)
    .mark_rule(color="#00a699", size=2)
    .encode(x="median(price)", tooltip='median(price)')
)

# Layer the rule on top of the bars; as the last expression it is the cell output
alt.layer(hist1, median1)

In [9]:
#### Distribution of the price. #1. Histogram + Median
# `hist1` and `median1` are built by the preceding cell (In [8]) with exactly the
# specification that used to be repeated here, so they are reused instead of
# being declared a second time.
chart1 = hist1 + median1

#### Cumulative distribution of the price, plotted on a log-scaled price axis

df = cleaned_data['price'].to_frame()


def _percentile_label(x_pos, label):
    """Return a rotated text mark placed at (x_pos, 0.5) that names a percentile rule."""
    return alt.Chart({'values': [{'x': x_pos, 'y': 0.5}]}).mark_text(
        text=label, angle=270, align='center', color='#484848'
    ).encode(
        x='x:Q', y='y:Q'
    )


# Cumulative density estimate of the price
pop_density_chart = (
    alt.Chart(df)
    .transform_density(density='price', cumulative=True, as_=['price', 'density'])
    .mark_area(color="#ff5a5f", opacity=0.7)
    .encode(
        x=alt.X('price', axis=alt.Axis(title="Property price (Log Scale)"), scale=alt.Scale(type='log')),
        y=alt.Y('density:Q', axis=alt.Axis(title="Number of properties(cumulative %)", format='%')),
    )
)

# Rule at the median price (despite the "mean" in the variable name)
pop_mean_chart = alt.Chart(df).mark_rule(color="#00a699", size=1).encode(x="median(price)")

# Rules at the 5th and 95th percentiles of the price
pop_95p = (
    alt.Chart(df)
    .transform_quantile(quantile='price', probs=[0.05, 0.95])
    .mark_rule(color='#767676', size=1)
    .encode(x=alt.X('value:Q'), tooltip='value:Q')
)

# Hand-placed labels for the three rules
text1 = _percentile_label(250, '95th percentile')
text2 = _percentile_label(65, '50th percentile')
text3 = _percentile_label(20, '5th percentile')

chart2 = pop_density_chart + pop_mean_chart + pop_95p + text1 + text2 + text3


##### Plot the final charts: histogram (left) next to the cumulative distribution (right)

alt.hconcat(chart1, chart2).configure_axis(grid=False).properties(
    title='Distribution of Airbnb prices').configure_concat(spacing=1)

#### <font color='green'> **1.2 Study correlation of price with different variables**</font>

#### <font color='green'> 1.2.1 Variables = Accommodation</font>

In [10]:
#### Cumulative distribution of `accommodates`, used to choose cap and floor values
varlist = ['accommodates', 'beds', 'bedrooms']

var1 = 'accommodates'
df = cleaned_data[var1].to_frame()

cumulative_axis = alt.Y(
    'density:Q',
    axis=alt.Axis(title="Number of properties(cumulative %)", format='%'),
)
pop_density_chart1 = (
    alt.Chart(df)
    .transform_density(density=var1, cumulative=True, as_=[var1, 'density'])
    .mark_area(color="#ff5a5f", opacity=0.7)
    .encode(x=alt.X(var1), y=cumulative_axis)
    .properties(width=250, height=250)
)

# One row per marker line: quantile level (`index`), its value, and a display label
percentile_labels = ['5th percentile', '50th percentile', '95th percentile']
df1 = cleaned_data[var1].quantile([0.05, 0.5, 0.95]).to_frame().reset_index()
df2 = pd.DataFrame({'text1': percentile_labels})
df3 = pd.concat([df1, df2], axis=1)

rule_colours = alt.Scale(domain=percentile_labels, range=['#767676', '#00a699', '#767676'])
pop_density_chart2 = alt.Chart(df3).mark_rule().encode(
    x=var1,
    color=alt.Color('text1', scale=rule_colours),
    tooltip=var1,
)

pop_density_chart_final = (pop_density_chart1 + pop_density_chart2).properties(
    title='# of people accommodates distribution'
)

pop_density_chart_final

In [11]:
#### Cap extreme values: the 95th percentile of `accommodates` is 7
cleaned_data['accommodates_new'] = cleaned_data['accommodates'].clip(upper=7)


#### Median price for each (capped) number of guests
# Built once: the chart only reads `accommodates_new`, so looping over `varlist`
# would just rebuild the same chart for every entry.
hist_chart1 = alt.Chart(cleaned_data).mark_bar(color='#00A699', opacity=0.9).encode(
    x=alt.X('accommodates_new:O'),
    y=alt.Y('median(price)')).properties(width=250, height=250)

hist_chart1

#### <font color='green'> 1.2.2 Variables = Room type</font>

In [12]:
#### Share of listings per room type
df_roomtype = (
    cleaned_data
    .groupby(['room_type'])
    .agg(counter=('id', 'count'))
    .reset_index()
)
tot = len(cleaned_data)

# Fraction of all listings, rounded to three decimals (rendered as a percentage below)
df_roomtype['percentage'] = (df_roomtype['counter'] / tot).round(3)

######################## 2A. Horizontal bars, largest share first ##################################
share_order = alt.EncodingSortField(field="percentage", order="descending")
bars = (
    alt.Chart(df_roomtype)
    .mark_bar(size=20, color='#00A699')
    .encode(
        x=alt.X('percentage:Q', axis=None),
        y=alt.Y('room_type:N', axis=alt.Axis(tickCount=4, title=''), sort=share_order),
    )
)

###################### 2B. Percentage label next to each bar ##################################
text = bars.mark_text(align='left', baseline='middle', dx=3, color='#FF5A5F').encode(
    text=alt.Text('percentage:Q', format='.1%')
)

labelled_bars = bars + text
(
    labelled_bars
    .configure_view(strokeWidth=0)
    .configure_scale(bandPaddingInner=0.2)
    .properties(width=500, height=180)
)

In [13]:
### How the median price varies with room type
grp_roomtype = cleaned_data.groupby('room_type').agg({'price': 'median', 'id': 'count'}).reset_index()

# Only the last expression of a cell is rendered automatically, so the summary
# table is shown with an explicit display() call.
display(grp_roomtype)


alt.Chart(grp_roomtype).mark_bar(size = 20, color = '#00A699' ).encode(
        x = alt.X('room_type', sort = alt.EncodingSortField(
                                    field="price",
                                    order="descending"
                                        )),
        y = alt.Y('price'),
    tooltip = 'price').properties(width = 150, height = 300)

#### <font color='green'> 1.2.3 Variables = Number of Amenities</font>

In [14]:
#### Does the number of amenities have any relation to price?
### Uses the amenities_count field created by Michael

#### Cumulative distribution of amenities_count

df = cleaned_data['amenities_count'].to_frame()

pop_density_chart3 = (
    alt.Chart(df)
    .transform_density(
        density='amenities_count',
        cumulative=True,
        as_=['amenities_count', 'density'],
    )
    .mark_area(color="#ff5a5f", opacity=0.7)
    .encode(
        x=alt.X('amenities_count', axis=alt.Axis(title=" Number of amenities")),
        y=alt.Y('density:Q', axis=alt.Axis(title="Number of properties(cumulative %)", format='%')),
    )
)

# Rule at the median amenities count (despite the "mean" in the variable name)
pop_mean_chart = alt.Chart(df).mark_rule(color="#00a699", size=1).encode(
    x="median(amenities_count)"
)

# Rules at the 5th and 95th percentiles
pop_95p = (
    alt.Chart(df)
    .transform_quantile(quantile='amenities_count', probs=[0.05, 0.95])
    .mark_rule(color='#767676', size=1)
    .encode(x=alt.X('value:Q'), tooltip='value:Q')
)

pop_density_chart3 + pop_mean_chart + pop_95p

In [15]:
### Cap amenities_count at 50, the value used here as its 95th percentile
cleaned_data['amenities_count_new'] = cleaned_data['amenities_count'].clip(upper=50)

In [16]:
# Correlation (pandas default: Pearson) between the capped amenities count and price
cleaned_data['amenities_count_new'].corr(cleaned_data['price'])

0.06399702198279442

In [17]:
# Median price at each (capped) amenities count
amenities_points = alt.Chart(cleaned_data).mark_point(filled=True, size=60, color='#FF5A5F')
amenities_points.encode(
    x=alt.X('amenities_count_new'),
    y=alt.Y('median(price)'),
).properties(title='Price Vs Amenities count')

#### <font color='green'> 1.2.4 Variables = Borough</font>

In [18]:
# Borough boundary geometries used as the base for the maps
gdf = gpd.read_file('./assets/raw_assets/neighbourhoods.geojson')

# Median listing price per borough
cleaned_data_neigh = (
    cleaned_data
    .groupby('Borough')
    .agg(median_price=('price', 'median'))
    .reset_index()
)

# Attach the median price to each boundary; the inner join keeps only names found in both tables
neig_final = pd.merge(
    left=gdf,
    right=cleaned_data_neigh,
    left_on='neighbourhood',
    right_on='Borough',
    how='inner',
)

In [19]:
### Serialise the merged frame to JSON text, parse it back, and pass only the
### 'features' list to Altair as inline data
choro_json = json.loads(neig_final.to_json())
choro_features = choro_json['features']
choro_data = alt.Data(values=choro_features)

In [20]:
# Choropleth: boroughs shaded by median listing price
median_price_colour = alt.Color(
    'properties.median_price:Q',
    scale=alt.Scale(scheme='yellowgreenblue'),
)
choro_chart1 = (
    alt.Chart(choro_data)
    .mark_geoshape(stroke='black', strokeWidth=1)
    .encode(
        color=median_price_colour,
        tooltip=["properties.neighbourhood:N", 'properties.median_price:Q'],
    )
)

choro_chart1.properties(width=500, height=500)

#### <font color='green'> 1.2.5 Variables = Distance from London Underground Station</font>

In [21]:
### Read the London Tube station tables from the Wikimedia Commons page
### (pd.read_html returns a list with one DataFrame per HTML table it finds)
tube1 = pd.read_html(
    io='https://commons.wikimedia.org/wiki/London_Underground_geographic_maps/Tables'
)

In [22]:
# Keep the first table of the list returned by read_html
tube_data = tube1[0]

In [23]:
# Preview the first rows of the station table (id, coordinates, name, zone, ...)
tube_data.head()

Unnamed: 0,id,latitude,longitude,name,display_name,zone,total_lines,rail
0,1,51.5028,-0.2801,Acton Town,Acton<br />Town,3.0,2,0
1,2,51.5143,-0.0755,Aldgate,,1.0,2,0
2,3,51.5154,-0.0726,Aldgate East,Aldgate<br />East,1.0,2,0
3,4,51.5107,-0.013,All Saints,All<br />Saints,2.0,1,0
4,5,51.5407,-0.2997,Alperton,,4.0,1,0


In [24]:
#### Plot the London Underground stations on top of the borough outlines
choro_background = alt.Chart(choro_data).mark_geoshape(
    stroke='black', strokeWidth=1, fill='#00A699', opacity=0.2
)

# One filled point per station, positioned by its coordinates
station_marks = alt.Chart(tube_data).mark_point(filled=True, size=50, color='#FF5A5F', opacity=1)
points2 = station_marks.encode(
    longitude='longitude:Q',
    latitude='latitude:Q',
    tooltip='name',
)

alt.layer(choro_background, points2).properties(
    width=800, height=800, title='London Tube Stations'
)

In [25]:
#### Spread of the distance to the closest tube station (basis for the bins built next)
station_dist_probs = [0.05, 0.1, 0.5, 0.75, 0.9, 0.95, 1]
cleaned_data['closest_station_dist'].quantile(station_dist_probs)

0.05     0.136951
0.10     0.197444
0.50     0.592711
0.75     1.262613
0.90     2.538226
0.95     4.186837
1.00    19.797056
Name: closest_station_dist, dtype: float64

In [26]:
#### Bin the distance to the closest station on the (rounded) percentile values
tube_dist_edges = [0, 0.137, 0.198, 0.593, 1.263, 2.538, 4.187, float('Inf')]
tube_dist_labels = ['1. <5perc', '2. 5-10perc', '3. 10-50perc', '4. 50-75perc',
                    '5. 75-90perc', '6. 90-95perc', '7. >95perc']

# include_lowest=True keeps a distance of exactly 0 in the first bin; pd.cut's
# default intervals are open on the left, which would turn such a value into NaN.
cleaned_data['perc_tube_dist_bin'] = pd.cut(
    cleaned_data['closest_station_dist'],
    bins=tube_dist_edges,
    labels=tube_dist_labels,
    include_lowest=True,
)


In [27]:
### How the median price varies with distance to the closest tube station
# observed=False spells out the behaviour this cell relies on for the categorical
# bin column: every bin stays in the result, even one without listings.
grp_tubedist = (
    cleaned_data
    .groupby('perc_tube_dist_bin', observed=False)
    .agg({'price': 'median', 'id': 'count'})
    .reset_index()
)

# Only the last expression of a cell is rendered automatically, so the summary
# table is shown with an explicit display() call.
display(grp_tubedist)


alt.Chart(grp_tubedist).mark_bar(size = 20, color = '#FF5A5F' ).encode(
        x = alt.X('perc_tube_dist_bin'),
        y = alt.Y('price'),
    tooltip = 'price').properties(width = 250, height = 300, title = 'Price Vs distance from tube station')

#### <font color='green'> 1.2.6 Variables = Distance from Tourist destinations</font>

In [28]:
# Visitor-attraction locations, read with the windows-1252 encoding
places = pd.read_csv(
    filepath_or_buffer='./assets/cleaned_assets/visitor.csv',
    encoding='windows-1252',
)

In [29]:
# Borough outlines as a grey backdrop
choro_background = alt.Chart(choro_data).mark_geoshape(
    stroke='black', strokeWidth=1, fill='#767676', opacity=0.5
)

# One red point per visitor attraction, positioned by its coordinates
attraction_marks = alt.Chart(places).mark_point(filled=True, size=100, color='red')
points3 = attraction_marks.encode(
    longitude='Longitude:Q',
    latitude='Latitude:Q',
    tooltip='Site',
)

alt.layer(choro_background, points3).properties(width=700, height=700)

In [30]:
# Spread of the distance to the closest point of interest (1 = maximum)
poi_dist_probs = [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 1]
cleaned_data['closest_POI_dist'].quantile(poi_dist_probs)

0.10     0.621942
0.25     1.095666
0.50     2.177274
0.75     3.943268
0.90     6.622513
0.95     8.598491
1.00    21.911856
Name: closest_POI_dist, dtype: float64

In [31]:
# Round the distance to the closest point of interest to one decimal place.
# Series.round is vectorised, unlike calling Python's round() per row via apply.
cleaned_data['closest_POI_dist1'] = cleaned_data['closest_POI_dist'].round(1)


In [32]:
# Cap the rounded distance at 8.5 so the long tail does not stretch the x-axis
cleaned_data['closest_POI_dist2'] = cleaned_data['closest_POI_dist1'].clip(upper=8.5)

# Median price at each (capped, rounded) distance from the closest attraction
alt.Chart(cleaned_data).mark_point(filled = True, color="#00a699").encode(
    x = alt.X('closest_POI_dist2'),
    y = alt.Y('median(price)'),
    tooltip = 'median(price)').properties(title = 'Price vs Distance from a tourist attraction')

#### <font color='green'> 1.2.7 Variables = Distance from City Center</font>

In [33]:
# Round the distance from the city centre to one decimal place (vectorised
# Series.round instead of a per-row apply), then show the spread of the
# unrounded values.
cleaned_data['dist_from_center1'] = cleaned_data['dist_from_center'].round(1)
cleaned_data.dist_from_center.quantile([0.1,0.25,0.5,0.75,0.9, 0.95, 1])

0.10     2.585173
0.25     4.192520
0.50     6.206800
0.75     9.720720
0.90    14.062630
0.95    16.642627
1.00    29.209746
Name: dist_from_center, dtype: float64

In [34]:


# Cap the rounded distance from the city centre at 16.6.
# Both the cap test and the kept values must come from dist_from_center1;
# taking the uncapped values from closest_POI_dist1 would plot attraction
# distances under a "distance from city center" title.
cleaned_data['dist_from_center2'] = cleaned_data['dist_from_center1'].clip(upper=16.6)

# Median price at each (capped, rounded) distance from the city centre
alt.Chart(cleaned_data).mark_point(filled = True, color="#00a699").encode(
    x = alt.X('dist_from_center2'),
    y = alt.Y('median(price)'),
    tooltip = 'median(price)').properties(title = 'Price vs Distance from city center')



#### <font color='green'> 1.2.8 Variables = Property rates</font>

In [35]:
# Borough-level house prices
property1 = pd.read_csv(
    filepath_or_buffer='./assets/raw_assets/propertyprices.csv',
    encoding='windows-1252',
)

# Strip the '\xa3' character and the thousands separators, then parse as float
currency_noise = {'\xa3': '', ',': ''}
property1['price_new'] = (
    property1['Oct_2021 (in GBP)']
    .replace(currency_noise, regex=True)
    .astype(float)
)

# Rename 'City of Westminster' to 'Westminster' before joining on the borough name
is_city_of_westminster = property1['London borough'] == 'City of Westminster'
property1['London borough'] = np.where(
    is_city_of_westminster, 'Westminster', property1['London borough']
)

prop_final = pd.merge(
    left=gdf,
    right=property1,
    left_on='neighbourhood',
    right_on='London borough',
    how='inner',
)

### Serialise the merged frame to JSON text and keep only the 'features' list
### (this rebinds the names choro_json and choro_data)
choro_json = json.loads(prop_final.to_json())
choro_data = alt.Data(values=choro_json['features'])

# Choropleth: boroughs shaded by house price
house_price_colour = alt.Color(
    'properties.price_new:Q',
    scale=alt.Scale(scheme='yellowgreenblue'),
)
choro_chart4 = (
    alt.Chart(choro_data)
    .mark_geoshape(stroke='black', strokeWidth=1)
    .encode(
        color=house_price_colour,
        tooltip=["properties.neighbourhood:N", 'properties.price_new:Q'],
    )
)

choro_chart4

In [36]:
# Attach the borough house price to every listing (inner join on the borough name)
cleaned_data1 = cleaned_data.merge(
    property1,
    left_on='Borough',
    right_on='London borough',
    how='inner',
)

# Median Airbnb price at each borough house price
house_price_axis = alt.X('price_new', scale=alt.Scale(domain=[300000, 1500000]), title='House_price')
airbnb_price_axis = alt.Y('median(price)', title='Airbnb median price')
alt.Chart(cleaned_data1).mark_point(filled=True, color='red', size=60).encode(
    x=house_price_axis,
    y=airbnb_price_axis,
)