In [42]:
import requests
import json
import pandas as pd
import numpy as np
import seaborn as sns
import geopandas as gpd
import matplotlib.pyplot as plt
import folium
from folium import Choropleth, Circle, Marker, GeoJson
from folium.plugins import HeatMap, MarkerCluster
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Load data of Saint-Petersburg
We will develop pricing models that can be used to evaluate apartments located in the city of Saint Petersburg.

In [43]:
# Read the full listings CSV and keep only the rows whose region code is 2661
# (the subset this section calls Saint Petersburg), then preview the first rows.
df = pd.read_csv('/kaggle/input/russia-real-estate-20182021/all_v2.csv').loc[
    lambda listings: listings['region'] == 2661
]
df.head()

In [44]:
# Summary statistics (pandas `describe` defaults) for the filtered listings
df.describe()

In [45]:
# Column dtypes, non-null counts and memory usage of the filtered listings
df.info()

## EDA

### Interactive map 
The interquartile range of `price` describes the spread of the middle half of the price distribution. Almost all listings priced within that range are located along the main infrastructure of the city, in particular the metro and the coast.

In [46]:
# Overpass API: a web-based data-mining tool for OpenStreetMap.
# We use it to get the coordinates of the metro stations.
overpass_url = "https://maps.mail.ru/osm/tools/overpass/api//interpreter"

# Select every node/way/relation tagged station=subway inside the area
# tagged ISO3166-2=RU-SPE (admin_level 4) and return the result as JSON.
overpass_query = """ 
[out:json];
area["ISO3166-2"="RU-SPE"][admin_level=4];
(node["station"="subway"](area);
 way["station"="subway"](area);
 rel["station"="subway"](area);
);
out center;
"""

# A timeout keeps the cell from hanging forever if the endpoint is unreachable.
response = requests.get(overpass_url,
                        params={'data': overpass_query},
                        timeout=60)

# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
response.raise_for_status()
data = response.json()

In [47]:
# Collect the (lat, lon) pair of every element returned as a plain node.
# (numpy, pandas and matplotlib are already imported in the first cell.)
coords = [
    (element['lat'], element['lon'])
    for element in data['elements']
    if element['type'] == 'node'
]

# Convert the coordinates into an (n, 2) numpy array. The explicit reshape keeps
# the array two-dimensional even when no station was returned, so building the
# DataFrame below does not fail on an empty result.
X = np.array(coords, dtype=float).reshape(-1, 2)
metro_df = pd.DataFrame(X, columns=['lat', 'lon'])

In [48]:
%%time
# First quartile (25th percentile) of the listing prices
q1 = df['price'].quantile(.25)

# Third quartile (75th percentile) of the listing prices
q3 = df['price'].quantile(.75)

# Boolean mask of the prices between q1 and q3, both bounds included.
# `inclusive` is given as a string: the boolean form (`inclusive=True`) was
# deprecated in pandas 1.3 and is rejected by later releases.
mask = df['price'].between(q1, q3, inclusive='both')

# Keep only the listings priced within the interquartile range
iqr = df[mask]

In [49]:
# Base map with one marker per metro station
m_2 = folium.Map(location=[59.942076, 30.328747], tiles='cartodbpositron', zoom_start=13)
for _, station in metro_df.iterrows():
    Marker([station['lat'], station['lon']]).add_to(m_2)

# Coordinates of the listings priced within the interquartile range.
# Rows with a missing coordinate are dropped once here, so neither layer
# below has to deal with NaN values.
listing_points = iqr[['geo_lat', 'geo_lon']].dropna()

# Add a heatmap of those listings to the base map
HeatMap(data=listing_points, radius=10).add_to(m_2)

# Add the same listings as clustered markers on the same map.
# NOTE(review): one Marker per listing can make the rendered map very heavy
# if `iqr` is large -- consider sampling `listing_points` first.
mc = MarkerCluster()
for lat, lon in zip(listing_points['geo_lat'], listing_points['geo_lon']):
    mc.add_child(Marker([lat, lon]))
m_2.add_child(mc)

# Show the map
m_2

### Distribution of features

In [51]:
# Keep only the non-object columns; every one of them except 'region' is plotted.
df = df.select_dtypes(exclude=['object'])
features = [col for col in df.columns if col not in ['region']]

# Two rows of panels; the number of columns is derived from the features that
# are actually plotted, so the grid always has room for every panel.
n_rows = 2
n_cols = max(1, (len(features) + n_rows - 1) // n_rows)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 7), dpi=80, squeeze=False)
axes = axes.ravel()

# One distribution plot per feature, titled with the feature name
for ax, feature in zip(axes, features):
    sns.distplot(df[feature], color='green', ax=ax)
    ax.set(xlabel=None, title=feature)

# Hide any panel left over when the feature count does not fill the grid
for ax in axes[len(features):]:
    ax.set_visible(False)

# Lay out the figure once, after all panels have been drawn
fig.tight_layout()

In [52]:
# NOTE(review): disabled cell -- scatter plots of each feature (except 'region')
# against 'price'. Nothing below is executed.
# fig=plt.subplots(figsize=(25, 7), dpi=80)
# i=0
# for feature in df.columns:
#     if feature not in ['region']:
#         i+=1
#         plt.subplot(3, len(df.columns)//2, i)
#         sns.scatterplot(df[feature], df['price'], color='green')
#         plt.tight_layout()