In [2]:
import os
import sys

import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [3]:
import pandas as pd
# import s3fs
# Never truncate long cell values (e.g. URLs) when frames are displayed.
# `None` is the supported "no limit" value (pandas >= 1.0); the former `-1`
# sentinel is deprecated and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
# Put the parent directory on the import path — presumably where the
# project-local `aws_utils` module imported right below lives.
sys.path.append(os.path.join(os.getcwd(), '..'))
from aws_utils import read_csv_as_dataframe

# Read and clean input

In [4]:
def read_data_apartments(website, location):
    """Load the listings CSV for one website/location pair and tag its origin.

    Reads ``{website}_db.csv`` from
    ``s3://data-apartments/production/{website}/{location}/`` through
    ``read_csv_as_dataframe`` (passing ``'downloaded'`` as the timestamp column
    name) and then sets two constant columns on the result.

    Args:
        website: Source site identifier; used in the S3 prefix, in the file
            name and as the value of the ``website`` column.
        location: Location identifier; used in the S3 prefix and as the value
            of the ``city`` column.

    Returns:
        The object returned by ``read_csv_as_dataframe`` with the ``website``
        and ``city`` columns set.
    """
    source_prefix = f's3://data-apartments/production/{website}/{location}/'
    listings = read_csv_as_dataframe(
        bucket_path=source_prefix,
        filename=f'{website}_db.csv',
        timestamp_column_name='downloaded',
    )
    # Record where each row came from so the per-site frames can be stacked later.
    listings['website'] = website
    listings['city'] = location
    return listings

# Load the 'warszawa' listings of every scraped website and stack them into a
# single frame (olx first, then otodom, then gumtree).
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0, so the
# frames are combined with `pd.concat`, which yields the same rows in the same
# order. As with `append`, each frame keeps its own row labels and columns are
# not sorted.
df = pd.concat(
    [read_data_apartments(website, 'warszawa') for website in ('olx', 'otodom', 'gumtree')],
    sort=False,
)


EndpointConnectionError: Could not connect to the endpoint URL: "https://data-apartments.s3.None.amazonaws.com/?list-type=2&prefix=production%2Folx%2Fwarszawa%2F&delimiter=%2F&encoding-type=url"

In [None]:
# Drop the `tracking_id` column, then discard listings that miss any field the
# later steps depend on: coordinates, owner, rooms or area.
# `df` is rebound rather than mutated in place and the drop tolerates an
# already-missing column, so this cell can be re-run safely (an in-place
# `drop` raises KeyError the second time because the column is gone).
required_columns = ['latitude', 'longitude', 'owner', 'rooms', 'area']
df = (df.drop(columns='tracking_id', errors='ignore')
        .dropna(subset=required_columns))
# Total number of missing values still present anywhere in the frame.
print(df.isnull().sum().sum())
# All further transformations happen on `d`; `df` stays as the cleaned input.
d = df.copy()

In [None]:
# Preview the last rows of the input frame.
df.tail()

## Map rooms to numerical value

In [None]:
# Polish room-count labels used by the listing sites -> integer room count.
# "6 lub więcej pokoi" ("6 or more rooms") is recorded as 6.
rooms = {
    "Kawalerka lub garsoniera": 1,
    "4 pokoje": 4,
    "6 lub więcej pokoi": 6,
    "5 pokoi": 5,
    "2 pokoje": 2,
    "3 pokoje": 3,
}
# Map the labels first (values that are already numeric are not dict keys and
# pass through unchanged), then parse whatever is left and cast to int.
# The result is assigned back to the column: calling
# `d.rooms.replace(..., inplace=True)` on the Series returned by attribute
# access is chained assignment, which warns in pandas 2.x and does not modify
# `d` under Copy-on-Write. `pd.to_numeric(errors='ignore')` is deprecated too,
# so parsing now uses the default `errors='raise'` and an unknown label fails
# loudly at this line.
d['rooms'] = pd.to_numeric(d['rooms'].replace(rooms)).astype(int)

## Organize owner column

In [None]:
# Per-website (Polish) seller labels -> one of two categories.
owner = {
    'Osoby prywatnej': 'private',
    'Biuro / Deweloper': 'agency',
    'Oferta biura nieruchomości': 'agency',
    'Oferta prywatna': 'private',
    'Agencja': 'agency',
    # "Właściciel" is Polish for "owner", i.e. the listing was placed by the
    # property owner rather than an agency ("Agencja"); it used to be mapped
    # to 'agency'.
    'Właściciel': 'private',
}

# Also map the target categories onto themselves so that re-running this cell
# leaves already-translated values intact instead of turning them into NaN.
owner_categories = {category: category for category in set(owner.values())}
# Labels absent from the mapping become NaN; the `unique()` output below makes
# any such gap visible.
d['owner'] = d['owner'].map({**owner, **owner_categories})
d['owner'].unique()

## Calculate distance from center in degrees

In [None]:
# Preview the working frame before the distance-from-center columns are added.
d.head()

In [None]:
# Reference point used as "the center", in decimal degrees
# (presumably central Warsaw — confirm).
center_latitude = 52.229719
center_longitude = 21.011381

# Per-axis absolute offset from the reference point. These are differences in
# degrees along each axis, not a metric distance.
latitude_offset = d['latitude'].sub(center_latitude)
longitude_offset = d['longitude'].sub(center_longitude)
d['latitude_from_center'] = latitude_offset.abs()
d['longitude_from_center'] = longitude_offset.abs()

## Map districts

In [None]:
# District names accepted by the filter below.
districts = ['Mokotów', 'Ursus', 'Bemowo', 'Śródmieście', 'Wilanów', 'Targówek', 'Ochota', 'Białołęka', 'Ursynów',
             'Włochy', 'Wawer', 'Wola', 'Bielany', 'Wesoła', 'Żoliborz', 'Rembertów', 'Praga-Północ', 'Praga-Południe']

# Aliases / alternative spellings, mapped to the names used in `districts`.
districts_map = {
    'Praga Południe': 'Praga-Południe',
    'Praga Północ': 'Praga-Północ',
    'Centrum': 'Śródmieście',
}

# Assign the normalised names back to the column. An in-place `replace` on the
# Series returned by `d.district` is chained assignment: it warns in
# pandas 2.x and leaves `d` untouched under Copy-on-Write.
d['district'] = d['district'].replace(districts_map)
# Keep only rows whose district is one of the accepted names; rows with a
# missing or unrecognised district are dropped.
d = d[d['district'].isin(districts)].copy()
d.shape

### One-hot encode districts and owner type

In [None]:
# One-hot encode `district` and `owner` in a single pass. The indicator columns
# (named `district_<value>` / `owner_<value>`) are appended on the right, with
# the district indicators first; the original two columns are kept.
indicator_columns = pd.get_dummies(d[['district', 'owner']], columns=['district', 'owner'])
d = pd.concat([d, indicator_columns], axis=1)

## Remove outliers/ quirks

In [None]:
# Upper bounds used to discard implausible listings. Each condition is applied
# in turn, in place; a row is kept only if it satisfies all of them (rows
# where the compared value is NaN do not satisfy the condition and are dropped).
outlier_filters = (
    'price < 30000000',
    'price_per_m < 1000000',
    'area < 1000',
)
for condition in outlier_filters:
    d.query(condition, inplace=True)

## Select appropriate columns

In [None]:
# Feature matrix for clustering: every column whose name contains 'owner_'
# (the owner indicator columns) followed by the numeric listing attributes.
# The district indicator columns are not part of the feature set.
columns = [name for name in d.columns if 'owner_' in name]
numeric_features = ['area', 'rooms', 'price', 'latitude_from_center', 'longitude_from_center']
X = d[columns + numeric_features]

# Start clustering

## Determine number of clusters

In [None]:
# Elbow search: fit KMeans for k = 10..29 on the standardised features and
# record the inertia (sum of squared distances of samples to their closest
# cluster centre) for each k.
# KMeans, StandardScaler and plotly come from the import cell at the top of the
# notebook, so this cell also works on a fresh kernel.

# Standardisation does not depend on k, so it is done once, outside the loop.
scaler = StandardScaler()
x = scaler.fit_transform(X)

clus_num = list(range(10, 30))
inertias = []
for k in clus_num:
    # Fixed seed: KMeans initialisation is random, and an unseeded run gives a
    # slightly different curve every time.
    kmeans = KMeans(n_clusters=k, random_state=0).fit(x)
    inertias.append(kmeans.inertia_)

fig = go.Figure(data=go.Scatter(x=clus_num, y=inertias, mode='markers'))
fig.update_layout(title='KMeans inertia by number of clusters',
                  xaxis_title='number of clusters (k)',
                  yaxis_title='inertia')
fig.show()
# 19 clusters it is
# NOTE(review): the clustering cell that follows uses k = 29, not 19 — confirm
# which value is intended.

In [None]:
# Final clustering on the standardised feature matrix.
# KMeans and StandardScaler are imported with the other dependencies in the
# import cell at the top of the notebook.
scaler = StandardScaler()
x = scaler.fit_transform(X)

# NOTE(review): the elbow cell concludes "19 clusters it is", yet k is 29 here
# (29 is also the last value its loop leaves behind) — confirm the intended value.
k = 29
# Fixed seed: KMeans initialisation is random, so without it the cluster
# labels — and every per-cluster plot and table below — change between runs.
kmeans = KMeans(n_clusters=k, random_state=0)
y_pred = kmeans.fit_predict(x)

# Compact view: price, area and rooms next to the assigned cluster label.
clusters = X[['price', 'area', 'rooms']].assign(cluster=y_pred)

In [None]:
# Attach each listing's cluster label to a fresh copy of the working frame.
# `y_pred` is matched to rows by position, so `d` must still contain exactly
# the rows (in the same order) that `X` was built from.
d = d.copy()
d['cluster'] = y_pred

In [None]:
# Assign district numerical value
district_num = {l:k for k,l in enumerate(d.district.unique())}
d['district_num'] = d.district.map(district_num)

In [None]:
# Preview the working frame, now including the `cluster` and `district_num` columns.
d.head()

In [None]:
# `price_per_m` against district, one point per listing, coloured by cluster.
hover_columns = ['rooms', 'url', 'item_id', 'website']
fig = px.scatter(
    d,
    x="price_per_m",
    y="district",
    color="cluster",
    hover_data=hover_columns,
)
fig.show()

In [None]:
# Latitude offset against longitude offset from the center point, one point
# per listing, coloured by cluster.
hover_columns = ['rooms', 'url', 'item_id', 'website']
fig = px.scatter(
    d,
    x="latitude_from_center",
    y="longitude_from_center",
    color="cluster",
    hover_data=hover_columns,
)
fig.show()

In [None]:
# Room count against cluster label, one point per listing, coloured by owner category.
hover_columns = ['rooms', 'url', 'item_id', 'website']
fig = px.scatter(
    d,
    x="rooms",
    y="cluster",
    color="owner",
    hover_data=hover_columns,
)
fig.show()

In [None]:
# Raise the row display limit so the per-cluster table is shown in full.
pd.set_option('display.max_rows', 9999)

# Statistics to compute for each cluster: column -> list of aggregations.
cluster_summary_spec = {
    'price_per_m': ['mean', 'std', 'count'],
    'price': ['mean', 'std'],
    'rooms': ['median'],
    'latitude': ['mean'],
    'longitude': ['mean'],
}
d.groupby(by=['cluster']).agg(cluster_summary_spec)