In [1]:
import time
import datetime
from dateutil.relativedelta import relativedelta
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'. Needed to remove SettingWithCopyWarning warning when assigning new value to dataframe column
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline
import seaborn as sns
import plotly.express as px

In [2]:
# Load the encoded ads export and normalise the column names that contain
# spaces or crawler-specific wording, so later cells can use plain identifiers.
local_path = '../housing_crawler/data/all_encoded.csv'
df = (
    pd.read_csv(local_path)
    .rename(
        columns={
            'WG_size': 'capacity',
            'available from': 'available_from',
            'available to': 'available_to',
        }
    )
)
df.head()

Unnamed: 0,id,url,type_offer,landlord_type,title,price_euros,size_sqm,available_rooms,capacity,available_spots_wg,...,diverse_flatmates,published_on,published_at,address,city,crawler,latitude,longitude,available_from,available_to
0,9523767,https://www.wg-gesucht.de/wohnungen-in-Berlin-...,3 Zimmer Wohnung,Verifiziert,Zentrumswohnung mit gehobener Ausstattung in B...,1790,79,3.0,0,0,...,0,08.08.2022,22.0,"Sybelstraße 49, Charlottenburg, Berlin",Berlin,WG-Gesucht,52.501882,13.303057,01.09.2022,31.12.2023
1,9526897,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-...,WG,Private,WG-Zimmer in 3er WG direkt am Volkspark Wilmer...,396,25,1.0,3,1,...,0,08.08.2022,23.0,"Livländische Str 2, Wilmersdorf, Berlin",Berlin,WG-Gesucht,52.482518,13.325919,01.09.2022,
2,9122485,https://www.wg-gesucht.de/1-zimmer-wohnungen-i...,1 Zimmer Wohnung,Private,!! Only Swap/ Nur Tausch !! Gut gelegene 1 Zim...,555,36,1.0,0,0,...,0,08.08.2022,23.0,"Boxhagener Kiez, Friedrichshain, Berlin",Berlin,WG-Gesucht,-1.0,-1.0,08.08.2022,
3,9526779,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-...,WG,Private,Schönes Zimmer in freundlicher WG am Park,350,9,1.0,2,1,...,0,08.08.2022,23.0,"Hochstraße, Gesundbrunnen, Berlin",Berlin,WG-Gesucht,52.548468,13.383361,01.09.2022,15.12.2022
4,5819435,https://www.wg-gesucht.de/1-zimmer-wohnungen-i...,1 Zimmer Wohnung,Private,1 Room Apartment Schillerkiez- Tempelhofer Fel...,837,41,1.0,0,0,...,0,08.08.2022,23.0,"Herrfurthplatz, Neukölln, Berlin",Berlin,WG-Gesucht,52.477091,13.422076,17.08.2022,31.08.2022


In [3]:
def prepare_data_types(ads_df=None):
    """Cast the publication hour and the date columns to proper dtypes.

    Parameters
    ----------
    ads_df : pandas.DataFrame, optional
        Frame with the columns ``published_at``, ``published_on``,
        ``available_from`` and ``available_to``. The three date columns are
        parsed with the ``%d.%m.%Y`` format. When omitted, the notebook-level
        ``df`` is used; it is looked up at call time, so the function can be
        defined before ``df`` exists and never holds on to a stale frame.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in; the columns are converted
        in place.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_datetime`` when a date does not match the
        expected format.
    """
    if ads_df is None:
        # Late-bound fallback to the notebook-level frame.
        ads_df = df

    # Prepare data types
    # Nullable Int64 can hold missing values, which plain int/int64 cannot.
    ads_df['published_at'] = ads_df['published_at'].astype('Int64')

    for date_col in ('published_on', 'available_from', 'available_to'):
        ads_df[date_col] = pd.to_datetime(ads_df[date_col], format="%d.%m.%Y")

    return ads_df

In [4]:
def create_standard_columns(ads_df=None):
    """Add the derived columns used throughout the analysis.

    Three columns are added in place:

    * ``day_of_week_publication`` - first three letters of the English day
      name of ``published_on`` (missing dates stay missing).
    * ``type_offer_simple`` - ``type_offer`` collapsed to the categories
      'Single-room flat', 'Apartment', 'Flatshare' and 'House'; values that
      match none of them (including missing values) are kept unchanged.
    * ``price_per_sqm`` - ``price_euros / size_sqm`` rounded to 2 decimals.

    Parameters
    ----------
    ads_df : pandas.DataFrame, optional
        Frame with ``published_on`` (datetime), ``type_offer``,
        ``price_euros`` and ``size_sqm``. When omitted, the notebook-level
        ``df`` is used, looked up at call time.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in, with the new columns.
    """
    if ads_df is None:
        # Late-bound fallback to the notebook-level frame.
        ads_df = df

    # Create day of the week column with first 3 letters of the day name.
    # The .str accessor leaves missing dates as NaN instead of raising.
    ads_df['day_of_week_publication'] = ads_df['published_on'].dt.day_name().str[:3]

    # Simplify type of offer to match searches at wg-gesucht.de
    offer = ads_df['type_offer']
    # The look-behind stops "11 Zimmer Wohnung" / "21 Zimmer Wohnung" from
    # being mistaken for a single-room flat.
    is_single_room = offer.str.contains(r'(?<!\d)1 Zimmer Wohnung', regex=True, na=False)
    is_apartment = offer.str.contains('Zimmer Wohnung', regex=False, na=False)
    is_flatshare = offer.str.contains('WG', regex=False, na=False)
    is_house = offer.str.contains('Haus', regex=False, na=False)

    # Apply from lowest to highest priority so that later masks win:
    # single-room flat > apartment > flatshare > house > original value.
    ads_df['type_offer_simple'] = (
        offer
        .mask(is_house, 'House')
        .mask(is_flatshare, 'Flatshare')
        .mask(is_apartment, 'Apartment')
        .mask(is_single_room, 'Single-room flat')
    )

    # Create price/sqm column
    ads_df['price_per_sqm'] = (ads_df['price_euros'] / ads_df['size_sqm']).round(2)

    return ads_df

In [20]:
def filter_out_bad_entries(ads_df=None, country = 'Germany',
                           price_max = 4000, price_min = 50,
                          size_max = 400, size_min = 3,
                          date_max = None, date_min = None, date_format = "%d.%m.%Y"):
    """Keep only ads inside the date window with plausible price, size and location.

    Parameters
    ----------
    ads_df : pandas.DataFrame, optional
        Frame with ``published_on`` (datetime), ``price_euros``, ``size_sqm``,
        ``latitude`` and ``longitude``. The input frame is not modified. When
        omitted, the notebook-level ``df`` is used, looked up at call time.
    country : str
        'Germany' or 'DE' (case-insensitive) blanks coordinates outside the
        German bounding box; any other value leaves coordinates untouched.
    price_max, price_min : number
        Rows are kept when ``price_min < price_euros <= price_max``
        (the lower bound is exclusive).
    size_max, size_min : number
        Rows are kept when ``size_min <= size_sqm <= size_max``.
    date_max : str, date-like or None
        Latest publication date to keep (inclusive). ``None`` or 'today'
        means the current local date.
    date_min : str, date-like or None
        Earliest publication date to keep (inclusive). ``None`` means three
        months before the current local date.
    date_format : str
        ``strftime`` format used to parse string values of ``date_max`` and
        ``date_min``.

    Returns
    -------
    pandas.DataFrame
        A new, filtered frame. Rows with a missing publication date, price
        or size are dropped because they fail the comparisons.

    Notes
    -----
    If a date bound cannot be parsed, a message is printed and the date
    filter is skipped; the price, size and coordinate steps still run.
    """
    if ads_df is None:
        # Late-bound fallback to the notebook-level frame.
        ads_df = df

    def _as_timestamp(bound):
        # Strings follow date_format; anything else date-like is handed to pd.Timestamp.
        if isinstance(bound, str):
            return pd.to_datetime(bound, format = date_format)
        return pd.Timestamp(bound)

    try:
        # Filter ads in between desired dates. Standard is to use ads from previous 3 months
        today = pd.Timestamp.today().normalize()

        if date_max is None or (isinstance(date_max, str) and date_max == 'today'):
            date_max = today
        else:
            date_max = _as_timestamp(date_max)

        if date_min is None:
            date_min = today - pd.DateOffset(months=3)
        else:
            date_min = _as_timestamp(date_min)

        # Boolean mask instead of a helper column, so the caller's frame is
        # left untouched. Both bounds are inclusive; NaT never matches.
        in_window = ads_df['published_on'].between(date_min, date_max)
        ads_df = ads_df[in_window]
    except ValueError:
        print('Date format was wrong. Please input a date in the format 31.12.2020 (day.month.year), or specify the date format you want to use using the "date_format" option.')

    ## Filter out unrealistic offers
    price, size = ads_df['price_euros'], ads_df['size_sqm']
    plausible = (
        price.le(price_max) & price.gt(price_min)
        & size.le(size_max) & size.ge(size_min)
    )
    # Explicit copy so the coordinate assignments below never write to a view.
    ads_df = ads_df[plausible].copy()

    if isinstance(country, str) and country.lower() in ['germany', 'de']:
        # Germany bounding box coordinates from here: https://gist.github.com/graydon/11198540
        lat_min, lat_max = 47.3024876979, 54.983104153
        lon_min, lon_max = 5.98865807458, 15.0169958839
        lat, lon = ads_df['latitude'], ads_df['longitude']
        # NOTE(review): each coordinate is blanked independently, so a row can
        # keep one valid coordinate while the other becomes NaN - confirm this is intended.
        ads_df['latitude'] = lat.where((lat > lat_min) & (lat < lat_max))
        ads_df['longitude'] = lon.where((lon > lon_min) & (lon < lon_max))

    return ads_df

In [24]:
# Run the cleaning pipeline: dtypes -> derived columns -> plausibility filter,
# then drop the columns that are no longer needed.
# Note: date_min=None makes the lower date bound depend on the day this cell
# is executed (three months before the current date).
df = (
    df
    .pipe(prepare_data_types)
    .pipe(create_standard_columns)
    .pipe(
        filter_out_bad_entries,
        country='Germany',
        price_max=4000,
        price_min=50,
        size_max=400,
        size_min=3,
        date_max='01.08.2022',
        date_min=None,
        date_format="%d.%m.%Y",
    )
    .drop(columns=['type_offer', 'crawler'])
)

In [25]:
# Preview the first rows of the cleaned and filtered ads.
# NOTE(review): the saved output of this cell still lists the 'type_offer' and
# 'crawler' columns that the preceding cell drops, so it looks stale - confirm by re-running.
df.head()

Unnamed: 0,id,url,type_offer,landlord_type,title,price_euros,size_sqm,available_rooms,capacity,available_spots_wg,...,address,city,crawler,latitude,longitude,available_from,available_to,day_of_week_publication,type_offer_simple,price_per_sqm
1088,9209523,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-...,WG,Private,WG in Lichtenberg,100,12,1.0,2,1,...,"Lincolnstrasse 2, Lichtenberg, Berlin",Berlin,WG-Gesucht,52.502073,13.50191,2022-08-05,2022-08-12,Mon,Flatshare,8.33
1089,9501176,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-...,WG,Private,Großes möbliertes sonniges WG Zimmer,550,20,1.0,2,2,...,"Friedrichsruher Str 37, Wilmersdorf, Berlin",Berlin,WG-Gesucht,52.488436,13.298692,2022-08-15,2023-07-31,Mon,Flatshare,27.5
1090,9508488,https://www.wg-gesucht.de/wohnungen-in-Berlin-...,2 Zimmer Wohnung,Private,Zwischenmiete in Schöneberg,710,56,2.0,0,0,...,"Elßholzstraße 19, Schöneberg, Berlin",Berlin,WG-Gesucht,52.491062,13.356278,2022-08-11,2022-09-04,Mon,Apartment,12.68
1091,9503156,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-...,WG,Private,Helles möbliertes WG Zimmer in Wilmersdorf,460,14,1.0,2,2,...,"Friedrichsruher Str 37, Wilmersdorf, Berlin",Berlin,WG-Gesucht,52.488436,13.298692,2022-08-15,2023-07-31,Mon,Flatshare,32.86
1092,9508601,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-...,WG,Private,"Schönes, großes und helles Zimmer in Altbauwoh...",700,25,1.0,2,1,...,"Torfstraße 17, Wedding, Berlin",Berlin,WG-Gesucht,52.541677,13.349815,2022-08-01,2022-08-28,Mon,Flatshare,28.0
