In [2]:
import re
import time
import datetime
from dateutil.relativedelta import relativedelta
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'. Needed to remove SettingWithCopyWarning warning when assigning new value to dataframe column
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline
import seaborn as sns
import plotly.express as px

import statsmodels.formula.api as smf
import scipy.stats as stats

%load_ext autoreload
%autoreload 2

from housing_crawler.utils import save_file, get_file, crawl_ind_ad_page
from housing_crawler.analysis.ads_table_processing import prepare_data, filter_out_bad_entries, transform_columns_into_numerical, hot_encode_columns, feature_engineering


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [46]:
# Read the encoded ads table through the project helper.
all_ads = get_file(file_name='all_encoded.csv', local_file_path=f'housing_crawler/data')

# Columns removed from the processed frame at the end of this cell.
dropped_columns = [
    'details_searched',
    'cold_rent_euros',
    'mandatory_costs_euros',
    'extra_costs_euros',
    'transfer_costs_euros',
    'deposit',
    'zip_code',
    'home_total_size',
    'smoking',
    'wg_type',
    'age_range',
    'gender_search',
    'energy',
    'wg_possible',
    'building_type',
    'building_floor',
    'furniture',
    'kitchen',
    'shower_type',
    'tv',
    'floor_type',
    'heating',
    'public_transport_distance',
    'internet',
    'parking',
    'schufa_needed',
    'extras_waschmaschine',
    'extras_spuelmaschine',
    'extras_terrasse',
    'extras_balkon',
    'extras_garten',
    'extras_gartenmitbenutzung',
    'extras_keller',
    'extras_aufzug',
    'extras_haustiere',
    'extras_fahrradkeller',
    'extras_dachboden',
    'languages_deutsch',
    'languages_englisch',
    'number_languages',
    'min_age_flatmates',
    'max_age_flatmates',
]

# Processing steps run in this order; each one receives the previous result
# through its `ads_df` keyword.
df_processed = prepare_data(ads_df=all_ads)
df_processed = filter_out_bad_entries(
    ads_df=df_processed,
    country='Germany',
    price_max=6000,
    price_min=50,
    size_max=400,
    size_min=3,
    date_max=None,
    date_min=None,
    date_format="%d.%m.%Y",
)
df_processed = transform_columns_into_numerical(ads_df=df_processed)
# hot_encode_columns is currently skipped (call left commented out):
# df_processed = hot_encode_columns(ads_df = df_processed)
df_processed = feature_engineering(ads_df=df_processed)

# Keep only the columns that are not listed in `dropped_columns`.
df_processed = df_processed.drop(columns=dropped_columns)

===> Loaded all_encoded.csv locally


In [47]:
# Display the column labels currently present in df_processed.
df_processed.columns

Index(['id', 'url', 'landlord_type', 'title', 'price_euros', 'size_sqm',
       'available_rooms', 'capacity', 'available_spots_wg', 'male_flatmates',
       'female_flatmates', 'diverse_flatmates', 'published_on', 'published_at',
       'address', 'city', 'crawler', 'latitude', 'longitude', 'available_from',
       'available_to', 'type_offer_simple', 'day_of_week_publication',
       'price_per_sqm', 'days_available'],
      dtype='object')

In [50]:
# Count the non-null `url` entries for each `type_offer_simple` value and
# show the groups from largest to smallest.
(
    df_processed
    .groupby('type_offer_simple')[['url']]
    .count()
    .rename(columns={'url': 'count'})
    .sort_values(by=['count'], ascending=False)
)

Unnamed: 0_level_0,count
type_offer_simple,Unnamed: 1_level_1
WG,37147
Apartment,6810
Single-room flat,6337
House,107


In [48]:
## Filter type of offer
# Each query string is assembled from adjacent literals so that no line
# continuation (and its indentation) ends up inside the expression.

# WG ads restricted to the price and size bounds listed in the query.
wg_df = df_processed.query(
    'type_offer_simple == "WG"'
    ' & price_euros <= 2000'
    ' & price_euros > 50'
    ' & size_sqm <= 60'
    ' & size_sqm >= 3'
).reset_index(drop=True)

# Apartment and House ads priced above 100.
# The parentheses are required: `&` binds tighter than `|`, so without them
# the price floor would apply to "House" rows only and every "Apartment" row
# would pass regardless of price.
flathouse_df = df_processed.query(
    '(type_offer_simple == "Apartment" | type_offer_simple == "House")'
    ' & price_euros > 100'
).reset_index(drop=True)

# Every ad that is not a WG, priced above 100.
# The label compared against must be "WG" (the value used for wg_df above);
# comparing against "Flatshare" matched no row, so nothing was excluded.
notwg_df = df_processed.query(
    'type_offer_simple != "WG"'
    ' & price_euros > 100'
).reset_index(drop=True)