In [637]:
import os
import re
import json
from sklearn.preprocessing import MultiLabelBinarizer 
import itertools
import pandas as pd
import numpy as np
import webbrowser
from datetime import date

In [735]:
#--- basic cleaning ---#
def empty_list_to_string(x):
    """Return "" when x is an empty list; every other value is returned unchanged."""
    is_empty_list = isinstance(x, list) and len(x) == 0
    return "" if is_empty_list else x
    
def remove_unit_signs(x):
    """
    removes the unit markers "m²" and "€" together with any whitespace
    directly in front of them

    input:
        - x (str); any other value (e.g. None/NaN for a missing entry) is
          returned unchanged instead of raising a TypeError

    output:
        - str without the unit markers
    """
    if not isinstance(x, str):
        return x
    # raw string: "\s" inside a normal literal is an invalid escape sequence
    return re.sub(r"\s*m²|\s*€","",x)

def remove_thousand_dot(x):
    """
    removes every dot that is directly followed by three digits, i.e. the
    thousands separator of German number formatting ("1.234.567" -> "1234567")

    input:
        - x (str); any other value (e.g. None/NaN for a missing entry) is
          returned unchanged instead of raising a TypeError

    output:
        - str without thousands separators
    """
    if not isinstance(x, str):
        return x
    # raw string: "\." inside a normal literal is an invalid escape sequence
    return re.sub(r"\.(?=[0-9]{3})","",x)

def replace_commas(x):
    """
    replaces every comma that is directly followed by a digit with a dot
    (decimal comma -> decimal point, "101,98" -> "101.98")

    input:
        - x (str); any other value (e.g. None/NaN for a missing entry) is
          returned unchanged instead of raising a TypeError

    output:
        - str with decimal points
    """
    if not isinstance(x, str):
        return x
    return re.sub(",(?=[0-9]{1,2})",".",x)

def remove_rows_without_price_or_zip(df):
    """
    drops every row that has no price or no zip code. A value counts as
    missing when it is NaN/None or an empty string (prices are still strings
    at this stage, so an absent price shows up as "").

    input:
        - df with the columns "preis" and "plz"

    output:
        - new df with the complete rows only (a copy, so later column
          assignments do not operate on a slice of the input frame)
    """
    hasPrice = df["preis"].notna() & (df["preis"] != "")
    hasZip = df["plz"].notna() & (df["plz"] != "")
    mask = hasPrice & hasZip
    nrowComplete = df.shape[0]
    nrowClean = mask.sum()
    print(nrowComplete-nrowClean," rows without price and zip data were deleted")
    return df.loc[mask].copy()

def clean_df(df):
    """
    basic cleaning of the raw frame: empty lists become empty strings in every
    column, and the unit markers and thousands separators are removed from
    the price and area columns

    input:
        - df with the columns "preis", "wohnflaeche" and "grundstuecksflaeche"

    output:
        - new df; the input frame is not modified
    """
    def map_cells(frame, func):
        # column-wise Series.map is the element-wise equivalent of
        # DataFrame.applymap, which is deprecated since pandas 2.1
        return frame.apply(lambda col: col.map(func))

    df = map_cells(df, empty_list_to_string)
    unitSignsList = ["preis", "wohnflaeche","grundstuecksflaeche"]
    df[unitSignsList] = map_cells(
        df[unitSignsList],
        lambda value: remove_thousand_dot(remove_unit_signs(value)),
    )
    return df

#--- clean zip and place ---#
def get_zip_and_place(df):
    """
    splits the column "ort" into "plz" (first five-digit number found, ""
    if there is none) and "ortsname" (the text with every five-digit number
    removed) and drops "ort"

    input:
        - df with a column "ort"; entries that are not strings get plz ""
          and are passed through unchanged as ortsname

    output:
        - new df with "plz" and "ortsname" appended and without "ort";
          the input frame is not modified
    """
    zipPattern = re.compile(r"\d{5}")

    def extract_zip(value):
        # explicit checks instead of a bare except around findall(...)[0]
        if not isinstance(value, str):
            return ""
        match = zipPattern.search(value)
        return match.group(0) if match else ""

    def extract_place(value):
        if not isinstance(value, str):
            return value
        return zipPattern.sub("", value)

    return (df
            .assign(plz=df["ort"].apply(extract_zip),
                    ortsname=df["ort"].apply(extract_place))
            .drop(["ort"],axis=1))

#--- get binarized columns of characteristics ---#
def clean_merkmale(ser):
    """
    turns the comma separated characteristics strings into lists of
    characteristics

    A leading ", " and a trailing "," are removed, the string is split at
    every comma and one leading whitespace character is stripped from each
    item. Entries without any characteristic (empty string, None, NaN) become
    ["Keine Angabe"]; an entry with exactly one characteristic keeps it.

    input:
        - ser (Series of str / None)

    output:
        - Series of lists of str
    """
    def is_missing(value):
        return value is None or (isinstance(value, float) and value != value)

    def to_list(raw):
        if is_missing(raw):
            return ["Keine Angabe"]
        raw = re.sub(r'^, ','',raw)#comma at the start
        raw = re.sub(r',$','',raw)# comma at the end
        items = [re.sub(r'^\s','',a) for a in raw.split(',')]#eliminate whitespace
        # "".split(',') yields [""]: only that case means "nothing given".
        # Testing the length alone would also wipe single-item lists.
        return ["Keine Angabe"] if items == [""] else items

    ser = ser.apply(to_list)
    print("merkmale cleaned and put into list")
    return ser

def binarize_merkmale(df):
    """
    replaces the column "merkmale" by one 0/1 indicator column per
    characteristic (MultiLabelBinarizer)

    input:
        - df with a column "merkmale" (raw characteristics strings)

    output:
        - new df without "merkmale" and with the indicator columns appended;
          the input frame is not modified
    """
    mlb = MultiLabelBinarizer()
    # keep the cleaned lists in a local Series instead of writing them back
    # into the frame that was passed in
    merkmale = clean_merkmale(df["merkmale"])
    dfMerkmale = pd.DataFrame(mlb.fit_transform(merkmale)
                             ,columns=mlb.classes_
                             ,index=df.index)
    df = pd.concat([df.drop(['merkmale'],axis=1),dfMerkmale],axis=1)
    print("merkmale binarized")
    return df

def set_key_value_as_index(df):
    """
    takes the last word-character run of each url as listing id, stores it in
    a column "id" of df and returns df indexed by that id
    """
    def last_token(address):
        return re.findall(r"\w+",address)[-1]

    df["id"] = df.url.apply(last_token)
    return df.set_index("id",drop=True)

def drop_unprepared(df):
    """
    removes the secondary-information columns that have not been prepared yet
    (title, url, weitere_eigenschaften, beschreibung) and returns the reduced df
    """
    unpreparedCols = ["title","url","weitere_eigenschaften","beschreibung"]
    return df.drop(columns=unpreparedCols)



def prepare_price(df):
    """
    converts the column "preis" to float64

    Removes the placeholder "auf Anfrage" (= on request), strips leading and
    trailing whitespace (including non-breaking spaces), replaces decimal
    commas with dots and converts to float64. Empty strings and missing
    values become NaN.

    input:
        - df with a column "preis" (strings, or already float64)

    output:
        - df (same object; "preis" is replaced in place)

    raises:
        - ValueError (from pd.to_numeric) if a price still contains
          non-numeric text after the cleaning
    """
    if df.preis.dtypes != "float64":
        tmp_preis = df.preis
        # regex=False: literal match, independent of the pandas version's
        # default for `regex`. A non-breaking space that follows the
        # placeholder is removed by strip() below.
        tmp_preis = tmp_preis.str.replace('auf Anfrage','',regex=False)
        tmp_preis = tmp_preis.str.strip()
        # the .str methods keep missing values as NaN; only strings go
        # through the regex helper
        tmp_preis = tmp_preis.apply(lambda v: replace_commas(v) if isinstance(v,str) else v)
        # astype: an all-integer column would otherwise end up as int64 and
        # a second call would fail on the .str accessor
        df["preis"] = pd.to_numeric(tmp_preis).astype("float64")
        print("preis has been cleaned up and converted to float")
    else: 
        print("preis is already float")
    return df

def clean_numerical_cols(df):
    """
    converts the columns anzahl_raeume, wohnflaeche and grundstuecksflaeche
    from strings with decimal commas to numbers

    input:
        - df containing the three columns as strings

    output:
        - df (same object; the three columns are replaced in place)
    """
    numCols = ["anzahl_raeume","wohnflaeche","grundstuecksflaeche"]
    # Series.map + one pd.to_numeric call per column replaces the deprecated
    # DataFrame.applymap and the per-cell pd.to_numeric calls
    df[numCols] = df[numCols].apply(
        lambda col: pd.to_numeric(col.map(replace_commas)))
    print("numerical columns:",', '.join(numCols),"cleaned")
    return df

def unpack_list_elements(df):
    """
    some columns' elements are lists. This is incompatible with eg drop_duplicates().
    This function joins them into one string, items separated by ", ".

    Values that are not lists (plain strings, missing values) are kept as
    they are; joining a plain string would put ", " between its characters.

    input:
        - df with the columns "weitere_eigenschaften" and "beschreibung"

    output:
        - df (same object; the two columns are replaced in place)
    """
    def join_if_list(x):
        if isinstance(x,(list,tuple)):
            return ", ".join(x)
        return x

    listCols = ["weitere_eigenschaften","beschreibung"]
    # column-wise Series.map instead of the deprecated DataFrame.applymap
    df[listCols] = df[listCols].apply(lambda col: col.map(join_if_list))
    return df

# --- loading --- #

def extract_metadata_from_filename(jsonFileName):
    """
    splits a file name at "-" and returns its metadata as dict:
    the first three parts form the download date, the last part (without
    extension) is the transaction type, the second to last the object type
    and everything in between the search location
    """
    parts = jsonFileName.split("-")
    return {
        "transaktionsArt": parts[-1].split(".")[0],
        "objektArt": parts[-2],
        "datumDownload": "-".join(parts[:3]),
        "suchOrt": "-".join(parts[3:-2]),
    }

def insert_meta_data_columns(dfTmp,jsonFileName):
    """
    The json files do not contain metadata. It is read from the file name by
    extract_metadata_from_filename and inserted as constant columns at the
    start of the dataframe (in place).

    input:
        - dfTmp (df)
        - jsonFileName (str)

    output:
        - dfTmp with the metadata columns in front
    """
    metaData = extract_metadata_from_filename(jsonFileName)
    for position, (colName, colValue) in enumerate(metaData.items()):
        dfTmp.insert(loc=position, column=colName, value=colValue)
    return dfTmp

def get_pathdata_and_listfilenames(location="notebook"):
    """
    builds the path to the raw data folder and yields, for every file in it
    whose name ends with "json", the absolute path and the file name

    Parameters:
        location (str): "notebook" if this is run from the notebook folder
                        (data is looked up in ../data), any other string if
                        run from the main folder of the project (data is
                        looked up in ./data).

    Yields:
        (pathFile, fileName) tuples, sorted by file name

    Raises:
        FileNotFoundError (from os.listdir) if the data folder does not exist
    """
    if location == "notebook":
        pathData = os.path.join("..","data")
    else:
        pathData = "data"
    # os.listdir returns names in arbitrary order; sort for reproducible runs
    jsonFilesList = sorted(a for a in os.listdir(pathData) if a.endswith("json"))
    for fileName in jsonFilesList:
        pathFile = os.path.abspath(os.path.join(pathData,fileName))
        yield pathFile, fileName



def load_data(location="notebook"):
    """
    loads the json files of the data folder into one pandas frame and adds
    the metadata columns derived from the file names

        Parameters:
            location (str): "notebook" if this is run from the notebook folder, 
                            any other string if run from the main folder of the
                            project. Forwarded to get_pathdata_and_listfilenames.
            
        Returns:
            df: the concatenated frames of all files (the row index restarts
                for every file)

        Raises:
            ValueError (from pd.concat) if no json file was found
    """
    
    dfList = []
    for pathFile, nameFile in get_pathdata_and_listfilenames(location):
        # explicit encoding: the platform default is not UTF-8 everywhere
        # NOTE(review): assumes the files were written as UTF-8 - confirm
        with open(pathFile, encoding="utf-8") as data_file:
            data = json.load(data_file)
        dfTmp = pd.json_normalize(data,['objects'])
        dfTmp = insert_meta_data_columns(dfTmp,nameFile)
        dfList.append(dfTmp)
    print(len(dfList),' json-files have been loaded')
    df = pd.concat(dfList)
   
    return df

def load_and_prepare_data():
    """
    runs the complete pipeline: load the json files, index by listing id,
    clean, split zip/place, drop incomplete rows, binarize characteristics,
    unpack lists, convert price, download date and numerical columns and
    finally drop duplicated rows
    """
    def parse_download_date(frame):
        frame["datumDownload"] = pd.to_datetime(frame.datumDownload)
        return frame

    return (load_data()
            .pipe(set_key_value_as_index)
            .pipe(clean_df)
            .pipe(get_zip_and_place)
            .pipe(remove_rows_without_price_or_zip)
            .pipe(binarize_merkmale)
            .pipe(unpack_list_elements)
            .pipe(prepare_price)
            .pipe(parse_download_date)
            .pipe(clean_numerical_cols)
            .drop_duplicates())


In [736]:
# Run the whole load-and-clean pipeline; each step prints a progress line.
df = load_and_prepare_data()

22  json-files have been loaded
373  rows without price and zip data were deleted
merkmale cleaned and put into list
merkmale binarized
preis has been cleaned up and converted to float
numerical columns: anzahl_raeume, wohnflaeche, grundstuecksflaeche cleaned


In [737]:
# (rows, columns) of the listings whose search location is "berlin"
df[df.suchOrt=="berlin"].shape

(4285, 50)

In [536]:
# Lower-case the listing id; the ids shown in the index are lower case.
tmpInd = "2YM364K".lower()
tmpInd

'2ym364k'

In [537]:
# All rows stored under this listing id (the index is not unique: the same
# listing can appear once per download date).
df.loc["2YM364K".lower()]

Unnamed: 0_level_0,transaktionsArt,objektArt,datumDownload,suchOrt,url,title,preis,anzahl_raeume,wohnflaeche,grundstuecksflaeche,...,frei,provisionsfrei,renoviert,rollstuhlgerecht,saniert,seniorengerechtes Wohnen,teilweise klimatisiert,teilweise unterkellert,voll klimatisiert,voll unterkellert
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2ym364k,kaufen,haus,2021-04-13,norderstedt,https://www.immowelt.de/expose/2ym364k,Niendorf . Neubau . Exklusives Einfamilienhaus...,997000.0,5.0,133.0,555.0,...,1,0,0,0,0,0,0,0,0,0
2ym364k,kaufen,haus,2021-03-13,norderstedt,https://www.immowelt.de/expose/2ym364k,Niendorf . Neubau . Exklusives Einfamilienhaus...,997000.0,5.0,133.0,555.0,...,1,0,0,0,0,0,0,0,0,0


In [538]:
# Current date; the queries further down keep only rows downloaded today.
date.today()

datetime.date(2021, 4, 13)

In [540]:
# Columns shown in the result tables of the queries below.
interessanteCols = ["preis","anzahl_raeume","wohnflaeche","transaktionsArt","url","datumDownload","plz"]


In [541]:
# Rentals downloaded today with at least 4 rooms, a living area above 100
# and a price below 1500, excluding zip code 24568; cheapest first.
mask = (df["anzahl_raeume"].ge(4)
        & df["wohnflaeche"].gt(100)
        & df["transaktionsArt"].eq("mieten")
        & df["preis"].lt(1500)
        & df["datumDownload"].eq(pd.to_datetime('today').normalize())
        & df["plz"].ne("24568")
       )
df.loc[mask,interessanteCols].sort_values(by="preis")

Unnamed: 0_level_0,preis,anzahl_raeume,wohnflaeche,transaktionsArt,url,datumDownload,plz
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2yacz4x,958.0,5.0,106.5,mieten,https://www.immowelt.de/expose/2yacz4x,2021-04-13,22179
2xp7g46,1120.0,4.0,101.98,mieten,https://www.immowelt.de/expose/2xp7g46,2021-04-13,25451
2z3nm43,1200.0,4.0,118.0,mieten,https://www.immowelt.de/expose/2z3nm43,2021-04-13,22391
2y9jk4l,1365.73,4.0,117.25,mieten,https://www.immowelt.de/expose/2y9jk4l,2021-04-13,22417
2z3uq42,1368.45,4.0,120.89,mieten,https://www.immowelt.de/expose/2z3uq42,2021-04-13,22179
2zrg942,1390.0,4.0,109.3,mieten,https://www.immowelt.de/expose/2zrg942,2021-04-13,22846
2yjxk4z,1400.0,5.0,110.0,mieten,https://www.immowelt.de/expose/2yjxk4z,2021-04-13,22844
2yqwr4u,1430.0,4.0,102.08,mieten,https://www.immowelt.de/expose/2yqwr4u,2021-04-13,22359
2yk8k4s,1450.0,4.0,118.0,mieten,https://www.immowelt.de/expose/2yk8k4s,2021-04-13,22145
2xw794y,1490.0,4.0,134.67,mieten,https://www.immowelt.de/expose/2xw794y,2021-04-13,22844


In [487]:
# Purchase offers downloaded today with at least 1 room, a living area above
# 10 and a price below 500000, excluding the zip codes 24568 and 29493;
# sorted by price, then by number of rooms.
mask = (df["anzahl_raeume"].ge(1)
        & df["wohnflaeche"].gt(10)
        & df["transaktionsArt"].eq("kaufen")
        & df["preis"].lt(500000)
        & df["datumDownload"].eq(pd.to_datetime('today').normalize())
        & ~df["plz"].isin(["24568","29493"])
       )
subsel = df.loc[mask,interessanteCols].sort_values(by=["preis","anzahl_raeume"])
subsel

Unnamed: 0_level_0,preis,anzahl_raeume,wohnflaeche,transaktionsArt,url,datumDownload,plz
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2xxr64w,30000.0,4.0,101.0,kaufen,https://www.immowelt.de/expose/2xxr64w,2021-04-13,19357
2z6xq43,35000.0,4.0,157.0,kaufen,https://www.immowelt.de/expose/2z6xq43,2021-04-13,19370
2zrtq43,37000.0,4.0,112.0,kaufen,https://www.immowelt.de/expose/2zrtq43,2021-04-13,19372
2x7lj42,51500.0,6.0,153.0,kaufen,https://www.immowelt.de/expose/2x7lj42,2021-04-13,19309
2xm3m4p,64600.0,6.0,160.0,kaufen,https://www.immowelt.de/expose/2xm3m4p,2021-04-13,90841
...,...,...,...,...,...,...,...
2y5gm4l,499000.0,4.0,106.0,kaufen,https://www.immowelt.de/expose/2y5gm4l,2021-04-13,22145
2yhew4r,499000.0,5.0,118.0,kaufen,https://www.immowelt.de/expose/2yhew4r,2021-04-13,25462
2yrt54q,499000.0,8.0,260.0,kaufen,https://www.immowelt.de/expose/2yrt54q,2021-04-13,21436
2wc724r,499900.0,4.0,140.0,kaufen,https://www.immowelt.de/expose/2wc724r,2021-04-13,24558


In [489]:

# Open the 6th to 10th listing of the current selection in Firefox (macOS
# "open -a" launcher).
firefoxPath = "/Applications/Firefox.app/Contents/MacOS"  # not used by the code below
client = webbrowser.get("open -a /Applications/Firefox.app %s")
for url in subsel.url[5:10]:
    # the controller's open() returns False when the launcher command fails
    # (e.g. no macOS / no Firefox); fall back to the system default browser
    if not client.open(url):
        webbrowser.open(url)