In [82]:
import re
import numpy as np
import pandas as pd

In [233]:
df = pd.read_csv('../data/df_houses_full_raw.csv')

In [234]:
# See what wee got.
df.head()

Unnamed: 0.1,Unnamed: 0,house_id,city,street,montly_rent,deposit_value,house_rules,size,lat_long,scrapy_datetime
0,0,1719697,Berlin,Nürnberger Straße,EUR 850,EUR 1700,"['Private living room', 'Private Toilet', 'Pri...",Property: 22 m2,"['52.50027', '13.33596']",2021/05/13 - 12:16:03
1,1,1716752,Berlin,Schnellerstraße,EUR 450,EUR 900,"['Shared toilet', 'Shared kitchen', 'Unisex ba...",Property: 90 m2,"['52.45495', '13.51636']",2021/05/13 - 12:16:05
2,2,1716755,Berlin,Schnellerstraße,EUR 475,EUR 950,"['Shared toilet', 'Shared kitchen', 'Unisex ba...",Property: 90 m2,"['52.45495', '13.51636']",2021/05/13 - 12:16:07
3,3,1724139,Berlin,Otto-Braun-Straße,EUR 760,EUR 500,"['Private facilities', 'Private Toilet', 'Priv...",Property: 25 m2,"['52.52449', '13.41863']",2021/05/13 - 12:16:09
4,4,1721869,Berlin,Otto-Braun-Straße,EUR 760,EUR 500,"['Private facilities', 'Private Toilet', 'Priv...",Property: 25 m2,"['52.52449', '13.41863']",2021/05/13 - 12:16:11


In [235]:
df.shape

(1728, 10)

In [236]:
# Look for NaN and clean it.
df.isna().sum()

Unnamed: 0          0
house_id            0
city                0
street              2
montly_rent        13
deposit_value       2
house_rules         0
size                2
lat_long            2
scrapy_datetime     0
dtype: int64

In [237]:
df.dropna(inplace = True)

In [238]:
df.dtypes

Unnamed: 0          int64
house_id            int64
city               object
street             object
montly_rent        object
deposit_value      object
house_rules        object
size               object
lat_long           object
scrapy_datetime    object
dtype: object

In [239]:
df.shape

(1715, 10)

## Cleaning df_houses

In [252]:
# Drop "Unnamed: 0" column.
df.drop(columns = ['Unnamed: 0'], inplace=True)

# Clean "montly_rent"
df['montly_rent'] = df['montly_rent'].apply(lambda x: re.search('\d+', x).group(0))
df['montly_rent'] = df['montly_rent'].astype(int)

# Clean 'deposit_value'
df['deposit_value'] = df['deposit_value'].apply(lambda x: re.search('\d+', x).group(0))
df['deposit_value'] = df['deposit_value'].astype(int)

# Clean 'house_rules'
## we are looking for places that pets are allowed, so we'll filter the 'house_rules' column to find this places.
yes = 'Pets allowed'
maybe = 'Pets negotiable'
no = 'Pets not allowed'

df['pets'] = df['house_rules'].apply(lambda x: yes if yes in x else maybe if maybe in x else no if no in x else np.nan)

# Clean 'size'
## There are some 'Furnished' as size. I'll exclude them.
df = df[df['size'] != 'Furnished']
df['size'] = df['size'].apply(lambda x: x.split(' ')[1])

# Split lat/log
df['lat_long'] = df['lat_long'].apply(lambda x: re.findall('(\d+\.\d+)', x))
df['lat'] = df['lat_long'].apply(lambda x: x[0])
df['long'] = df['lat_long'].apply(lambda x: x[1])

# Clean columns
df.drop(columns = ['house_rules', 'lat_long'], inplace = True)

# Reorder columns
df = df[['house_id', 'scrapy_datetime',	'city',	'street', 'size', 'pets', 'montly_rent', 'deposit_value', 'lat', 'long']]

In [253]:
df.head()

Unnamed: 0,house_id,scrapy_datetime,city,street,size,pets,montly_rent,deposit_value,lat,long
0,1719697,2021/05/13 - 12:16:03,Berlin,Nürnberger Straße,22,Pets not allowed,850,1700,52.50027,13.33596
1,1716752,2021/05/13 - 12:16:05,Berlin,Schnellerstraße,90,Pets not allowed,450,900,52.45495,13.51636
2,1716755,2021/05/13 - 12:16:07,Berlin,Schnellerstraße,90,Pets not allowed,475,950,52.45495,13.51636
3,1724139,2021/05/13 - 12:16:09,Berlin,Otto-Braun-Straße,25,Pets not allowed,760,500,52.52449,13.41863
4,1721869,2021/05/13 - 12:16:11,Berlin,Otto-Braun-Straße,25,Pets not allowed,760,500,52.52449,13.41863


In [254]:
df.to_csv('../data/df_houses_full_cleanned.csv')

## Cleanning nearby_venues dataset

In [286]:
df_venues = pd.read_csv('../data/nearby_venues_full_raw.csv')

In [287]:
df_venues.head()

Unnamed: 0,house_id,house_latitude,hoouse_longitude,venue,venue_address,city,venue_category,venue_latitude,venue_longitude
0,1719697,52.50027,13.33596,Stadtgarten,Spichernstr. (Gilbachstr.),50672 Köln,Park,50.944557,6.936847
1,1719697,52.50027,13.33596,Hommage,Friesenstr. 75,50670 Köln,Café,50.940584,6.941621
2,1719697,52.50027,13.33596,Kompakt,Werderstr. 15-19,50672 Köln,Record Shop,50.943811,6.939438
3,1719697,52.50027,13.33596,Royal Punjab,Venloer Str. 4,50672 Köln,Indian Restaurant,50.941286,6.937419
4,1719697,52.50027,13.33596,Restaurant Acht,Spichernstr. 10,50672 Köln,French Restaurant,50.942707,6.936511


In [294]:
# city

# zip code
df_venues['city'] = df_venues['city'].apply(lambda x: x['city'].split(' '))

# Address

df_venues

TypeError: <lambda>() got an unexpected keyword argument 'axis'

In [299]:
df_venues.shape

(170300, 9)

In [298]:
df_venues['city'].unique()

array(['50672 Köln', '50670 Köln', '50667 Köln', 'Köln', 'Deutschland',
       '50627 Köln', '50674 Köln', '50823 Köln'], dtype=object)

In [263]:
df_venues.isna().sum()

house_id            0
house_latitude      0
hoouse_longitude    0
venue               0
venue_address       0
venue_ZIP_code      0
venue_category      0
venue_latitude      0
venue_longitude     0
dtype: int64

In [262]:
df_venues.shape

(170300, 9)

In [265]:
df_venues.columns

Index(['house_id', 'house_latitude', 'hoouse_longitude', 'venue',
       'venue_address', 'venue_ZIP_code', 'venue_category', 'venue_latitude',
       'venue_longitude'],
      dtype='object')