# Pipe and Filter

In [134]:
import pandas as pd
import json
import numpy as np

In [135]:
file_names = {'skopje_data.json': ('Skopje', 'Скопје', 1000),
              'berovo_data.json': ('Berovo', 'Берово', 2330),
              'bitola_data.json': ('Bitola', 'Битола', 7000),
              'gevgelija_data.json': ('Gevgelija', 'Гевгелија', 1480),
              'kavadarci_data.json': ('Kavadarci', 'Кавадарци', 1430),
              'kumanovo_data.json': ('Kumanovo', 'Куманово', 1300),
              'ohrid_data.json': ('Ohrid', 'Охрид', 6000),
              'prilep_data.json': ('Prilep', 'Прилеп', 7500),
              'strumica_data.json': ('Strumica', 'Струмица', 2400),
              'tetovo_data.json': ('Tetovo', 'Тетово', 1200),
              'struga_data.json': ('Struga', 'Струга', 6330)
              }

In [136]:
main_dataset = None
for k, v in file_names.items():
    city_en, city_mk, post_code = v
    data = json.load(open(k, encoding="utf8"))
    df = pd.json_normalize(data['elements'])

    df['tags.addr:city'] = city_en
    df['tags.addr:city:en'] = city_mk
    df.rename(columns={'tags.addr:city': 'tags.addr:city:mk'})

    df['tags.addr:postcode'] = post_code
    df['tags.addr:postcode'].astype('int64')

    if k == 'skopje_data.json':
        main_dataset = df
    else:
        main_dataset = pd.concat([main_dataset, df], axis=0, ignore_index=True)

In [137]:
class Filter:
    def execute(self, df):
        pass


class Pipe:
    def __init__(self) -> None:
        super().__init__()
        self.filters = list()

    def add_filter(self, filter):
        self.filters.append(filter)

    def run_filters(self, df):
        for filter in self.filters:
            df = filter.execute(df)
        return df.reset_index()

In [138]:
class SelectAttributesFilter(Filter):
    def execute(self, df):
        return df[['lat', 'lon', 'tags.name', 'tags.tourism', 'tags.website', 'tags.stars',
                 'tags.email', 'tags.phone', 'tags.addr:city', 'tags.addr:city:en',
                 'tags.addr:postcode', 'tags.addr:street:en', 'tags.addr:street:mk', 'tags.addr:housenumber',
                 'tags.smoking', 'tags.rooms', 'tags.internet_access', 'tags.internet_access:fee']]

In [139]:
class NonNullAttributeFilter(Filter):
    def execute(self, df):
        return df[~df['tags.name'].isna() & ~df['lat'].isna() & ~df['lon'].isna()]

In [140]:
class TourismTagsAttributeFilter(Filter):
    def execute(self, df):
        return df[df['tags.tourism'].isin(['apartments', 'hotel', 'apartment', 'hostel', 'guest_house']) == True]

In [141]:
class FormatPhoneAttributeFilter(Filter):
    def execute(self, df):
        df['tags.phone'] = df['tags.phone'].str.replace(' ', '')
        return df

In [142]:
class ReplaceNanStarAttributeFilter(Filter):
    def execute(self, df):
        random_stars = np.random.randint(low=1, high=6, size=len(df['tags.stars'].isna()))
        df['tags.stars'] = np.where(df['tags.stars'].isna(), random_stars, df['tags.stars'])
        df['tags.stars'] = df['tags.stars'].astype('int64')
        return df

In [143]:
class ReplaceNanSmokeAttributeFilter(Filter):
    def execute(self, df):
        num_missing = len(df['tags.smoking'].isna())
        random_vector_smoking = np.random.choice(['outside', 'separated', 'isolated'], size=num_missing, p=[0.5, 0.3, 0.2])
        bool_array = np.where(df['tags.smoking'].isna(), random_vector_smoking, df['tags.smoking'])
        df['tags.smoking'] = bool_array
        return df

In [144]:
class ReplaceNanInternetAccessAttributeFilter(Filter):
    def execute(self, df):
        df['tags.internet_access'] = df['tags.internet_access'].replace('wlan', 'yes')
        num_missing = len(df['tags.internet_access'].isna())
        random_vector_internet = np.random.choice(['yes', 'no'], size=num_missing, p=[0.8, 0.2])
        df['tags.internet_access'] = np.where(df['tags.internet_access'].isna(), random_vector_internet, df['tags.internet_access'])
        return df

In [145]:
class ReplaceNanInternetFeesAttributeFilter(Filter):
    def execute(self, df):
        num_missing = len(df['tags.internet_access:fee'].isna())
        random_vector_internet = np.random.choice(['yes', 'no'], size=num_missing, p=[0.2, 0.8])
        df['tags.internet_access:fee'] = np.where(df['tags.internet_access:fee'].isna(), random_vector_internet, df['tags.internet_access:fee'])
        return df

In [146]:
pipe = Pipe()
pipe.add_filter(SelectAttributesFilter())
pipe.add_filter(NonNullAttributeFilter())
pipe.add_filter(TourismTagsAttributeFilter())
pipe.add_filter(FormatPhoneAttributeFilter())
pipe.add_filter(ReplaceNanStarAttributeFilter())
pipe.add_filter(ReplaceNanSmokeAttributeFilter())
pipe.add_filter(ReplaceNanInternetAccessAttributeFilter())
pipe.add_filter(ReplaceNanInternetFeesAttributeFilter())
dataset = pipe.run_filters(main_dataset)

In [147]:
dataset

Unnamed: 0,index,lat,lon,tags.name,tags.tourism,tags.website,tags.stars,tags.email,tags.phone,tags.addr:city,tags.addr:city:en,tags.addr:postcode,tags.addr:street:en,tags.addr:street:mk,tags.addr:housenumber,tags.smoking,tags.rooms,tags.internet_access,tags.internet_access:fee
0,2,41.976658,21.408790,Хотел Водно,hotel,http://hotelimakedonija.com.mk/mk/Hotel/57/Hot...,3,,,Skopje,Скопје,1000,,,,outside,,yes,no
1,4,41.976167,21.435802,Hotel Imperial,hotel,,3,,,Skopje,Скопје,1000,,,,isolated,,yes,no
2,5,41.976232,21.436215,Sun Hotel,hotel,,3,,,Skopje,Скопје,1000,,,,outside,,yes,no
3,6,41.989939,21.454931,hotel porta,hotel,http://www.hotelporta.com.mk/,5,,,Skopje,Скопје,1000,,,,outside,,yes,no
4,7,41.989948,21.419851,Хотел 7,hotel,,5,,,Skopje,Скопје,1000,,,,outside,,no,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,1786,41.174682,20.672125,Marko apartments,apartment,,3,,,Struga,Струга,6330,,,,outside,,yes,no
223,1787,41.178900,20.678190,jovanoski apartment,apartment,,4,,,Struga,Струга,6330,,,,separated,,yes,no
224,1788,41.174188,20.671223,EXCLUSIVE Apartments Struga,apartment,,2,,,Struga,Струга,6330,,,,outside,,yes,no
225,1791,41.177585,20.675587,Freya,hotel,,4,,,Struga,Струга,6330,,,,isolated,,no,no


In [148]:
dataset.to_csv('clean_dataset.csv', encoding='utf8')