# In [13]:
import pandas as pd
import math

# Columns projected out of the OSM amenities CSV (passed as ``usecols`` to
# ``pd.read_csv`` when the city data is loaded).
CITY_DATA_PROJECTED_COLUMNS = [
    'amenity',
    'name',
    'landuse',
    'latitude',
    'longitude',
    'leisure',
    'highway',
    'natural',
    'public_transport',
    'railway',
    'shop',
    'tourism',
]

# Search radius in km for the "count within radius" features; it is compared
# against distances returned by getDistanceFromLatLonInKm.
RADIUS = 1.5

def getDistanceFromLatLonInKm(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """Return the great-circle distance in km between two coordinates.

    Applies the haversine formula on a sphere of radius 6371 km.

    Args:
        lat1: Latitude of the first point, in decimal degrees.
        lon1: Longitude of the first point, in decimal degrees.
        lat2: Latitude of the second point, in decimal degrees.
        lon2: Longitude of the second point, in decimal degrees.

    Returns:
        The distance in kilometres. NaN inputs yield a NaN result.
    """
    R = 6371  # Radius of the earth in km
    sinHalfLat = math.sin(math.radians(lat2 - lat1) / 2)
    sinHalfLon = math.sin(math.radians(lon2 - lon1) / 2)
    a = (
        sinHalfLat * sinHalfLat
        + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * sinHalfLon * sinHalfLon
    )
    # Mathematically a <= 1, but rounding can push it a hair above 1 for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    # A plain comparison (rather than min()) keeps NaN propagating unchanged.
    if a > 1.0:
        a = 1.0
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c  # Distance in km

def computeNearest(row, column, df) -> float:
    """Return the distance in km from ``row`` to the nearest flagged row of ``df``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing 'latitude'
            and 'longitude' in decimal degrees.
        column: Name of a boolean column of ``df``; only rows where it is
            True are considered.
        df: DataFrame with 'latitude', 'longitude' and ``column``.

    Returns:
        The smallest distance found, or ``math.inf`` when no row of ``df`` is
        flagged. Distances that are NaN never compare smaller and are
        therefore ignored.
    """
    # The origin does not change while scanning, so look it up only once.
    originLat = row['latitude']
    originLon = row['longitude']
    candidates = df[df[column]]

    nearest = math.inf
    for poiLat, poiLon in zip(candidates['latitude'], candidates['longitude']):
        distance = getDistanceFromLatLonInKm(originLat, originLon, poiLat, poiLon)
        # Explicit comparison (not min() over a generator) so NaN is skipped.
        if distance < nearest:
            nearest = distance
    return nearest

def computeCountWithinRadius(row, column, df, radius) -> int:
    """Count the flagged rows of ``df`` lying within ``radius`` km of ``row``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing 'latitude'
            and 'longitude' in decimal degrees.
        column: Name of a boolean column of ``df``; only rows where it is
            True are considered.
        df: DataFrame with 'latitude', 'longitude' and ``column``.
        radius: Inclusive distance threshold in km.

    Returns:
        The number of flagged rows whose distance to ``row`` is <= ``radius``
        (0 when no row is flagged).
    """
    # The origin does not change while scanning, so look it up only once.
    originLat = row['latitude']
    originLon = row['longitude']
    candidates = df[df[column]]

    return sum(
        1
        for poiLat, poiLon in zip(candidates['latitude'], candidates['longitude'])
        if getDistanceFromLatLonInKm(originLat, originLon, poiLat, poiLon) <= radius
    )

# Rental listings, restricted to the province 'Alberta'.
rentDf = pd.read_csv('rentfaster.csv')
# .copy() makes the filtered frame independent of the original, so the feature
# columns assigned to it further down do not raise SettingWithCopyWarning.
rentDf = rentDf[rentDf['province'] == 'Alberta'].copy()

# OSM points of interest; only the columns listed in
# CITY_DATA_PROJECTED_COLUMNS are read.
cityDf = pd.read_csv('osm_helper_data_Alberta/Alberta_Canada/Alberta_Canada_all_amenities.csv', usecols=CITY_DATA_PROJECTED_COLUMNS, low_memory=False)

# Derive one boolean flag column per point-of-interest category from the OSM tags.
cityDf['isSchool'] = cityDf['amenity'].isin(['university', 'school', 'college'])
cityDf['isRestaurant'] = cityDf['amenity'].isin(['restaurant', 'cafe', 'bar', 'cafe;fast_food', 'fast_food'])
cityDf['isCemetery'] = (cityDf['amenity'] == 'grave_yard') | (cityDf['landuse'] == 'cemetery')
cityDf['isPolice'] = cityDf['amenity'] == 'police'
cityDf['isHealthCare'] = cityDf['amenity'].isin(['dentist', 'clinic', 'doctors', 'pharmacy', 'hospital'])
cityDf['isCommunityCentre'] = cityDf['amenity'].isin(['social_centre', 'social_facility', 'community_centre'])
cityDf['isPlaceOfWorship'] = cityDf['amenity'] == 'place_of_worship'
cityDf['isLeisure'] = cityDf['leisure'].notna()
# Any railway tag counts as public transport. Test it explicitly with notna()
# instead of OR-ing the raw tag column, which relied on implicit coercion of
# strings/NaN to booleans.
cityDf['isPublicTransport'] = (
    cityDf['public_transport'].notna()
    | cityDf['railway'].notna()
    | cityDf['highway'].isin(['bus_stop', 'bus_stop;street_lamp', 'platform'])
)
cityDf['isShop'] = cityDf['shop'].notna()
cityDf['isTourism'] = cityDf['tourism'].notna()

flagColumns = ['isSchool', 'isRestaurant', 'isCemetery', 'isPolice', 'isHealthCare', 'isCommunityCentre', 'isPlaceOfWorship', 'isLeisure', 'isPublicTransport', 'isShop', 'isTourism']

# Rows that agree on position, name and every category flag are duplicates.
cityDf.drop_duplicates(subset=['longitude', 'latitude', 'name'] + flagColumns, inplace=True)

# filter out rows that do not fall into any categories
cityDf = cityDf[cityDf[flagColumns].any(axis=1)]
# The raw tag columns are no longer needed once the flags exist.
cityDf = cityDf.drop(columns=['amenity', 'name', 'leisure', 'highway', 'natural', 'public_transport', 'railway', 'shop', 'tourism'])

# Derive additional features
# Output column -> category flag column, in the order the columns are written.
nearestFeatures = {
    'nearestSchool': 'isSchool',
    'nearestPolice': 'isPolice',
}
countFeatures = {
    'restaurantCount': 'isRestaurant',
    'cemeteryCount': 'isCemetery',
    'healthCareCount': 'isHealthCare',
    'communityCentreCount': 'isCommunityCentre',
    'placeOfWorshipCount': 'isPlaceOfWorship',
    'leisureCount': 'isLeisure',
    'shopCount': 'isShop',
    'tourismCount': 'isTourism',
}

# Select each category's rows once, up front. Passing the full cityDf would
# make the helpers re-mask the whole frame for every single rental row.
for featureName, flagColumn in nearestFeatures.items():
    categoryDf = cityDf[cityDf[flagColumn]]
    # Distance in km to the closest point of interest of this category.
    rentDf[featureName] = rentDf.apply(computeNearest, axis=1, args=(flagColumn, categoryDf))

for featureName, flagColumn in countFeatures.items():
    categoryDf = cityDf[cityDf[flagColumn]]
    # Number of points of interest of this category within RADIUS km.
    rentDf[featureName] = rentDf.apply(computeCountWithinRadius, axis=1, args=(flagColumn, categoryDf, RADIUS))

rentDf.to_csv("datasets/dataset_joined.csv", index=False)