In [38]:
import uuid
import random
import math

import dask.dataframe as dd
from glob import glob
import numpy as np
import pandas as pd

from dask.diagnostics import ProgressBar

pbar = ProgressBar()
pbar.register()

# Generate fake devices with geolocation

## Configuration

In [26]:
number_of_devices = 100000
number_of_cores = 8

# real data
weather_real_path = '../data/weather_real/*/*/*.parquet'

# save to
fake_devices_with_geolocation_path = '../data/device_geolocations_fake/{}_devices_with_geolocation.parquet'.format(str(number_of_devices))

In [27]:
area_matrix = [ # lat, lng of the center, radius in km, probability
    [48.24, 11.48, 35, 0.07],
    [51.17, 7.01, 40, 0.24],
    [50.31, 8.86, 50, 0.11],
    [49.65, 7.39, 15, 0.06],
    [48.54, 8.73, 10, 0.01],
    [49.46, 11.23, 20, 0.06],
    [52.73, 13.36, 30, 0.1],
    [52.27, 11.68, 20, 0.07],
    [52.68, 9.02, 50, 0.21],
    [53.69, 10.13, 25, 0.05],
    [51.18, 13.69, 30, 0.02]]

## Generate device ids and geolocations

In [28]:
location_index = np.random.choice(
    [x for x in range(len(area_matrix))],
    number_of_devices,
    p=[loc[3] for loc in area_matrix])

In [29]:
# 1 degree = ~111km
devices = pd.DataFrame({
    'gateway_uuid': [str(uuid.uuid4()) for x in range(number_of_devices)],
    'geo_lat': [round(area_matrix[x][0] + random.uniform(-1 * area_matrix[x][3]/111, area_matrix[x][3]/111), 4) for x in location_index],
    'geo_lng': [round(area_matrix[x][1] + random.uniform(-1 * area_matrix[x][3]/111, area_matrix[x][3]/111), 4) for x in location_index]
})

In [30]:
devices.head()

Unnamed: 0,gateway_uuid,geo_lat,geo_lng
0,9a1d0ea4-087c-4fee-b475-7e72f83a012b,49.4603,11.23
1,60cb92d6-9ea5-4a1b-af94-87b2540ea94a,52.6786,9.0191
2,1e08b7cf-2342-4ecb-9807-3661d7a7f90d,52.6782,9.019
3,04f8fea6-d988-4121-a84e-11729392a173,51.1704,7.0093
4,00ee4943-7a49-4974-9ad9-609bfec47660,52.7294,13.3601


## Find real zip codes for geolocations (based on weather data)

In [31]:
weather_raw = dd.read_parquet(glob(weather_real_path))

In [32]:
locations = weather_raw[['zip_code', 'obs_geo_lat', 'obs_geo_lng']]
locations['zip_code'] = locations['zip_code'].astype(str)
locations['obs_geo_lat'] = locations['obs_geo_lat'].astype(float)
locations['obs_geo_lng'] = locations['obs_geo_lng'].astype(float)
locations = locations.groupby(['zip_code']).first()
locations = locations.reset_index()
locations = locations.compute()

[########################################] | 100% Completed |  3.5s


In [33]:
def nearest_zipcode(y):
    lat = float(y.geo_lat)
    lng = float(y.geo_lng)
    def distance(x):
        lat1 = float(lat)
        lon1 = float(lng)
        lat2 = float(x.obs_geo_lat)
        lon2 = float(x.obs_geo_lng)
        p = math.pi / 180
        a = (
            0.5 - math.cos((lat2 - lat1) * p) / 2 +
            math.cos(lat1 * p) * math.cos(lat2 * p) * (1 - math.cos((lon2 - lon1) * p)) / 2)

        return round(2 * 6371 * 1000 * math.asin(math.sqrt(a))) # distance in meters

    locations_around = (locations
        .where(locations.obs_geo_lat > lat - 0.5)
        .where(locations.obs_geo_lat < lat + 0.5)
        .where(locations.obs_geo_lng > lng - 0.5)
        .where(locations.obs_geo_lng < lng + 0.5)
        .dropna()
        .reset_index())
    
    locations_around['distance'] = locations_around.apply(distance, axis=1)
    return locations_around.loc[locations_around['distance'].idxmin()]['zip_code']

In [34]:
ddata = dd.from_pandas(devices, npartitions=number_of_cores)

In [35]:
devices['zip_code'] = ddata.apply(lambda x: nearest_zipcode(x), meta=('x', str), axis=1).compute()

[########################################] | 100% Completed | 40min 47.4s


In [36]:
devices.head()

Unnamed: 0,gateway_uuid,geo_lat,geo_lng,zip_code
0,9a1d0ea4-087c-4fee-b475-7e72f83a012b,49.4603,11.23,90571
1,60cb92d6-9ea5-4a1b-af94-87b2540ea94a,52.6786,9.0191,31608
2,1e08b7cf-2342-4ecb-9807-3661d7a7f90d,52.6782,9.019,31608
3,04f8fea6-d988-4121-a84e-11729392a173,51.1704,7.0093,42697
4,00ee4943-7a49-4974-9ad9-609bfec47660,52.7294,13.3601,16562


## Save to parquet file

In [37]:
devices.to_parquet(fake_devices_with_geolocation_path)