In [18]:
import uuid
import random
import math

import dask.dataframe as dd
from glob import glob
import numpy as np
import pandas as pd

from dask.diagnostics import ProgressBar

from helpers import *

from tqdm import tqdm_notebook

# Register a global dask progress bar; the "[####] | 100% Completed" lines in
# the cell outputs below come from it.
pbar = ProgressBar()
pbar.register()

# Generate fake events/features

## Configuration

In [2]:
# Used as the dask partition count and passed on to the event helper functions.
number_of_cores = 8
# Parquet file with the generated (fake) devices and their geolocation.
devices_fake_path = '../data/device_geolocations_fake/100_devices_with_geolocation.parquet'

# Real weather measurements (glob pattern over two directory levels).
weather_real_path = '../data/weather_real/*/*/*.parquet'

# Output directory for the generated events; note the trailing slash.
fake_features_dir_path = '../data/features_fake/'

## Faked devices

In [3]:
# Open the fake device parquet file(s) matched by the configured path as a dask DataFrame.
devices_raw = dd.read_parquet(glob(devices_fake_path))

In [4]:
# Preview the first rows to check the loaded columns (gateway_uuid, geo_lat, geo_lng, zip_code).
devices_raw.head()

[########################################] | 100% Completed |  1.8s


Unnamed: 0,gateway_uuid,geo_lat,geo_lng,zip_code
0,cbf4a64b-561e-4044-98c6-4aa732e8fcd3,53.69,10.1297,22397
1,d0d87dc1-c9cc-48d6-8c6e-22470e4f15db,49.6503,7.3895,55743
2,f14b552b-2e0d-44b5-b8f9-a4b7339d8e14,48.24,11.4805,85757
3,79ee737c-225c-40c0-8fdd-b40669cdcf8f,52.6788,9.0198,31608
4,4a976e33-6a3f-4b53-b780-788b5b2c296d,51.1687,7.0113,42697


## Real weather data

In [5]:
# Open every real weather parquet file matched by the glob pattern as one dask DataFrame.
weather_raw = dd.read_parquet(glob(weather_real_path))

In [6]:
# Keep only the columns needed for the temperature statistics and normalise
# their dtypes in a single pass.
locations = weather_raw[['zip_code', 'timestamp', 'temp']].astype(
    {'temp': float, 'timestamp': str, 'zip_code': str}
)

# The first zip-code digit is the grouping key for the regional statistics.
locations['zip_code_prefix'] = locations['zip_code'].str[:1]

# Timestamps are handled as strings; the first 13 characters are the
# "YYYY-MM-DDTHH" hour bucket, from which month, day and hour are sliced.
locations['timestamp_prefix'] = locations['timestamp'].str[:13]
locations['month'] = locations['timestamp'].str[5:7].astype(int)
locations['day'] = locations['timestamp'].str[8:10].astype(int)
locations['hour'] = locations['timestamp'].str[11:13].astype(int)

### Baseline temperature statistics for generating outside-temperature events

In [7]:
# Mean and standard deviation of the temperature per zip-code prefix and
# hour of the year, with the group keys turned back into regular columns.
locations_grouped = (
    locations
    .groupby(['zip_code_prefix', 'month', 'day', 'hour'])['temp']
    .aggregate(['mean', 'std'])
    .reset_index()
)

# Materialise the (small) aggregate as a pandas DataFrame.
temp_in_time = locations_grouped.compute()

[                                        ] | 0% Completed |  0.0s

  return pd.MultiIndex(levels=levels, labels=labels, names=idx.names)


[########################################] | 100% Completed | 36.0s


In [8]:
# Preview the aggregated statistics: one row per (zip_code_prefix, month, day, hour).
temp_in_time.head()

Unnamed: 0,zip_code_prefix,month,day,hour,mean,std
0,0,10,1,0,6.613445,2.008861
1,0,10,1,1,6.163866,2.064853
2,0,10,1,2,5.420168,2.248727
3,0,10,1,3,4.785714,2.002179
4,0,10,1,4,4.693277,1.830282


### Timeframe for generated events

In [9]:
# Earliest and latest hour bucket available per zip-code prefix.
weather_timeframes = (
    locations
    .groupby(['zip_code_prefix'])['timestamp_prefix']
    .aggregate(['min', 'max'])
    .reset_index()
    .compute()
)

[########################################] | 100% Completed | 21.0s


In [10]:
# Show the covered time range (min/max hour bucket) for each zip-code prefix.
weather_timeframes

Unnamed: 0,zip_code_prefix,min,max
0,0,2018-10-01T00,2018-11-30T23
1,1,2018-10-01T00,2018-11-30T23
2,2,2018-10-01T00,2018-11-30T23
3,3,2018-10-01T00,2018-11-30T23
4,4,2018-10-01T00,2018-11-30T23
5,5,2018-10-01T00,2018-11-30T23
6,6,2018-10-01T00,2018-11-30T23
7,7,2018-10-01T00,2018-11-30T23
8,8,2018-10-01T00,2018-11-30T23
9,9,2018-10-01T00,2018-11-30T23


### All days as a (month, day) DataFrame

In [11]:
# One row per calendar day present in the weather statistics.
timeframes = (
    temp_in_time
    .groupby(['month', 'day'])
    .first()
    .reset_index()
    [['month', 'day']]
)

## Prepare events

In [12]:
# Materialise the device table in pandas right away. It is small (the input
# file name suggests 100 devices), so there is nothing to gain from keeping
# it distributed.
devices = devices_raw[['gateway_uuid', 'zip_code']].compute()

# The first zip-code digit links each device to the regional temperature statistics.
devices['zip_code_prefix'] = devices['zip_code'].str[:1]

# One random per-device factor in [-1, 1]. Drawing it directly avoids the
# earlier round trip through a second dask DataFrame and a row-wise apply,
# and assigning a plain list does not rely on index alignment.
# NOTE(review): `random` is not seeded anywhere visible, so the factors differ
# between runs; call random.seed(...) first if reproducibility is required.
devices['factor'] = [random.uniform(-1, 1) for _ in range(len(devices))]

[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s


In [13]:
# Give both frames the same constant key so that merging on it yields the
# full device x day cross product.
devices = devices.assign(key=0)
timeframes = timeframes.assign(key=0)

In [14]:
# Cross join on the constant key: one row per (device, day). The helper key
# column is dropped again by the column selection.
events_base = devices.merge(timeframes, on='key')[
    ['gateway_uuid', 'zip_code', 'zip_code_prefix', 'factor', 'month', 'day']
]

In [15]:
# Preview the device x day combinations that the event generation iterates over.
events_base.head()

Unnamed: 0,gateway_uuid,zip_code,zip_code_prefix,factor,month,day
0,cbf4a64b-561e-4044-98c6-4aa732e8fcd3,22397,2,-0.404335,10,1
1,cbf4a64b-561e-4044-98c6-4aa732e8fcd3,22397,2,-0.404335,10,2
2,cbf4a64b-561e-4044-98c6-4aa732e8fcd3,22397,2,-0.404335,10,3
3,cbf4a64b-561e-4044-98c6-4aa732e8fcd3,22397,2,-0.404335,10,4
4,cbf4a64b-561e-4044-98c6-4aa732e8fcd3,22397,2,-0.404335,10,5


# Generate events (day by day)

## Remove old files

In [16]:
# Delete previously generated output so partitions from an earlier run do not mix with this one.
# NOTE(review): the path is hard-coded here; keep it in sync with fake_features_dir_path.
%rm -rf ../data/features_fake/*

## Processing (save day by day)

In [17]:
# Generate and persist the synthetic events one calendar day at a time, so
# that only a single day of events is held in memory.
# The progress-bar total is taken from the data instead of a hard-coded day count.
for _, timeframe in tqdm_notebook(timeframes.iterrows(), total=len(timeframes)):
    month = timeframe['month']
    day = timeframe['day']

    # Devices and hourly temperature statistics restricted to the current day.
    is_current_day = (events_base.month == month) & (events_base.day == day)
    day_events = events_base.loc[
        is_current_day,
        ['gateway_uuid', 'zip_code_prefix', 'factor', 'month', 'day'],
    ].reset_index(drop=True)
    day_temps = temp_in_time[
        (temp_in_time.month == month) & (temp_in_time.day == day)
    ].reset_index(drop=True)

    # NOTE(review): the literals 120 and 60 are passed straight to the helpers;
    # their meaning is defined in helpers.py -- confirm and name them in the config cell.

    # device.temperature.outside/value
    temp_outside_events = prepareOutsideTemperatureEvents(day_events, day_temps, month, day, 120, number_of_cores)

    # device.temperature.room/value
    temp_room_events = prepareRoomTemperatureEvents(day_events, month, day, 60, number_of_cores)

    # pd.concat replaces the deprecated DataFrame.append (removed in pandas 2.0);
    # the events are then put in chronological order.
    all_day_events = pd.concat([temp_outside_events, temp_room_events]).sort_values('timestamp')

    # Written as a parquet dataset partitioned by month and day under the output directory.
    all_day_events.to_parquet(fake_features_dir_path, index=True, partition_cols=['month', 'day'])

HBox(children=(IntProgress(value=0, max=61), HTML(value='')))

[########################################] | 100% Completed |  0.6s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  1.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.5s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  1.3s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.6s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  1.2s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.5s
[########################################] | 100% Completed |  0.1s
[########################################] | 100

[########################################] | 100% Completed |  0.6s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  1.3s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.6s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  1.3s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.5s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  1.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.6s
[########################################] | 100% Completed |  0.1s
[########################################] | 100

[########################################] | 100% Completed |  0.5s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  1.2s
[########################################] | 100% Completed |  0.1s

