# Cleaning and Aggregation

In [18]:
import dask.dataframe as dd
import geopandas as gpd
import h3
from shapely import wkt

In [19]:
# Folder holding the raw Waze export; the intermediate geodata.csv is written here too.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
data_path = r'C:\datas\noah2\OneDrive\Desktop\GSE 580\Final Model\Github Corrections\waze-romania\data'

# Output folder used by the export cells further down. It was never assigned in
# this notebook, so a fresh "Restart & Run All" stopped with a NameError.
# NOTE(review): defaulting to data_path is an assumption -- confirm the intended folder.
user_path = data_path

# read data into Dask
# blocksize=None: the gzip file is loaded without being split into blocks.
ddf = dd.read_csv(
    data_path + "/RO_9clujnapoca_alerts.000000000000.csv.gz",
    compression="gzip",
    blocksize=None,
    dtype={
        'confidence': 'int8',
        'type': 'category',
        'subtype': 'category',
        'roadType': 'float32', # to accept NA values
        'reliability': 'int8',
        'magvar': 'int16',
        'street': 'object'
    },
)

# Parse timestamps as UTC, then express them in Romanian local time so that
# every calendar feature below is a local-time feature.
ddf["ts"] = dd.to_datetime(ddf["ts"], utc=True).dt.tz_convert('Europe/Bucharest')

# Note: after changing timezone to bucharest, we have 2 observations taking place on jan 1 2022
ddf["date"] = ddf["ts"].dt.date
ddf["dayofweek"] = ddf["ts"].dt.dayofweek # monday = 0, sunday = 6
ddf['year'] = ddf['ts'].dt.year
ddf['month'] = ddf['ts'].dt.month
ddf['hour'] = ddf['ts'].dt.hour
ddf['quarter'] = ddf['ts'].dt.quarter

# Daypart code per local hour (original note: Nielsen Audio dayparting times):
#   1 = 06-09, 2 = 10-15, 3 = 16-19, 4 = 20-23, 5 = 00-05
# Each boolean mask is multiplied by its code; the hour ranges do not overlap,
# so exactly one term is non-zero for every hour 0-23.
ddf['timeofday'] = (
    ((ddf.hour > 5) & (ddf.hour < 10)) * 1
    + ((ddf.hour > 9) & (ddf.hour < 16)) * 2
    + ((ddf.hour > 15) & (ddf.hour < 20)) * 3
    + ((ddf.hour > 19) & (ddf.hour <= 23)) * 4
    + ((ddf.hour >= 0) & (ddf.hour < 6)) * 5
)

# computations with geopandas and pandas
df = ddf.compute()

# Snapshot of the frame with time features only: it is written before the
# lon/lat columns are added. to_csv returns None when given a path, so the
# result is not bound to a name.
df.to_csv(data_path + "/geodata.csv", index = False)

# Parse the WKT geometry and pull the coordinates out as plain numeric columns;
# the temporary 'coordinates' geometry column is dropped again afterwards.
df['coordinates'] = df['geoWKT'].apply(wkt.loads)
gdf = gpd.GeoDataFrame(df, geometry='coordinates')
df['lon'] = gdf.geometry.x
df['lat'] = gdf.geometry.y
df = df.drop(['coordinates'], axis=1)

# adding in rush hour times sent by our Romanian colleagues
def rush(series):
    """Return the rush-hour label for a single hour of the day.

    Despite its name, `series` is one scalar hour value, not a pandas Series.

    Returns
    -------
    str
        "Morning Rush" for hours 7 through 9 (inclusive),
        "Afternoon Rush" for hours 16 through 19 (inclusive),
        "No Rush" for anything else.
    """
    if 7 <= series <= 9:
        return "Morning Rush"
    if 16 <= series <= 19:
        return "Afternoon Rush"
    return "No Rush"
# Label every alert with its rush-hour period, based on the local hour.
df['rush'] = df['hour'].apply(rush)


# mapping obs to h3 hexagons: one cell-id column (h6 ... h10) per resolution.
# The coordinate columns are iterated directly instead of using
# df.apply(..., axis=1), which builds a full row Series for every alert and
# repeats that for each of the five resolutions.
# NOTE(review): geo_to_h3 is the h3-py v3 name (v4 renamed it latlng_to_cell) --
# confirm the pinned h3 version before upgrading.
for i in [6,7,8,9,10]:
    df["h" + str(i)] = [
        h3.geo_to_h3(lat, lon, i) for lat, lon in zip(df["lat"], df["lon"])
    ]


## Exporting full cleaned data

In [22]:
# Write the fully cleaned alert table to disk, without the index column.
fullclean_file = user_path + "/fullclean.csv"
df.to_csv(fullclean_file, index=False)


In [26]:
# Show the cleaned frame, including the derived lon/lat, rush and h6-h10 columns.
df

Unnamed: 0,city,confidence,nThumbsUp,street,uuid,country,type,subtype,roadType,reliability,...,quarter,timeofday,lon,lat,rush,h6,h7,h8,h9,h10
0,Cluj-Napoca,0,,,e8cf23ee-d233-48f7-b261-004693f2e4c7,RO,WEATHERHAZARD,,1.0,5,...,1,4,23.625034,46.771926,No Rush,861e0b38fffffff,871e0b38cffffff,881e0b38c7fffff,891e0b38c6bffff,8a1e0b38c69ffff
1,Cluj-Napoca,0,,,0d6d9120-2d54-440f-befc-64bf82f029d5,RO,ACCIDENT,,1.0,5,...,2,2,23.595964,46.797543,No Rush,861e0b387ffffff,871e0b3aaffffff,881e0b3aa5fffff,891e0b3aa5bffff,8a1e0b3aa587fff
2,Cluj-Napoca,0,,,eea4b83b-4f52-4c06-bf45-d6fabdcf16a5,RO,JAM,,4.0,5,...,4,3,23.619240,46.779326,Afternoon Rush,861e0b38fffffff,871e0b38cffffff,881e0b38c5fffff,891e0b38c47ffff,8a1e0b38c467fff
3,Cluj-Napoca,0,,,ee62bf17-20cd-4d50-8dc1-625e854d8907,RO,JAM,,20.0,5,...,2,1,23.602817,46.772847,Morning Rush,861e0b38fffffff,871e0b38effffff,881e0b38e9fffff,891e0b38e9bffff,8a1e0b38e98ffff
4,Cluj-Napoca,0,,,fcbe7eb4-dd4f-4474-8bf8-a7480b41fd43,RO,WEATHERHAZARD,,1.0,5,...,1,2,23.549190,46.757992,No Rush,861e0b387ffffff,871e0b382ffffff,881e0b39c9fffff,891e0b38267ffff,8a1e0b38264ffff
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
441451,Cluj-Napoca,0,,Str. Cardinal Iuliu Hossu,73073bf7-3cc0-4857-af77-a400283d1a00,RO,WEATHERHAZARD,HAZARD_ON_ROAD_TRAFFIC_LIGHT_FAULT,2.0,5,...,3,2,23.584121,46.771376,No Rush,861e0b387ffffff,871e0b383ffffff,881e0b383bfffff,891e0b383a3ffff,8a1e0b383a07fff
441452,Cluj-Napoca,0,,Str. Frederic Joliot Curie,2d4fa59e-84cd-4f97-97fa-3d3f52c08266,RO,WEATHERHAZARD,HAZARD_ON_ROAD_TRAFFIC_LIGHT_FAULT,2.0,5,...,4,3,23.593026,46.772026,Afternoon Rush,861e0b387ffffff,871e0b38effffff,881e0b3817fffff,891e0b38ed3ffff,8a1e0b38ed27fff
441453,Cluj-Napoca,0,,Str. Ion Ionescu de la Brad,ebb0a97f-591e-47a3-af61-06fe28125fa1,RO,WEATHERHAZARD,HAZARD_ON_ROAD_TRAFFIC_LIGHT_FAULT,2.0,5,...,4,3,23.654435,46.788682,Afternoon Rush,861e0b38fffffff,871e0b38dffffff,881e0b38d9fffff,891e0b38d8bffff,8a1e0b38d8a7fff
441454,Florești,0,,Centura Grigorescu - Florești,e34304f5-750d-4ccc-b302-6efe964fec62,RO,WEATHERHAZARD,HAZARD_ON_ROAD_TRAFFIC_LIGHT_FAULT,2.0,5,...,4,3,23.528942,46.764580,Afternoon Rush,861e0b387ffffff,871e0b382ffffff,881e0b3825fffff,891e0b38253ffff,8a1e0b382507fff


## Aggregation

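Here we count alerts per H3 resolution-10 hexagon (`h10`), grouped by date, day of the week, hour of the day, and rush-hour period. Some tables are additionally split by alert type or by day of the week. Each table is exported to its own CSV file.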

In [27]:


def _export_alert_counts(frame, keys, filename, count_name="AlertCount", drop_empty=False):
    """Count alerts per combination of `keys`, write the table to CSV and return it.

    Parameters
    ----------
    frame : DataFrame
        Must contain a `uuid` column plus every column named in `keys`.
    keys : list of str
        Columns to group by.
    filename : str
        CSV file name, created inside `user_path`.
    count_name : str
        Name given to the count column in the output.
    drop_empty : bool
        If True, rows whose count is 0 are removed before writing.

    Returns
    -------
    DataFrame
        One row per group: the key columns plus the count column.
    """
    counts = (
        frame.groupby(keys, as_index=False)["uuid"]
        .count()
        .rename(columns={"uuid": count_name})
    )
    if drop_empty:
        counts = counts[counts[count_name] > 0]
    counts.to_csv(user_path + "/" + filename, index=False)
    return counts


# by date
# NOTE(review): this table alone names its count column 'AlertCounts' (plural);
# kept as-is so existing readers of datehex.csv do not break.
datehex = _export_alert_counts(df, ["date", "h10"], "datehex.csv", count_name="AlertCounts")

# by day of the week
# NOTE(review): unlike hourhextype below, zero-count rows are not dropped here.
# `type` is read as a categorical column, so depending on the pandas version the
# grouping may emit unobserved combinations -- confirm whether that is intended.
weekdayhextype = _export_alert_counts(df, ["dayofweek", "h10", "type"], "weekdayhextype.csv")
weekdayhex = _export_alert_counts(df, ["dayofweek", "h10"], "weekdayhex.csv")

# by hour of the day
hourhextype = _export_alert_counts(
    df, ["hour", "h10", "type"], "hourhextype.csv", drop_empty=True
)
hourhex = _export_alert_counts(df, ["hour", "h10"], "hourhex.csv")

# by rush hour designation
rushhex = _export_alert_counts(df, ["rush", "h10"], "rushhex.csv")
rushhexweekday = _export_alert_counts(df, ["rush", "h10", "dayofweek"], "rushhexweekday.csv")