In [19]:
import numpy as np
import pandas as pd
from datetime import datetime
from collections import defaultdict

import plotly
from plotly.graph_objs import Scatter, Layout

# Template for the per-day input files; the placeholder is filled with a
# zero-padded two-digit day number via str.format.
filename_syntax = "data/201408{:02d}_train.txt"

# Enable plotly's offline rendering inside the notebook.
plotly.offline.init_notebook_mode(connected=True)


In [3]:
# filename = filename_syntax.format(3)
# df = pd.read_csv(filename, header=None, names=["TaxiID", "Latitude", "Longtitude", "Carriage", "Time"], parse_dates=[4], infer_datetime_format=True)
# df.sort_values("Time", inplace=True)        

# df["Hour"] = df.Time.apply(lambda x: x.hour)

# g = df.groupby(df.Hour)

# drivers_per_hour = {}
# for hour, group in g:
#     drivers_per_hour[hour] = group.TaxiID.unique()

In [21]:
# Build drivers_per_hour = {day number: {hour: array of unique TaxiIDs}} by
# reading one file per day number in 3..30.
#
# Day numbers in SKIPPED_DAYS are not processed. The reason is not recorded
# in this notebook -- TODO confirm (presumably missing or unusable files).
SKIPPED_DAYS = (7, 13, 17)

drivers_per_hour = defaultdict(dict)
for i in range(3, 31):
    if i in SKIPPED_DAYS:
        continue

    filename = filename_syntax.format(i)
    print("(0/6) Process:", filename)
    print("(1/6) Reading file", "at", datetime.now())
    # NOTE(review): infer_datetime_format is deprecated in pandas >= 2.0;
    # it is kept so that older pandas versions still parse dates quickly.
    df = pd.read_csv(
        filename,
        header=None,
        names=["TaxiID", "Latitude", "Longtitude", "Carriage", "Time"],
        parse_dates=[4],
        infer_datetime_format=True,
    )

    print("(2/6) Read. Sorting", "at", datetime.now())
    # Sorting by time fixes the order in which .unique() reports the IDs
    # (order of first appearance).
    df.sort_values("Time", inplace=True)

    print("(3/6) Sorted. Adding Hour column", "at", datetime.now())
    # Vectorised datetime accessor instead of a per-row Python lambda; the
    # lambda version was by far the slowest step in the recorded run log.
    df["Hour"] = df["Time"].dt.hour

    print("(4/6) Added. Grouping by hour", "at", datetime.now())
    # Only the TaxiID column is needed per group.
    g = df.groupby("Hour")["TaxiID"]

    print("(5/6) Grouped. Getting driver ID per hour", "at", datetime.now())
    for hour, taxi_ids in g:
        drivers_per_hour[i][hour] = taxi_ids.unique()

    print("(6/6) Finished.", "at", datetime.now())

    # The groupby object holds a reference to the frame, so both names must
    # be dropped for the memory to be released before the next file is read.
    del df, g

{0/6} Process: data/20140803_train.txt
(1/6) Reading file at 2017-08-27 21:19:15.028492
{2/6} Readed. Sorting at 2017-08-27 21:21:37.152108
(3/6) Sorted. Adding Hour column at 2017-08-27 21:22:09.914885
(4/6) Added. Grouping by hour at 2017-08-27 21:25:55.905825
(5/6) Grouped. Getting driver ID per hour at 2017-08-27 21:25:58.891981
(6/6) Finished. at 2017-08-27 21:26:07.124860
{0/6} Process: data/20140804_train.txt
(1/6) Reading file at 2017-08-27 21:26:07.125351
{2/6} Readed. Sorting at 2017-08-27 21:28:00.411033
(3/6) Sorted. Adding Hour column at 2017-08-27 21:28:25.642061
(4/6) Added. Grouping by hour at 2017-08-27 21:31:41.295201
(5/6) Grouped. Getting driver ID per hour at 2017-08-27 21:31:44.662362
(6/6) Finished. at 2017-08-27 21:31:53.115312
{0/6} Process: data/20140805_train.txt
(1/6) Reading file at 2017-08-27 21:31:53.115938
{2/6} Readed. Sorting at 2017-08-27 21:33:48.477666
(3/6) Sorted. Adding Hour column at 2017-08-27 21:34:13.239704
(4/6) Added. Grouping by hour at 20

(4/6) Added. Grouping by hour at 2017-08-27 23:31:59.711027
(5/6) Grouped. Getting driver ID per hour at 2017-08-27 23:32:01.612069
(6/6) Finished. at 2017-08-27 23:32:06.207118
{0/6} Process: data/20140828_train.txt
(1/6) Reading file at 2017-08-27 23:32:06.207416
{2/6} Readed. Sorting at 2017-08-27 23:33:01.113612
(3/6) Sorted. Adding Hour column at 2017-08-27 23:33:13.609671
(4/6) Added. Grouping by hour at 2017-08-27 23:35:10.972449
(5/6) Grouped. Getting driver ID per hour at 2017-08-27 23:35:12.894578
(6/6) Finished. at 2017-08-27 23:35:17.485404
{0/6} Process: data/20140829_train.txt
(1/6) Reading file at 2017-08-27 23:35:17.485793
{2/6} Readed. Sorting at 2017-08-27 23:36:10.232631
(3/6) Sorted. Adding Hour column at 2017-08-27 23:36:22.762448
(4/6) Added. Grouping by hour at 2017-08-27 23:38:14.752172
(5/6) Grouped. Getting driver ID per hour at 2017-08-27 23:38:16.814103
(6/6) Finished. at 2017-08-27 23:38:21.712223
{0/6} Process: data/20140830_train.txt
(1/6) Reading file at

In [52]:
import json
# Reduce drivers_per_hour ({day: {hour: collection of TaxiIDs}}) to
# {str(day): {str(hour): number of TaxiIDs}}. Keys are converted to strings
# so the result is directly JSON-serialisable.
a = {
    str(day): {str(hour): len(taxi_ids) for hour, taxi_ids in per_hour.items()}
    for day, per_hour in drivers_per_hour.items()
}

# Use a context manager so the file is flushed and closed before any later
# cell reads it back.
with open("drivers_per_hour.json", "w") as json_file:
    json.dump(a, json_file)

In [27]:
# One marker trace per entry of drivers_per_hour: x is the sorted hours,
# y is the number of IDs stored for each of those hours. Every trace starts
# hidden.
data = [
    Scatter(
        x=hours,
        y=[len(per_hour[hour]) for hour in hours],
        mode='markers',
        name='date ' + str(day),
        visible=False,
    )
    for day, per_hour in drivers_per_hour.items()
    # Single-element loop: binds the sorted hour keys once per day.
    for hours in [sorted(per_hour)]
]
# NOTE(review): index 1 is the second trace, not the first -- confirm intended.
data[1]['visible'] = True

In [53]:

# plotly.offline.iplot({
#     "data": [Scatter(
#                 x = sorted(drivers_per_hour[3].keys()),
#                 y = [len(drivers_per_hour[3][i]) for i in sorted(drivers_per_hour[3].keys())],
#                 mode = 'markers'
#                 )
#             ],
#     "layout": Layout(title="Drivers per Hour")
# })

# Hard-coded two-level table of integer counts with string keys at both
# levels (outer keys "3".."23", inner keys "6".."23").
# NOTE(review): the layout matches what the JSON-export cell writes
# ({str(day): {str(hour): number of unique TaxiIDs}}), so this looks like a
# pasted snapshot of drivers_per_hour.json -- confirm.
# Outer keys "7" and "13" map to empty dicts, there is no "17" key, and the
# table stops at "23" -- presumably captured from an earlier or partial run;
# verify before relying on it.
# Not referenced by any other cell visible in this notebook.
drivers_ph = {
    "3": {
        "6": 12951,
        "7": 13115,
        "8": 13176,
        "9": 13215,
        "10": 13193,
        "11": 13194,
        "12": 13210,
        "13": 13176,
        "14": 13160,
        "15": 13133,
        "16": 13124,
        "17": 13138,
        "18": 13126,
        "19": 13105,
        "20": 13065,
        "21": 13070,
        "22": 13048,
        "23": 13033
    },
    "4": {
        "6": 12950,
        "7": 13167,
        "8": 13222,
        "9": 13253,
        "10": 13249,
        "11": 13267,
        "12": 13239,
        "13": 13230,
        "14": 13184,
        "15": 13161,
        "16": 13179,
        "17": 13157,
        "18": 13135,
        "19": 13133,
        "20": 13118,
        "21": 13118,
        "22": 13097,
        "23": 13071
    },
    "5": {
        "6": 12985,
        "7": 13182,
        "8": 13214,
        "9": 13241,
        "10": 13225,
        "11": 13218,
        "12": 13231,
        "13": 13284,
        "14": 13250,
        "15": 13260,
        "16": 13344,
        "17": 13360,
        "18": 13341,
        "19": 13339,
        "20": 13315,
        "21": 13272,
        "22": 13252,
        "23": 13244
    },
    "6": {
        "6": 12803,
        "7": 12964,
        "8": 13017,
        "9": 13013,
        "10": 13017,
        "11": 13007,
        "12": 12995,
        "13": 12966,
        "14": 12951,
        "15": 12942,
        "16": 12931,
        "17": 12911,
        "18": 12911,
        "19": 12925,
        "20": 12902,
        "21": 12894,
        "22": 12878,
        "23": 12854
    },
    "7": {},
    "8": {
        "6": 11642,
        "7": 11842,
        "8": 11931,
        "9": 12467,
        "10": 13095,
        "11": 13089,
        "12": 13084,
        "13": 13087,
        "14": 13079,
        "15": 13070,
        "16": 13074,
        "17": 13071,
        "18": 13050,
        "19": 13026,
        "20": 13006,
        "21": 12996,
        "22": 12980,
        "23": 12950
    },
    "9": {
        "6": 12767,
        "7": 12954,
        "8": 12994,
        "9": 13061,
        "10": 13082,
        "11": 13078,
        "12": 13077,
        "13": 13081,
        "14": 13091,
        "15": 13064,
        "16": 13039,
        "17": 13028,
        "18": 13037,
        "19": 13024,
        "20": 13002,
        "21": 12965,
        "22": 12942,
        "23": 12931
    },
    "10": {
        "6": 12766,
        "7": 12933,
        "8": 13052,
        "9": 13127,
        "10": 13142,
        "11": 13113,
        "12": 13092,
        "13": 13094,
        "14": 13081,
        "15": 13060,
        "16": 13046,
        "17": 13036,
        "18": 13030,
        "19": 13025,
        "20": 13000,
        "21": 12979,
        "22": 12954,
        "23": 12930
    },
    "11": {
        "6": 12781,
        "7": 13036,
        "8": 13104,
        "9": 13165,
        "10": 13160,
        "11": 13150,
        "12": 13141,
        "13": 13175,
        "14": 13360,
        "15": 13580,
        "16": 13556,
        "17": 13548,
        "18": 13520,
        "19": 13521,
        "20": 13491,
        "21": 13476,
        "22": 13469,
        "23": 13446
    },
    "12": {
        "6": 13332,
        "7": 13572,
        "8": 13655,
        "9": 13660,
        "10": 13651,
        "11": 13667,
        "12": 13653,
        "13": 13641,
        "14": 13635,
        "15": 13629,
        "16": 13589,
        "17": 13556,
        "18": 13543,
        "19": 13532,
        "20": 13502,
        "21": 13489,
        "22": 13488,
        "23": 13465
    },
    "13": {},
    "14": {
        "6": 13310,
        "7": 13557,
        "8": 13603,
        "9": 13647,
        "10": 13642,
        "11": 13632,
        "12": 13607,
        "13": 13583,
        "14": 13569,
        "15": 13558,
        "16": 13546,
        "17": 13541,
        "18": 13536,
        "19": 13522,
        "20": 13507,
        "21": 13484,
        "22": 13483,
        "23": 13451
    },
    "15": {
        "6": 13282,
        "7": 13553,
        "8": 13602,
        "9": 13628,
        "10": 13628,
        "11": 13625,
        "12": 13603,
        "13": 13584,
        "14": 13571,
        "15": 13555,
        "16": 13551,
        "17": 13546,
        "18": 13535,
        "19": 13525,
        "20": 13523,
        "21": 13511,
        "22": 13492,
        "23": 13466
    },
    "16": {
        "6": 13292,
        "7": 13456,
        "8": 13529,
        "9": 13579,
        "10": 13587,
        "11": 13586,
        "12": 13582,
        "13": 13580,
        "14": 13559,
        "15": 13546,
        "16": 13542,
        "17": 13557,
        "18": 13559,
        "19": 13539,
        "20": 13524,
        "21": 13509,
        "22": 13487,
        "23": 13481
    },
    "18": {
        "6": 13204,
        "7": 13424,
        "8": 13512,
        "9": 13568,
        "10": 13592,
        "11": 13577,
        "12": 13566,
        "13": 13571,
        "14": 13566,
        "15": 13543,
        "16": 13535,
        "17": 13523,
        "18": 13511,
        "19": 13510,
        "20": 13500,
        "21": 13482,
        "22": 13465,
        "23": 13439
    },
    "19": {
        "6": 13282,
        "7": 13536,
        "8": 13635,
        "9": 13676,
        "10": 13686,
        "11": 13705,
        "12": 13685,
        "13": 13656,
        "14": 13644,
        "15": 13635,
        "16": 13626,
        "17": 13617,
        "18": 13596,
        "19": 13583,
        "20": 13558,
        "21": 13548,
        "22": 13526,
        "23": 13486
    },
    "20": {
        "6": 13301,
        "7": 13523,
        "8": 13607,
        "9": 13663,
        "10": 13680,
        "11": 13710,
        "12": 13685,
        "13": 13682,
        "14": 13699,
        "15": 13686,
        "16": 13673,
        "17": 13663,
        "18": 13659,
        "19": 13633,
        "20": 13630,
        "21": 13617,
        "22": 13587,
        "23": 13552
    },
    "21": {
        "6": 13371,
        "7": 13617,
        "8": 13694,
        "9": 13726,
        "10": 13745,
        "11": 13727,
        "12": 13722,
        "13": 13715,
        "14": 13692,
        "15": 13689,
        "16": 13675,
        "17": 13679,
        "18": 13667,
        "19": 13645,
        "20": 13635,
        "21": 13614,
        "22": 13615,
        "23": 13559
    },
    "22": {
        "6": 13348,
        "7": 13612,
        "8": 13696,
        "9": 13726,
        "10": 13721,
        "11": 13706,
        "12": 13690,
        "13": 13676,
        "14": 13666,
        "15": 13677,
        "16": 13658,
        "17": 13650,
        "18": 13643,
        "19": 13631,
        "20": 13608,
        "21": 13594,
        "22": 13568,
        "23": 13550
    },
    "23": {
        "6": 13305,
        "7": 13490,
        "8": 13564,
        "9": 13630,
        "10": 13642,
        "11": 13637,
        "12": 13608,
        "13": 13620,
        "14": 13601,
        "15": 13587,
        "16": 13576,
        "17": 13567,
        "18": 13578,
        "19": 13562,
        "20": 13554,
        "21": 13541,
        "22": 13526,
        "23": 13508
    }
}

In [54]:
# Load the JSON file of counts into a DataFrame.
# NOTE(review): with pandas' default orientation the outer JSON keys become
# columns and the inner keys become the index -- confirm this is the layout
# wanted downstream.
d_df = pd.read_json(path_or_buf="drivers_per_hour.json")

In [56]:
# Write the counts table to a CSV file in the working directory.
d_df.to_csv(path_or_buf="a.csv")