# Hangzhou Data


## Observations
The following sections capture some of the observations on the data.

### Taxi Plate
In Hangzhou, or more accurately for all licenses issued in Zhejiang, the first character of the license plate is 浙 (Zhè). In the data, this sometimes appears as ¿ and sometimes as zhe in the input data files.  Specifically for Hangzhou, the second character of the license plate is A.

## Setup

In [1]:
# These packages are here solely to support the use of the IPython Notebook.
%matplotlib inline
%pylab inline

from IPython.display import HTML, display  # Allows rendering data as HTML, for example DataFrame tables.

import pandas as pd
pd.set_option('display.mpl_style', 'default')
figsize(15, 5)
print('Using pandas version', pd.__version__)

import os
from datetime import datetime

mpl_style had been deprecated and will be removed in a future version.
Use `matplotlib.pyplot.style.use` instead.

  exec(code_obj, self.user_global_ns, self.user_ns)



Populating the interactive namespace from numpy and matplotlib
Using pandas version 0.18.1


## Reading in the Data

With the UNIX time format, it is much faster to read in the data directly and then perform conversions on the time column.  You'll note in the cells below that we do this incrementally over a few cells.

In [2]:
from entity.loader.taxi.taxi_common import sample_df, human_size
from entity.loader.taxi.hangzhou import Hangzhou

start_time = datetime.now()

taxi_file = '/home/dingbat/data/taxi/hangzhou/2011/12/2011-12-01_09/01'
reader = Hangzhou()
df = reader.resource_to_dataframe(taxi_file, workers=10)

print('{} to read in {} data'.format(
    datetime.now() - start_time,
    human_size(os.path.getsize(taxi_file))
))
sample_df(df)

Organization Hangzhou created
Composing /home/dingbat/data/taxi/hangzhou/2011/12/2011-12-01_09/01/result_5M.txt for DataFrame
Composing /home/dingbat/data/taxi/hangzhou/2011/12/2011-12-01_09/01/result_7M.txt for DataFrame
Composing /home/dingbat/data/taxi/hangzhou/2011/12/2011-12-01_09/01/result_6M.txt for DataFrame
Composing /home/dingbat/data/taxi/hangzhou/2011/12/2011-12-01_09/01/result_8M.txt for DataFrame
Composing /home/dingbat/data/taxi/hangzhou/2011/12/2011-12-01_09/01/result_2M.txt for DataFrame
Composing /home/dingbat/data/taxi/hangzhou/2011/12/2011-12-01_09/01/result_3M.txt for DataFrame
Composing /home/dingbat/data/taxi/hangzhou/2011/12/2011-12-01_09/01/result_1M.txt for DataFrame
Composing /home/dingbat/data/taxi/hangzhou/2011/12/2011-12-01_09/01/result_4M.txt for DataFrame
Composing /home/dingbat/data/taxi/hangzhou/2011/12/2011-12-01_09/01/result_9M.txt for DataFrame
Composing /home/dingbat/data/taxi/hangzhou/2011/12/2011-12-01_09/01/result_10M.txt for DataFrame
DataFrame

Unnamed: 0_level_0,Unnamed: 1_level_0,latitude,longitude,speed,heading,passenger,state
common_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
zheAT1776,2011-12-01 04:06:47+08:00,30.334283,120.1739,0.0,270,0,0
zheAT6720,2011-12-01 08:47:26+08:00,30.33099,120.10116,0.0,0,1,0
zheAT9629,2011-12-01 17:07:40+08:00,30.331408,120.14047,14.82,170,1,0
zheATA060,2011-12-01 23:00:57+08:00,30.275856,120.09038,48.15,340,0,0
zheATA233,2011-12-01 11:44:01+08:00,30.322386,120.168434,0.0,170,1,0


In [3]:
from entity.loader.taxi.taxi_common import (
    remove_impossible,
    remove_implausible,
    remove_safe_dups,
)
start_time = datetime.now()
df = remove_safe_dups(df)  # Remove rows where all data is the same
df = remove_impossible(df)  # Remove rows with data that is impossible
df = df[~df.index.duplicated()]  # Removes all duplicates based on the index (taxi ID + timestamp)
df.sort_index(inplace=True)
print(datetime.now() - start_time)

0:00:43.123839


In [4]:
df.iloc[:15]  # Use iloc instead of sample to help illustrate sort (next cell)

Unnamed: 0_level_0,Unnamed: 1_level_0,latitude,longitude,speed,heading,passenger,state
common_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
zhe305680,2011-12-01 06:41:52+08:00,30.335026,120.16463,18.52,310,0,0
zhe305680,2011-12-01 06:44:16+08:00,30.337242,120.15698,32.41,210,0,0
zhe305680,2011-12-01 06:44:36+08:00,30.335821,120.155945,29.82,210,0,0
zhe305680,2011-12-01 06:44:57+08:00,30.334871,120.15504,24.26,280,0,0
zhe305680,2011-12-01 06:46:39+08:00,30.334784,120.144104,35.93,270,0,0
zhe305680,2011-12-01 06:47:00+08:00,30.33477,120.141556,42.97,270,0,0
zhe305680,2011-12-01 06:47:20+08:00,30.334757,120.14071,1.67,310,0,0
zhe305680,2011-12-01 06:47:41+08:00,30.334816,120.140495,3.7,310,0,0
zhe305680,2011-12-01 06:48:01+08:00,30.334799,120.14049,1.85,280,0,0
zhe305680,2011-12-01 06:48:22+08:00,30.334269,120.14026,32.41,170,0,0


In [5]:
# Time range of the data can be pulled from the timestamp index.
df.index.levels[1].min(), df.index.levels[1].max()

(Timestamp('1999-11-30 08:42:48+0800', tz='Asia/Shanghai'),
 Timestamp('2011-12-01 23:59:59+0800', tz='Asia/Shanghai'))

In [6]:
# Provides each plate and will printout total number of taxis
df.index.levels[0]

Index(['zhe305680', 'zhe305691', 'zheAT0000', 'zheAT0001', 'zheAT0002',
       'zheAT0004', 'zheAT0005', 'zheAT0007', 'zheAT0008', 'zheAT0010',
       ...
       'zheATD183', 'zheATD186', 'zheATD188', 'zheATD189', 'zheATD198',
       'zheATD200', 'zhe¿LED', 'zhe¿¿', 'zhe¿¿1', 'zhe¿¿4'],
      dtype='object', name='common_id', length=7997)

In [7]:
# Provides useful information such as:
#   the data types of each column,
#   number of rows in the index,
#   memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 9249070 entries, (zhe305680, 2011-12-01 06:41:52+08:00) to (zhe¿¿4, 2011-12-01 23:01:54+08:00)
Data columns (total 6 columns):
latitude     float64
longitude    float64
speed        float64
heading      int64
passenger    int64
state        int64
dtypes: float64(3), int64(3)
memory usage: 494.0+ MB


In [8]:
# Provides information for each column such as:
#   the number of samples
#   Statistics such as mean, std dev, min, max
df.describe()

Unnamed: 0,latitude,longitude,speed,heading,passenger,state
count,9249070.0,9249070.0,9249070.0,9249070.0,9249070.0,9249070.0
mean,30.27924,120.1632,20.56499,167.5673,0.5725062,0.05999144
std,0.04155185,0.05808465,24.28859,110.1296,0.4947149,0.237471
min,29.90508,119.7016,0.0,0.0,0.0,0.0
25%,30.25083,120.1314,0.0,70.0,0.0,0.0
50%,30.28245,120.1618,9.0,170.0,1.0,0.0
75%,30.31024,120.1824,37.04,260.0,1.0,0.0
max,30.53508,120.6045,240.76,990.0,1.0,1.0


## Convert to Postgres

In [None]:
for c in df.columns:
    print(df.index[:5], df[c][:5])

## Create a LineString
Apply gps filters and create a LineString.  The LineString is used in the database to support geospatial analysis.

In [None]:
from entity.loader.taxi.taxi_common import create_linestring

taxi_partitions = df.groupby(level='common_id', sort=False)
for common_id, taxi_df in taxi_partitions:
    taxi_df.index = taxi_df.index.droplevel(0)
    ls = create_linestring(taxi_df)
    break
ls.json

# Speed Time vs. DB Time
Here we load dataframes for a few days and output the data in a per taxi form to get a better feel for the Speed Time and DB Time over a few days of data.

In [None]:
import pandas as pd
from entity.loader.taxi.hangzhou import Hangzhou
from entity.loader.taxi.taxi_common import (
    sample_df, human_size,
    remove_impossible, remove_implausible, remove_safe_dups,
)


reader = Hangzhou()


def load_day(dir_name):
    df = reader.resource_to_dataframe(dir_name)
    df = remove_safe_dups(df)  # Remove rows where all data is the same
    df = remove_impossible(df)  # Remove rows with data that is impossible
    df = df[~df.index.duplicated()]  # Removes all duplicates based on the index (taxi ID + timestamp)
    df.sort_index(inplace=True)
    return df


days = [
    load_day('/home/dingbat/data/taxi/hangzhou/shamal_processed/2011/12/1'),
#     load_day('/home/dingbat/data/taxi/hangzhou/shamal_processed/2011/12/2'),
#     load_day('/home/dingbat/data/taxi/hangzhou/shamal_processed/2011/12/3'),
]

days_df = pd.concat(days)
days_df = reader.clean(days_df)
sample_df(days_df)

In [None]:
days_df.index.levels[1].min(), days_df.index.levels[1].max()

In [None]:
days_df.index.levels[0]