# Geovisual Analytics for Shenzhen Taxi Trajectories

Here we'll explore taxi trip data collected from Shenzhen. 

The dataset is described in Cheng et al. (2019):
 
Cheng, B., Qian, S., Cao, J., Xue, G., Yu, J., Zhu, Y., ... & Zhang, T. (2019, April). STL: Online Detection of Taxi Trajectory Anomaly Based on Spatial-Temporal Laws. In International Conference on Database Systems for Advanced Applications (pp. 764-779). Springer, Cham.

https://link.springer.com/chapter/10.1007/978-3-030-18579-4_45




### Prepare the city data

### Set up and import libraries

In [None]:
from __future__ import division 
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
plt.style.use('ggplot')
import os
import glob
import datashader as ds
import datashader.transfer_functions as tf

import folium


Read the data files into a DataFrame.
This takes several seconds.
Here we use a single file as an example.

In [None]:
import requests
import shutil
import os
import zipfile

# Optional download step (disabled). Note that the archive must be requested
# through the repository's *raw* endpoint: a GitHub `blob` URL returns the HTML
# viewer page instead of the zip file.
#req = requests.get('https://github.com/cybergis/cybergis-jupyter-notebook-repo/raw/master/geospatial/taxi.zip', stream=True)
#with open('taxi.zip', 'wb') as file:
#    shutil.copyfileobj(req.raw, file)

# exist_ok=True keeps this cell safe to re-run and avoids the
# check-then-create race of `if not os.path.exists(...): os.mkdir(...)`.
os.makedirs('./data', exist_ok=True)

# Unpack the archive (expected in the working directory) into ./data.
with zipfile.ZipFile('taxi.zip', 'r') as archive:
    archive.extractall('./data')

# Show what was extracted.
os.listdir('./data/taxi')

In [None]:
# Root of the extracted data; only needed by the disabled multi-file load below.
path = r'./data/'
# To load every file instead of a single one:
#taxi_files = glob.glob(os.path.join(path, "*.txt"))
#df_master = pd.concat(pd.read_csv(f, names=column_names) for f in taxi_files)

filename = './data/taxi/TRK20090923.txt'
column_names = [
    'taxi_id',
    'date_time',
    'longitude',
    'latitude',
    'speed',
    'direction',
    'occupied',
    'other',
]

# The file has no header row, so the column names are supplied explicitly;
# date_time is then converted from text to a datetime column.
df_master = pd.read_csv(filename, names=column_names).assign(
    date_time=lambda frame: pd.to_datetime(frame.date_time)
)

### Example of the taxi trajectory data and the size of the dataset

Show the data information

In [None]:
# Number of rows loaded, followed by a preview of the first ten records.
print(df_master.shape[0])
df_master.head(10)

In [None]:
# Work on a copy so the notebook can be "restarted" from this cell without
# re-reading the file into df_master.
df = df_master.copy() # Allows you to 'restart' the worksheet without waiting to recreate dataframe

In [None]:
# Use numpy functions for the distance formula

def gps_dist(a, b, c, d):
    '''Compute the great-circle distance (in meters) between two GPS locations.

    Uses the haversine formula (https://en.wikipedia.org/wiki/Haversine_formula)
    with an Earth radius of 6371000 m. All arguments are in degrees and may be
    scalars or array-likes (NumPy arrays, pandas Series) of matching shape.

    Parameters
    ----------
    a, b : longitude and latitude of the first location.
    c, d : longitude and latitude of the second location.

    Returns
    -------
    Distance in meters, with the same shape as the broadcast inputs.
    '''
    # Exact degree -> radian conversion instead of a truncated constant
    # (0.0174533), which introduced a few centimetres of error per degree.
    lon1, lat1, lon2, lat2 = (np.radians(v) for v in (a, b, c, d))
    haversine = (np.sin((lat2 - lat1) / 2.0)**2
                 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0)**2)
    # Rounding can push the term marginally outside [0, 1] (e.g. near-antipodal
    # points), which would make sqrt/arcsin return NaN; clamp it first.
    haversine = np.clip(haversine, 0.0, 1.0)
    return 2 * 6371000 * np.arcsin(np.sqrt(haversine))

### Visualization of the whole taxi trajectory dataset

Visualize the dataset


In [None]:

# Keep every GPS fix within +/- 1 degree of (114.05, 22.5) in both longitude
# and latitude.
# NOTE: the original cell first assigned `maindt = df[df.occupied == 1]` and
# then immediately overwrote it, so that filter never took effect. The dead
# assignment is removed here; the plot still shows the whole dataset. To plot
# occupied taxis only, add `& (df.occupied == 1)` to the mask below.
near_city = ((df.longitude - 114.05).abs() <= 1) & ((df.latitude - 22.5).abs() <= 1)
maindt = df[near_city]

# Aggregate the points onto a 600x600 grid and shade the counts on a log scale.
cvs = ds.Canvas(plot_width=600, plot_height=600)
agg = cvs.points(maindt,'longitude','latitude')
img = tf.shade(agg, cmap=['lightblue','darkblue'],how='log')
img

### Visualization of the whole taxi trajectory dataset in Shenzhen

In [None]:
#Now zooming in on Shenzhen.

# Keep fixes within +/- 0.35 degrees of (114.05, 22.70).
in_window = ((df.longitude - 114.05).abs() <= 0.35) & ((df.latitude - 22.70).abs() <= 0.35)
# Take an explicit copy: a later cell adds an 'hour' column to `maindt`, and
# assigning to a slice of `df` would raise SettingWithCopyWarning.
maindt = df[in_window].copy()

# Bounding box of the retained points, used as the canvas extent.
xrange = maindt['longitude'].min(), maindt['longitude'].max()
yrange = maindt['latitude'].min(), maindt['latitude'].max()

print (xrange,yrange)

# Fixed bounding box, kept as an alternative to the data-driven extent above.
xxrange = (113.673267, 114.646188)
yyrange  = (22.365089, 22.864404)


#cvs = ds.Canvas(x_range=xxrange, y_range=yyrange,plot_width=600, plot_height=600)
cvs = ds.Canvas(x_range=xrange, y_range=yrange,plot_width=600, plot_height=600)
agg = cvs.points(maindt,'longitude','latitude')
img = tf.shade(agg, cmap=['lightblue','darkblue'],how='log')
img


In [None]:
import numpy as np


from bokeh.plotting import figure, output_notebook, show

# Route Bokeh output to the notebook. Without this call `show()` writes a
# standalone HTML file instead of rendering the plot inline below the cell.
output_notebook()

def histogram(x,colors=None):
    """Plot a 100-bin histogram of the values in ``x`` and print their range.

    Parameters
    ----------
    x : array-like
        Values to histogram (e.g. the per-pixel counts of an aggregate).
    colors : unused
        Accepted for call compatibility; it does not affect the plot.

    Notes
    -----
    The first bin is deliberately left out of the plot (``hist[1:]``),
    presumably because it is dominated by empty pixels and would dwarf the
    remaining bars.
    """
    hist,edges = np.histogram(x, bins=100)
    p = figure(y_axis_label="Pixels",
               tools='', height=130, outline_line_color=None,
               min_border=0, min_border_left=0, min_border_right=0,
               min_border_top=0, min_border_bottom=0)
    # Bars for bins 1..99: bin i spans edges[i]..edges[i+1].
    p.quad(top=hist[1:], bottom=0, left=edges[1:-1], right=edges[2:])
    print("min: {}, max: {}".format(np.min(x),np.max(x)))
    show(p)

In [None]:

# Distribution of the raw per-pixel values of the aggregate.
histogram(agg.values)

In [None]:
import datashader as ds
from datashader import transfer_functions as tf
from datashader.colors import Greys9
# Reversed grey ramp with the last two entries of the reversed list dropped.
Greys9_r = list(Greys9)[::-1][:-2]

# Distribution of the values after a log1p transform.
histogram(np.log1p(agg.values))

# Render the aggregate with logarithmic colour mapping.
tf.shade(agg, how='log', cmap=Greys9_r)

In [None]:
# NOTE(review): newer datashader releases appear to return a
# (values, discrete_levels) tuple from tf.eq_hist, while older ones return the
# equalized array directly. Unwrap defensively so the histogram receives an
# array in both cases; verify against the installed datashader version.
equalized = tf.eq_hist(agg.values)
if isinstance(equalized, tuple):
    equalized = equalized[0]
histogram(equalized)

#cmapOrange = ['darkred', 'red', 'orangered', 'darkorange', 'orange', 'gold', 'yellow', 'white']

# Render the aggregate with histogram-equalized colour mapping.
tf.shade(agg, cmap=Greys9_r, how='eq_hist')

In [None]:
import datashader as ds
from datashader.bokeh_ext import InteractiveImage
from functools import partial
from datashader.utils import export_image
from datashader.colors import colormap_select, Greys9, Hot, viridis, inferno
from IPython.core.display import HTML, display

# Background colour shared by the two helpers below.
background = "black"
# export_image with the export path and background pre-bound.
export = partial(export_image, export_path="export", background=background)
# colormap_select with `reverse` pre-bound; it is True here because the
# background is "black".
cm = partial(colormap_select, reverse=(background=="black"))

def create_image():
    """Render the points of ``maindt`` as a 600x600 image.

    The points are counted per pixel, shaded with the ``Hot`` colormap using
    histogram equalization, and then passed through ``tf.dynspread``.
    Reads the module-level ``maindt`` frame.
    """
    canvas = ds.Canvas(plot_width=600, plot_height=600)
    counts = canvas.points(maindt, 'longitude', 'latitude')
    shaded = tf.shade(counts, cmap=Hot, how='eq_hist')
    return tf.dynspread(shaded, threshold=0.5, max_px=4)


create_image()


In [None]:

#from functools import partial


def create_image90():
    """Render only the pixels of ``maindt`` above the 90th percentile of counts.

    Pixels whose count does not exceed the 90th percentile of the aggregate are
    masked out; the rest are shaded with the ``inferno`` colormap using
    histogram equalization and passed through ``tf.dynspread``.
    Reads the module-level ``maindt`` frame.
    """
    canvas = ds.Canvas(plot_width=600, plot_height=600)
    counts = canvas.points(maindt, 'longitude', 'latitude')
    cutoff = np.percentile(counts, 90)
    busiest = counts.where(counts > cutoff)
    shaded = tf.shade(busiest, cmap=inferno, how='eq_hist')
    return tf.dynspread(shaded, threshold=0.3, max_px=4)


create_image90()


In [None]:

# Extract the hour of day from the timestamp into a categorical 'hour' column.
# The categories are pinned to 0..23 so that a positional list of 24 colours
# lines up with the hours even when some hour has no records in the data.
hour_dtype = pd.CategoricalDtype(categories=list(range(24)))

# assign() builds a new frame rather than writing into a slice of `df`, which
# avoids SettingWithCopyWarning and keeps the cell safe to re-run.
maindt = maindt.assign(
    hour=pd.to_datetime(maindt['date_time']).dt.hour.astype(hour_dtype)
)

maindt.head(10)

In [None]:
# 24 colors, one per 'hour' category.
colors = [
    "#FF0000", "#FF3F00", "#FF7F00", "#FFBF00", "#FFFF00", "#BFFF00",
    "#7FFF00", "#3FFF00", "#00FF00", "#00FF3F", "#00FF7F", "#00FFBF",
    "#00FFFF", "#00BFFF", "#007FFF", "#003FFF", "#0000FF", "#3F00FF",
    "#7F00FF", "#BF00FF", "#FF00FF", "#FF00BF", "#FF007F", "#FF003F",
]


def colorized_images():
    """Render the points of ``maindt`` coloured by their 'hour' category.

    Counts points per pixel and per 'hour' category, shades the result with
    ``colors`` as the colour key, and passes it through ``tf.dynspread``.
    Reads the module-level ``maindt`` and ``colors``.
    """
    canvas = ds.Canvas(plot_width=600, plot_height=600)
    per_hour = canvas.points(maindt, 'longitude', 'latitude', ds.count_cat('hour'))
    shaded = tf.shade(per_hour, color_key=colors)
    return tf.dynspread(shaded, threshold=0.3, max_px=4)


colorized_images()