# Data Exploration and Analysis

In [3]:
## Load all days into a single dataframe
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from globes import taxi_dir, days_dir
from multiprocessing import Pool, Process, cpu_count

## Columns kept for analysis: get_df selects exactly these from each day's CSV.
## 'pickup_day' and 'pickup_hour' are not read from the CSV; get_df derives them from 'pickup_datetime'.
FEATURE_COLS = ['pickup_day', 'pickup_hour', 'pickup_zone_taxi', 'dropoff_zone_taxi', 'pickup_borough', 'dropoff_borough']

def get_df(filename):
    """Read one day's CSV and return only the feature columns.

    Args:
        filename: path to a CSV that contains 'pickup_datetime',
            'dropoff_datetime' and every FEATURE_COLS column except the two
            derived here ('pickup_day', 'pickup_hour').

    Returns:
        pandas.DataFrame: the FEATURE_COLS columns, restricted to the rows
        that had no missing value in any column of the CSV. Zone ids are
        returned as strings; 'pickup_day' is the weekday number (Monday == 0)
        and 'pickup_hour' the hour of 'pickup_datetime'.
    """
    df = pd.read_csv(filename, parse_dates=['pickup_datetime', 'dropoff_datetime'])

    # Note: this drops a row when ANY column is missing, not only the
    # feature columns.
    df = df.dropna()
    df['pickup_zone_taxi'] = df['pickup_zone_taxi'].apply(str)
    df['dropoff_zone_taxi'] = df['dropoff_zone_taxi'].apply(str)

    # Vectorised datetime access instead of a Python lambda per row.
    # to_datetime leaves an already-parsed column untouched and guarantees
    # the .dt accessor is usable even when the frame has no rows.
    pickup_times = pd.to_datetime(df['pickup_datetime'])
    df["pickup_day"] = pickup_times.dt.weekday
    df["pickup_hour"] = pickup_times.dt.hour

    # No fillna is needed here: dropna() above removed every missing value
    # and none of the derived columns can introduce one.
    return df[FEATURE_COLS]

def getData():
    """Load every per-day CSV into a single DataFrame of feature columns.

    Lists the files ending in 'csv' under ``os.path.join(taxi_dir, days_dir)``,
    parses them in parallel with ``get_df`` and concatenates the results.
    Prints the worker count and the shape of the combined frame.

    Returns:
        pandas.DataFrame: the per-file frames stacked in sorted filename
        order. Each file's own row index is kept (no re-indexing), so index
        values repeat across files.

    Raises:
        ValueError: if the directory contains no CSV file.
    """
    # Floor division keeps the worker count an int on Python 3, and max()
    # covers single-core machines where half the cores rounds down to 0;
    # Pool refuses a process count below 1.
    num_cores = max(1, cpu_count() // 2)
    print("using " + str(num_cores) + " cores")

    data_dir = os.path.join(taxi_dir, days_dir)
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible between runs.
    filenames = sorted(
        os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('csv')
    )
    if not filenames:
        raise ValueError("no csv files found in " + data_dir)

    # get dataframes in parallel; always release the workers, even when a
    # file fails to parse.
    pool = Pool(processes=num_cores)
    try:
        df_arr = pool.map(get_df, filenames)
    finally:
        pool.terminate()
        pool.join()

    # concatenate dataframe array into single df
    df = pd.concat(df_arr)

    print(df.shape)
    return df

# Load all per-day CSVs once; every later cell reads this module-level frame.
# The saved output of this cell reports 28 worker processes and a (79023785, 6) result.
df = getData()

using 28 cores
(79023785, 6)


In [4]:
## GET ZONE => NAME and ZONE => BOROUGH json 
import json

from globes import zoneIdToName, zoneIdToBorough

# Lookup tables provided by the project's `globes` module.
# boroughDestCounts indexes zoneToBorough with integer pickup-zone ids;
# zoneToName is not referenced by the cells shown here.
zoneToName = zoneIdToName()
zoneToBorough = zoneIdToBorough()

## Destination proportions

In [13]:
def getDestCounts(df):
    """Map each pickup zone to a table of where its rides were dropped off.

    Args:
        df: DataFrame with 'pickup_zone_taxi' and 'dropoff_zone_taxi' columns.

    Returns:
        dict: keyed by ``int(float(pickup_zone))``. Each value is a DataFrame
        indexed by 'dropoff_zone_taxi' with two columns: 'count' (rides to
        that dropoff zone) and 'proportion' (count divided by the pickup
        zone's total), sorted by count in descending order. An empty input
        frame yields an empty dict.
    """
    dest_dfs = {}
    for pickup_zone, rides in df.groupby('pickup_zone_taxi'):
        count_df = rides.groupby('dropoff_zone_taxi').size().to_frame('count')
        count_df = count_df.sort_values(by="count", ascending=False)

        # Vectorised division instead of a Python call per row; float()
        # forces true division under Python 2 as well.
        total_dropoffs = float(count_df["count"].sum())
        count_df["proportion"] = count_df["count"] / total_dropoffs

        # Zone labels may arrive as float-formatted strings such as '10.0',
        # so go through float before int.
        dest_dfs[int(float(pickup_zone))] = count_df

    return dest_dfs

def boroughDestCounts(df):
    """Average, per borough, the share of each pickup zone's top dropoff zone.

    For every pickup zone returned by ``getDestCounts(df)`` the first
    'proportion' value (the most frequent dropoff zone's share) is filed
    under the zone's borough, looked up in the module-level ``zoneToBorough``.

    Returns:
        dict: borough -> mean of those top proportions over the borough's
        pickup zones.
    """
    top_shares = {}
    for zone_id, zone_table in getDestCounts(df).items():
        borough_name = zoneToBorough[zone_id]
        # Tables are sorted by count descending, so row 0 is the top dropoff.
        leading_share = zone_table["proportion"].iloc[0]
        top_shares.setdefault(borough_name, []).append(leading_share)

    return {
        borough_name: sum(shares) / float(len(shares))
        for borough_name, shares in top_shares.items()
    }

# Per-borough average of each pickup zone's largest single-destination share.
boroughAvgs = boroughDestCounts(df)
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(boroughAvgs)
        

0.935843172199
0.454545454545
0.266349583829
0.0967696224751
0.769230769231
0.563025210084
0.257862653867
0.132596685083
0.324093816631
0.0954404066459
0.256505576208
0.05582247557
0.0970236514658
0.302057549505
0.359773371105
0.295128939828
0.143188348078
0.135478270305
0.427380952381
0.111320754717
0.285443037975
0.239391513211
0.384615384615
0.118056506849
0.101413405752
0.310658054098
0.404761904762
0.161218660743
0.162183969876
0.1875
0.0883060635226
0.167452176324
0.070586633912
0.0931274900398
0.207023471912
0.141669952382
0.147490589711
0.357320099256
0.345441067457
0.107256698358
0.138489760505
0.197784008742
0.0779778393352
0.619047619048
0.0499407905265
0.211267605634
0.128478598672
0.0620149448254
0.105166421738
0.075466388913
0.305403556772
0.120474077978
0.226180257511
0.107465989716
0.462211454337
0.175226315318
0.328947368421
0.12015503876
0.116411941151
0.196933462236
0.145723197293
0.193321616872
0.437974683544
0.0835747817446
0.0650508851292
0.202265372168
0.06139722

## What's up with Baisley Park?

In [14]:
def BaisleyPark():
    """Print the dropoff-zone breakdown for pickups in Baisley Park.

    Baisley Park's taxi zone id is 10. Reads the module-level ``df`` and
    prints the count/proportion table that ``getDestCounts`` builds for that
    pickup zone.

    Raises:
        KeyError: if no ride in ``df`` was picked up in zone 10.
    """
    ID = 10
    dest_counts = getDestCounts(df)
    # Look the zone up through ID instead of repeating the literal, so the
    # constant is the single place the zone is chosen.
    print(dest_counts[ID])
    
# Print the dropoff table for taxi zone 10; this recomputes getDestCounts over the whole of df.
BaisleyPark()

                   count  proportion
dropoff_zone_taxi                   
10.0                1258    0.095440
230.0                897    0.068052
-1.0                 667    0.050603
48.0                 454    0.034444
161.0                446    0.033837
163.0                430    0.032623
162.0                413    0.031333
164.0                405    0.030726
170.0                389    0.029512
130.0                360    0.027312
79.0                 233    0.017677
68.0                 225    0.017070
186.0                222    0.016842
142.0                199    0.015097
100.0                194    0.014718
132.0                194    0.014718
233.0                182    0.013808
239.0                180    0.013656
107.0                180    0.013656
238.0                180    0.013656
234.0                156    0.011835
229.0                154    0.011683
87.0                 153    0.011608
231.0                153    0.011608
237.0                148    0.011228
1