# Urban Transportation

In [1]:
import numpy                 as np
import pandas                as pd
import matplotlib.pyplot     as plt
import seaborn               as sns
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
from shapely.geometry.multipolygon import MultiPolygon
from shapely import wkt
import shapely.vectorized
from itertools import compress

import warnings; warnings.simplefilter('ignore')
%matplotlib inline
sns.set()

In [2]:
yellow_cabs = pd.read_csv("yellow_trips.csv")

In [3]:
yellow_cabs.head()

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,trip_distance,total_amount
0,2014-04-03 18:28:10,2014-04-03 18:54:32,-74.006033,40.706284,-73.918837,40.744946,1,8.7,34.8
1,2014-04-16 15:42:00,2014-04-16 16:07:00,-73.979558,40.749357,0.0,0.0,1,9.37,30.0
2,2014-04-13 18:04:00,2014-04-13 18:07:00,-73.956453,40.775307,-73.954792,40.784992,1,0.89,6.5
3,2014-05-21 19:33:00,2014-05-21 19:45:00,-73.987212,40.75785,-73.960198,40.775472,1,2.4,12.5
4,2014-05-30 16:28:00,2014-05-30 16:51:00,-73.974292,40.755397,-74.011867,40.704222,3,5.58,24.5


In [5]:
uber_2014 = pd.read_csv("uber_trips_2014.csv")

In [6]:
uber_2014.head()

Unnamed: 0,pickup_datetime,pickup_latitude,pickup_longitude,base
0,4/1/14 0:11,40.769,-73.9549,B02512
1,4/1/14 0:17,40.7267,-74.0345,B02512
2,4/1/14 0:21,40.7316,-73.9873,B02512
3,4/1/14 0:28,40.7588,-73.9776,B02512
4,4/1/14 0:33,40.7594,-73.9722,B02512


In [7]:
uber_2014.base.describe()

count     4534327
unique          5
top        B02617
freq      1458853
Name: base, dtype: object

In [11]:
uber_2015 = pd.read_csv("uber_trips_2015.csv")

In [15]:
uber_2015.pickup_location_id.describe()

count    1.427048e+07
mean     1.520574e+02
std      7.159620e+01
min      1.000000e+00
25%      9.200000e+01
50%      1.570000e+02
75%      2.300000e+02
max      2.650000e+02
Name: pickup_location_id, dtype: float64

In [9]:
zones = pd.read_csv("zones.csv")

In [10]:
zones.head()

Unnamed: 0,location_id,borough,zone,service_zone,nta_code
0,1,EWR,Newark Airport,EWR,NJ01
1,2,Queens,Jamaica Bay,Boro Zone,QN61
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone,BX31
3,4,Manhattan,Alphabet City,Yellow Zone,MN22
4,5,Staten Island,Arden Heights,Boro Zone,SI48


In [13]:
zones.location_id.describe()

count    263.000000
mean     132.000000
std       76.065761
min        1.000000
25%       66.500000
50%      132.000000
75%      197.500000
max      263.000000
Name: location_id, dtype: float64

## Assign NTA label to Uber Data

In [219]:
geo_given = pd.read_csv("geographic.csv")

- Even rows = longitude (`[::2]`)
- Odd rows = latitude (`[1::2]`)

In [221]:
geo_new = pd.read_csv("nynta.csv")

In [222]:
def create_polygons(geo_df):
    """Build a lookup of NTA code -> shapely geometry from a table of WKT strings.

    Parameters
    ----------
    geo_df : pandas.DataFrame
        Must provide an ``NTACode`` column (used as dictionary keys) and a
        ``the_geom`` column holding WKT strings.

    Returns
    -------
    dict
        Maps each ``NTACode`` value to the geometry ``wkt.loads`` parses from
        the same row. If a code occurs more than once, the last row wins.
    """
    # Walk the two columns in lockstep instead of using ``geo_df.NTACode[i]``:
    # that form looks rows up by *label*, so it raises KeyError (or silently
    # picks the wrong row) as soon as the frame has been filtered, sorted, or
    # otherwise no longer carries a default 0..n-1 index.
    return {
        code: wkt.loads(geom)
        for code, geom in zip(geo_df["NTACode"], geo_df["the_geom"])
    }

In [310]:
# Wrap the first 10 Uber pickups as shapely Points, built as (longitude, latitude).
points = uber_2014[:10].apply(lambda x: Point(x["pickup_longitude"], x["pickup_latitude"]), axis=1)

In [309]:
%timeit uber_2014[:10].apply(lambda x: Point(x["pickup_longitude"], x["pickup_latitude"]), axis=1)

1.19 ms ± 33.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [489]:
# First 100 Uber pickups. Not referenced by any other cell shown here;
# presumably a small frame for trying out assign_nta — TODO confirm or remove.
sample = uber_2014[:100]

In [496]:
# function for iterating and assigning through NTAs given polygons and a dataframe you want to assign to
def assign_nta(df, poly_dict, lon_col="pickup_longitude", lat_col="pickup_latitude", verbose=True):
    """Label every row of ``df`` with the key of the first polygon containing it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one point per row; it is not modified.
    poly_dict : dict
        Maps a label (e.g. an NTA code) to a geometry accepted by
        ``shapely.vectorized.contains``. Polygons are tried in dictionary
        order and a point is claimed by the first polygon that contains it.
    lon_col, lat_col : str, optional
        Names of the longitude (x) and latitude (y) columns. The defaults
        match the Uber 2014 frame this was written for.
    verbose : bool, optional
        When True (default) print ``index key`` for every polygon as it is
        processed.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index, the columns of ``df`` (dtypes
        preserved) plus an ``"NTA"`` column. Rows are grouped by polygon in
        ``poly_dict`` order, each group sorted by latitude then longitude;
        rows that fall in no polygon come last with ``NaN`` in ``"NTA"``.
    """
    ordered = df.sort_values(by=[lat_col, lon_col]).reset_index(drop=True)
    lon = ordered[lon_col].to_numpy(dtype=float)
    lat = ordered[lat_col].to_numpy(dtype=float)

    n_rows = len(ordered)
    labels = np.full(n_rows, np.nan, dtype=object)
    # Rank of the polygon that claimed each row; unmatched rows keep the
    # largest rank so the stable sort below places them last.
    group = np.full(n_rows, len(poly_dict), dtype=np.intp)
    # Positions (into ``ordered``) of rows not yet claimed by any polygon.
    # Tracking positions in the full frame avoids indexing the original array
    # with offsets computed against an already-shrunk "unknown" array, which
    # would copy the wrong rows once any rows had been removed.
    pending = np.arange(n_rows)

    for i, (key, polygon) in enumerate(poly_dict.items()):
        if verbose:
            print(i, key)
        if pending.size == 0:
            continue  # everything is labelled; nothing left to test
        inside = np.asarray(
            shapely.vectorized.contains(polygon, lon[pending], lat[pending]),
            dtype=bool,
        )
        hits = pending[inside]
        labels[hits] = key
        group[hits] = i
        pending = pending[~inside]

    # Stable sort keeps the latitude/longitude ordering within each group.
    order = np.argsort(group, kind="stable")
    return ordered.iloc[order].reset_index(drop=True).assign(NTA=labels[order])

In [498]:
# Build the NTA polygons in this cell so it does not depend on a `poly_dict`
# left over in the kernel from a cell that is no longer in the notebook.
# NOTE(review): assumes nynta.csv (geo_new) carries the NTACode / the_geom
# columns that create_polygons reads — confirm.
poly_dict = create_polygons(geo_new)
new_df = assign_nta(uber_2014, poly_dict)

0 BK43
1 BK75
2 BX40
3 BK88
4 BK96
5 QN52
6 QN53
7 BK44
8 QN48
9 BX44
10 QN51
11 QN27
12 BK81
13 BK85
14 BK41
15 BX46
16 BK95
17 QN33
18 BK91
19 BK46
20 BK93
21 BX35
22 QN29
23 MN22
24 QN44
25 BX33
26 MN32
27 MN40
28 BK40
29 QN12
30 BX98
31 BX27
32 QN28
33 BK27
34 SI36
35 BK42
36 BX34
37 BK79
38 MN09
39 BX55
40 BX31
41 QN06
42 QN07
43 BK58
44 BK82
45 BX30
46 QN02
47 BK50
48 BX43
49 MN06
50 MN23
51 SI14
52 QN34
53 MN27
54 BX10
55 BX62
56 QN71
57 QN70
58 MN50
59 BK30
60 BK28
61 SI22
62 SI35
63 SI07
64 BX36
65 QN46
66 BK61
67 BX14
68 QN26
69 BK83
70 QN37
71 QN38
72 BK73
73 MN28
74 BK76
75 BX29
76 BK34
77 MN01
78 BK25
79 MN35
80 QN72
81 BX39
82 BK23
83 MN03
84 QN68
85 SI45
86 BX28
87 BX13
88 BK31
89 BK32
90 BK68
91 MN24
92 MN25
93 MN15
94 BK64
95 QN76
96 QN01
97 QN56
98 BK45
99 BK19
100 BK29
101 BX08
102 BK17
103 BX59
104 SI25
105 SI08
106 BK37
107 SI37
108 QN41
109 QN43
110 BK35
111 BK90
112 QN20
113 MN33
114 QN18
115 MN34
116 BK77
117 BK78
118 MN11
119 QN23
120 MN17
121 QN19
122 SI11
123

In [504]:
# Persist the NTA-labelled Uber 2014 trips; index=False keeps the row index out of the file.
new_df.to_csv("uber_2014_NTA.csv", index=False)