Boolean Indexing with NumPy

1. Reading CSV files with NumPy

In [1]:
import numpy as np

# Parse the comma-separated file into an ndarray, skipping the single header
# row, then record the array's dimensions.
taxi = np.genfromtxt(
    "nyc_taxis.csv",
    delimiter=",",
    skip_header=1,
)
taxi_shape = np.shape(taxi)

2. Boolean Arrays

In [2]:
# Three small sample arrays: integers, strings and floats.
a = np.array([1, 2, 3, 4, 5])
b = np.array(["blue", "blue", "red", "blue"])
c = np.array([80.0, 103.4, 96.9, 200.3])

# Each comparison is evaluated element by element and yields a boolean array
# with the same length as its input.
a_bool = np.less(a, 3)
b_bool = (b == "blue")
c_bool = np.greater(c, 100)

3. Boolean Indexing with 1D ndarrays

In [3]:
# Column 1 of every row (presumably the pickup month, going by the name).
pickup_month = taxi[:, 1]

# Rows whose column-1 value is 1: build the mask, select, count.
january_bool = np.equal(pickup_month, 1)
january = pickup_month[january_bool]
january_rides = len(january)

# Same procedure for the value 2.
february_bool = np.equal(pickup_month, 2)
february = pickup_month[february_bool]
february_rides = len(february)

4. Boolean Indexing with 2D ndarrays

In [4]:
# Column 12 of every row (presumably the tip amount, going by the name).
tip_amount = taxi[:, 12]
# True for rows whose column-12 value exceeds 50.
tip_bool = np.greater(tip_amount, 50)
# Keep only the matching rows, restricted to columns 5 through 13 inclusive.
top_tips = taxi[tip_bool, slice(5, 14)]

5. Assigning Values in ndarrays

In [5]:
# Work on a copy so that `taxi` itself is never changed by the assignments.
taxi_modified = taxi.copy()

# Single element: row 28214, column 5.
taxi_modified[28214, 5] = 1
# Whole column: every row of column 0.
taxi_modified[:, 0] = 16
# Slice of a column: rows 1800 and 1801 of column 7 receive the mean of
# column 7 (the scalar is broadcast across the slice).
taxi_modified[1800:1802, 7] = np.mean(taxi_modified[:, 7])

6. Assignment Using Boolean Arrays

In [6]:
# Work on a copy so the original `taxi` ndarray is left untouched.
taxi_copy = taxi.copy()
# Column 13 of the copy; basic slicing returns a view into `taxi_copy`, so
# `total_amount` reflects the assignment below.
total_amount = taxi_copy[:, 13]
# Replace only the negative values of column 13 with 0. The explicit column
# index matters: a row-only boolean index (`taxi_copy[mask] = 0`) would
# overwrite every column of the matching rows.
taxi_copy[total_amount < 0, 13] = 0

# Build a single column of zeros with one entry per row of `taxi` and append
# it to the right-hand side of the array.
zeros = np.zeros([taxi.shape[0], 1])
taxi_modified = np.concatenate([taxi, zeros], axis=1)
print(taxi_modified)

# Write 1 into column 15 for every row whose column-5 value is 2, 3 or 5.
# NOTE(review): column 15 is the appended column only if `taxi` has exactly
# 15 columns - confirm against the CSV.
taxi_modified[np.isin(taxi_modified[:, 5], [2, 3, 5]), 15] = 1

[[2.016e+03 1.000e+00 1.000e+00 ... 6.999e+01 1.000e+00 0.000e+00]
 [2.016e+03 1.000e+00 1.000e+00 ... 5.430e+01 1.000e+00 0.000e+00]
 [2.016e+03 1.000e+00 1.000e+00 ... 3.780e+01 2.000e+00 0.000e+00]
 ...
 [2.016e+03 6.000e+00 3.000e+01 ... 6.334e+01 1.000e+00 0.000e+00]
 [2.016e+03 6.000e+00 3.000e+01 ... 4.475e+01 1.000e+00 0.000e+00]
 [2.016e+03 6.000e+00 3.000e+01 ... 5.484e+01 2.000e+00 0.000e+00]]


* Challenge: Which is the most popular airport?

In [7]:
# For each airport, keep the full rows whose column-6 value equals that
# airport's code, then count the rows kept.
# NOTE(review): the codes 2, 3 and 5 are taken to mean JFK, LaGuardia and
# Newark from the variable names only - confirm against the data dictionary.
jfk = taxi[taxi[:, 6] == 2]
jfk_count = len(jfk)

laguardia = taxi[taxi[:, 6] == 3]
laguardia_count = len(laguardia)

newark = taxi[taxi[:, 6] == 5]
newark_count = len(newark)

* Challenge: Calculating Statistics for Trips on Clean Data

# Per-row speed: column 7 divided by (column 8 / 3600).
# NOTE(review): presumably column 7 is distance in miles and column 8 is
# duration in seconds, which would make this miles per hour - confirm.
trip_mph = np.divide(taxi[:, 7], taxi[:, 8] / 3600)
# Keep only the rows whose computed speed is below 100.
cleaned_taxi = taxi[np.less(trip_mph, 100)]

# Column means over the retained rows.
mean_distance = np.mean(cleaned_taxi[:, 7])
mean_length = np.mean(cleaned_taxi[:, 8])
mean_total_amount = np.mean(cleaned_taxi[:, 13])