In [1]:
import pandas as pd

In [2]:
%time data_df = pd.read_csv("../datasets/taxi-trips/taxi_trips_small.csv", low_memory=False)

CPU times: user 56.8 s, sys: 4.79 s, total: 1min 1s
Wall time: 1min


### Take a look at the data

In [3]:
data_df.head(3).T

Unnamed: 0,0,1,2
Trip ID,2847377eac0bee949bfa1ca27913ed0d3a1f1b81,284c50ba7f586099fb533bf0d7f1536c6d7108ac,2847e0e16c5fdea8289eb406c12f697fbbde1ae3
Taxi ID,afe9520e229118e6dbb1fa164a31175e2b95a5f1e968d1...,145704ff91badf2808dafedfa719dddb09643f1563f8f6...,25026b4fed9610802a9737c038f487a76836346a67274e...
Trip Start Timestamp,05/18/2019 03:45:00 PM,05/02/2019 06:45:00 PM,05/15/2019 12:00:00 PM
Trip End Timestamp,05/18/2019 04:00:00 PM,05/02/2019 06:45:00 PM,05/15/2019 12:30:00 PM
Trip Seconds,540,480,1109
Trip Miles,0,1,4.92
Pickup Census Tract,1.70313e+10,1.70313e+10,1.7031e+10
Dropoff Census Tract,1.70311e+10,1.70313e+10,1.70311e+10
Pickup Community Area,32,28,3
Dropoff Community Area,8,32,7


### See how much memory is being used (in total and per column)

In [4]:
def bytes2mb(b): return (b / (1024 ** 2))

In [5]:
mem_s = data_df.memory_usage(deep=True, index=False)

In [6]:
mem_df = pd.DataFrame(round(mem_s / (1024 ** 2), 2), columns=['mem_in_megs'])

In [7]:
print("Total memory usage:", round(sum(mem_s) / (1024 ** 3), 2), "GB")

Total memory usage: 8.36 GB


#### Memory usage per column (in megabytes)

In [8]:
mem_df.sort_values("mem_in_megs")

Unnamed: 0,mem_in_megs
Tips,86.3
Dropoff Centroid Longitude,86.3
Dropoff Centroid Latitude,86.3
Pickup Centroid Longitude,86.3
Pickup Centroid Latitude,86.3
Trip Total,86.3
Extras,86.3
Tolls,86.3
Fare,86.3
Community Areas,86.3


### Combine mem usage, data types, unique values and sample values

In [9]:
mem_df\
    .join(pd.DataFrame(data_df.dtypes, columns=["dtypes"]))\
    .join(pd.DataFrame(data_df.nunique(), columns=["nuniques"]))\
    .join(data_df[:1].T).sort_values("mem_in_megs")

Unnamed: 0,mem_in_megs,dtypes,nuniques,0
Tips,86.3,float64,3622,0
Dropoff Centroid Longitude,86.3,float64,590,-87.6189
Dropoff Centroid Latitude,86.3,float64,590,41.8909
Pickup Centroid Longitude,86.3,float64,490,-87.621
Pickup Centroid Latitude,86.3,float64,490,41.885
Trip Total,86.3,float64,15186,9
Extras,86.3,float64,2452,1
Tolls,86.3,float64,441,0
Fare,86.3,float64,8920,8
Community Areas,86.3,float64,77,38


#### Notice that "Payment Type" is a string, perhaps it is just category (with very few unique strings?)

In [10]:
bytes2mb(data_df['Payment Type'].memory_usage(deep=True))

694.0905427932739

In [11]:
data_df['Payment Type'].value_counts()

Cash           5840043
Credit Card    5331367
No Charge        60445
Mobile           29568
Unknown          23192
Prcard           19556
Pcard             4020
Dispute           3211
Prepaid            118
Way2ride             4
Name: Payment Type, dtype: int64

In [12]:
data_df['Payment Type'] = data_df['Payment Type'].astype('category')

In [13]:
data_df['Payment Type'].value_counts()

Cash           5840043
Credit Card    5331367
No Charge        60445
Mobile           29568
Unknown          23192
Prcard           19556
Pcard             4020
Dispute           3211
Prepaid            118
Way2ride             4
Name: Payment Type, dtype: int64

In [14]:
bytes2mb(data_df['Payment Type'].memory_usage(deep=True))

10.788501739501953

In [15]:
print("Total memory usage:", round(sum(data_df.memory_usage(deep=True, index=False)) / (1024 ** 3), 2), "GB")

Total memory usage: 7.69 GB


We went from almost 700 megs to only 10 megs!!!

#### Check out Trip Start/End Timestamp, looks like they are supposed to be timestamps, but pandas thinks they are strings

In [16]:
bytes2mb(data_df['Trip Start Timestamp'].memory_usage(deep=True)) \
    + bytes2mb(data_df['Trip End Timestamp'].memory_usage(deep=True))

1704.365312576294

In [17]:
data_df['Trip Start Timestamp'] = pd.to_datetime(data_df['Trip Start Timestamp'][:10], format="%m/%d/%Y %H:%M:%S %p")
data_df['Trip End Timestamp'] = pd.to_datetime(data_df['Trip End Timestamp'][:10], format="%m/%d/%Y %H:%M:%S %p")

In [18]:
bytes2mb(data_df['Trip Start Timestamp'].memory_usage(deep=True)) \
    + bytes2mb(data_df['Trip End Timestamp'].memory_usage(deep=True))

172.60031127929688

In [19]:
print("Total memory usage:", round(sum(data_df.memory_usage(deep=True, index=False)) / (1024 ** 3), 2), "GB")

Total memory usage: 6.19 GB


#### Notice that Pickup/Dropoff Centroid Locations are strings made up of values already in lat/long, elsewhere in table...drop them!

In [20]:
bytes2mb(data_df['Pickup Centroid Location'].memory_usage(deep=True)) \
    + bytes2mb(data_df['Dropoff Centroid  Location'].memory_usage(deep=True))

1033.4056596755981

In [21]:
data_df.drop(['Pickup Centroid Location', 'Dropoff Centroid  Location'], axis=1, inplace=True)

In [22]:
print("Total memory usage:", round(sum(data_df.memory_usage(deep=True, index=False)) / (1024 ** 3), 2), "GB")

Total memory usage: 5.18 GB


#### Can we convert other columns to categories and save lots of memory?

In [23]:
bytes2mb(data_df['Company'].memory_usage(deep=True)) , \
bytes2mb(data_df['Company'].astype('category').memory_usage(deep=True))

(701.70250415802, 21.59163761138916)

In [24]:
bytes2mb(data_df['Taxi ID'].memory_usage(deep=True)) , \
bytes2mb(data_df['Taxi ID'].astype('category').memory_usage(deep=True))

(1995.1275033950806, 23.369253158569336)

In [25]:
bytes2mb(data_df['Trip ID'].memory_usage(deep=True)) , \
bytes2mb(data_df['Trip ID'].astype('category').memory_usage(deep=True))

(1046.3885383605957, 1409.5385780334473)

In [26]:
data_df['Company'] = data_df['Company'].astype('category')

In [27]:
data_df['Taxi ID'] = data_df['Taxi ID'].astype('category')

#### Where do we stand?

In [36]:
pd.DataFrame(bytes2mb(data_df.memory_usage(deep=True, index=False)))\
    .join(pd.DataFrame(data_df.dtypes, columns=["dtypes"]))\
    .join(pd.DataFrame(data_df.nunique(), columns=["nuniques"]))\
    .join(data_df[:1].T).sort_values("mem_in_megs")

ValueError: columns overlap but no suffix specified: Index([0], dtype='object')

In [29]:
print("Total memory usage:", round(sum(data_df.memory_usage(deep=True, index=False)) / (1024 ** 3), 2), "GB")

Total memory usage: 2.59 GB


### _Awesome_ reference:
https://www.dataquest.io/blog/pandas-big-data/