In [None]:
import functions as fn
import numpy as np
import pandas as pd 
import dask.dataframe as dd
import os
from tqdm import tqdm

### Reducing load time for train file

In [None]:
# Location of the training CSV, relative to the current working directory.
train_path = "./train.csv"

In [None]:
# Ask the project helper for the file length (presumably a line count of the
# CSV -- verify against functions.file_len) and report it.
n_rows = fn.file_len(train_path)
print(f'Exact number of rows: {n_rows}')

In [None]:
# Peek at the training file: read only the first 5 rows so the column names
# and inferred dtypes can be inspected without loading the whole CSV.
df_tmp = pd.read_csv(train_path, nrows=5)
df_tmp.head()

In [None]:
# Show the dtypes pandas inferred for the 5-row sample; these defaults are what
# the explicit dtype mapping further down replaces.
df_tmp.info()

We might not need float64 (about 15–16 significant decimal digits) for the longitude and latitude values. float32 (about 7 significant decimal digits) might be just enough, and it halves the memory used by those columns.

In [None]:
# Explicit, narrower dtypes for every column we load, to cut memory usage.
# 'pickup_datetime' is read as a plain string and parsed into datetimes later.
traintypes = dict(
    fare_amount='float32',
    pickup_datetime='str',
    pickup_longitude='float32',
    pickup_latitude='float32',
    dropoff_longitude='float32',
    dropoff_latitude='float32',
    passenger_count='uint8',
)

# Column names to load, in the same order as the dtype mapping.
cols = list(traintypes)

In [None]:
# Number of CSV rows handed back per chunk by pd.read_csv (5 million).
chunksize = 5 * 10**6

In [None]:
# Collects one parsed DataFrame per CSV chunk; merged into a single frame later.
df_list = []

# Stream the CSV in chunks of `chunksize` rows, loading only the typed columns.
chunk_reader = pd.read_csv(train_path, usecols=cols, dtype=traintypes, chunksize=chunksize)

for df_chunk in tqdm(chunk_reader):
    # Trim each timestamp string to its first 16 characters ('YYYY-MM-DD HH:MM')
    # so it matches the explicit format passed to to_datetime; anything after
    # the minutes is discarded.
    pickup_minutes = df_chunk['pickup_datetime'].str.slice(0, 16)
    df_chunk['pickup_datetime'] = pd.to_datetime(pickup_minutes, utc=True, format='%Y-%m-%d %H:%M')

    df_list.append(df_chunk)

In [None]:
# Merge all chunk dataframes into one dataframe.
# ignore_index=True discards the per-chunk indexes and builds a fresh 0..n-1
# RangeIndex, which is the default index that DataFrame.to_feather requires.
# Row order and index values are the same as concatenating the chunk indexes.
train_df = pd.concat(df_list, ignore_index=True)

In [None]:
# Drop the list of chunk dataframes now that they have been merged into
# train_df, so their memory can be reclaimed.
# NOTE(review): the loop variable df_chunk still references the last chunk;
# delete it as well if that memory matters.
del df_list

In [None]:
# Inspect the merged frame: row count, per-column dtypes and memory usage.
train_df.info()

In [None]:
# Save the merged frame in Feather format (the resulting file is reported to be
# about 1.5 GB) so later sessions can skip the CSV parsing.
# NOTE(review): to_feather requires a default index -- confirm train_df has one.
train_df.to_feather('nyc_taxi_data_raw.feather')

In [None]:
# Next time, load the dataframe straight from the Feather file instead of
# reading and parsing the CSV again.
train_df_new = pd.read_feather('nyc_taxi_data_raw.feather')