In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
def make_show_progress():
    start_time = time.time()
    lines_read = 0
    
    def show_progress(chunk_length):
        nonlocal lines_read
        
        lines_read += chunk_length
        elapsed_time = int(time.time() - start_time)
        print('{:,} lines read | time {:,}s'.format(lines_read, elapsed_time))
    
    return show_progress

def load_data():
    input_file = 'nyc-2017-yellow-taxi-trips-to-airport.cvs.gz'
    data_types = {
        'Unnamed: 0': np.int32,
        'VendorID': 'category',
        'passenger_count': np.int8,
        'trip_distance': np.float16,
        'RatecodeID': 'category',
        'store_and_fwd_flag': 'category',
        'PULocationID': 'category',
        'DOLoctionID': 'category',
        'payment_type': 'category',
        'fare_amount': np.float16,
        'extra': np.float16,
        'mta_tax': np.float16,
        'tip_amount': np.float16,
        'tolls_amount': np.float16,
        'improvement_surcharge': np.float16,
        'total_amount': np.float16
    }
    dates_to_parse = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']
    df = pd.DataFrame()
    show_progress = make_show_progress()
    chunk_iterator = pd.read_csv(input_file, compression='gzip', chunksize=100_000,
                                 dtype=data_types, parse_dates=dates_to_parse, infer_datetime_format=True)
    for chunk in chunk_iterator:
        df = pd.concat([df, chunk])
        show_progress(len(chunk))
    return df

In [75]:
df = load_data()

100,000 lines read | time 19s
200,000 lines read | time 38s
300,000 lines read | time 58s
400,000 lines read | time 77s
500,000 lines read | time 97s
600,000 lines read | time 116s
700,000 lines read | time 136s
800,000 lines read | time 155s
900,000 lines read | time 175s
1,000,000 lines read | time 195s
1,100,000 lines read | time 214s
1,200,000 lines read | time 234s
1,300,000 lines read | time 253s
1,400,000 lines read | time 273s
1,500,000 lines read | time 293s
1,600,000 lines read | time 313s
1,700,000 lines read | time 332s
1,800,000 lines read | time 352s
1,900,000 lines read | time 372s
2,000,000 lines read | time 391s
2,100,000 lines read | time 411s
2,200,000 lines read | time 431s
2,300,000 lines read | time 450s
2,400,000 lines read | time 470s
2,500,000 lines read | time 489s
2,533,072 lines read | time 496s


In [76]:
df.columns

Index(['Unnamed: 0', 'VendorID', 'tpep_pickup_datetime',
       'tpep_dropoff_datetime', 'passenger_count', 'trip_distance',
       'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID',
       'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount',
       'tolls_amount', 'improvement_surcharge', 'total_amount'],
      dtype='object')

In [77]:
df.shape

(2533072, 18)

In [84]:
print('{:,}'.format(df.memory_usage(index=True, deep=True).sum()/(2**20)))

262.9805164337158


In [85]:
%time df.to_pickle('nyc-2017-yellow-taxi-trips-to-airport.pkl.gz', compression='gzip')

Wall time: 1min 28s


In [3]:
df = pd.read_pickle('nyc-2017-yellow-taxi-trips-to-airport.pkl.gz')

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,14,2,2017-03-28 14:56:33,2017-03-28 16:14:19,1,17.40625,2,N,113,132,1,52.0,0.0,0.5,13.203125,0.0,0.300049,66.0
1,18,2,2017-03-28 14:56:35,2017-03-28 15:50:06,2,20.015625,2,N,141,132,1,52.0,0.0,0.5,10.0,5.761719,0.300049,68.5625
2,63,2,2017-03-28 14:56:45,2017-03-28 15:35:29,1,9.3125,1,N,43,138,1,32.0,0.0,0.5,9.84375,0.0,0.300049,42.625
3,128,2,2017-03-28 14:57:00,2017-03-28 15:27:54,1,10.007812,1,N,100,138,1,30.0,0.0,0.5,7.308594,5.761719,0.300049,43.875
4,140,2,2017-03-28 14:57:04,2017-03-28 15:45:30,1,15.8125,2,N,170,132,2,52.0,0.0,0.5,0.0,5.761719,0.300049,58.5625


In [5]:
%time df.to_parquet('nyc-2017-yellow-taxi-trips-to-airport.parquet.gz', compression='gzip')

KeyError: 10

In [4]:
%time df.to_feather('nyc-2017-yellow-taxi-trips-to-airport.feather')

KeyError: 10

In [47]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2533072 entries, 0 to 2533071
Data columns (total 18 columns):
Unnamed: 0               int32
VendorID                 category
tpep_pickup_datetime     object
tpep_dropoff_datetime    object
passenger_count          int8
trip_distance            float16
RatecodeID               category
store_and_fwd_flag       category
PULocationID             category
DOLocationID             int64
payment_type             category
fare_amount              float16
extra                    float16
mta_tax                  float16
tip_amount               float16
tolls_amount             float16
improvement_surcharge    float16
total_amount             float16
dtypes: category(5), float16(8), int32(1), int64(1), int8(1), object(2)
memory usage: 466.3 MB


In [43]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2533072 entries, 0 to 2533071
Data columns (total 18 columns):
Unnamed: 0               int64
VendorID                 int64
tpep_pickup_datetime     object
tpep_dropoff_datetime    object
passenger_count          int64
trip_distance            float64
RatecodeID               int64
store_and_fwd_flag       object
PULocationID             int64
DOLocationID             int64
payment_type             int64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
improvement_surcharge    float64
total_amount             float64
dtypes: float64(8), int64(7), object(3)
memory usage: 821.3 MB


In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,14,2,03/28/2017 02:56:33 PM,03/28/2017 04:14:19 PM,1,17.4,2,N,113,132,1,52.0,0.0,0.5,13.2,0.0,0.3,66.0
1,18,2,03/28/2017 02:56:35 PM,03/28/2017 03:50:06 PM,2,20.02,2,N,141,132,1,52.0,0.0,0.5,10.0,5.76,0.3,68.56
2,63,2,03/28/2017 02:56:45 PM,03/28/2017 03:35:29 PM,1,9.31,1,N,43,138,1,32.0,0.0,0.5,9.84,0.0,0.3,42.64
3,128,2,03/28/2017 02:57:00 PM,03/28/2017 03:27:54 PM,1,10.01,1,N,100,138,1,30.0,0.0,0.5,7.31,5.76,0.3,43.87
4,140,2,03/28/2017 02:57:04 PM,03/28/2017 03:45:30 PM,1,15.81,2,N,170,132,2,52.0,0.0,0.5,0.0,5.76,0.3,58.56


In [13]:
df.columns

Index(['Unnamed: 0', 'VendorID', 'tpep_pickup_datetime',
       'tpep_dropoff_datetime', 'passenger_count', 'trip_distance',
       'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID',
       'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount',
       'tolls_amount', 'improvement_surcharge', 'total_amount'],
      dtype='object')

In [26]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2533072 entries, 0 to 2533071
Data columns (total 18 columns):
Unnamed: 0               int64
VendorID                 int64
tpep_pickup_datetime     object
tpep_dropoff_datetime    object
passenger_count          int64
trip_distance            float64
RatecodeID               int64
store_and_fwd_flag       object
PULocationID             int64
DOLocationID             int64
payment_type             int64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
improvement_surcharge    float64
total_amount             float64
dtypes: float64(8), int64(7), object(3)
memory usage: 821.3 MB


In [21]:
def clean_data(df):
    df = df.rename(columns={
        'Unnamed: 0': 'original_row_number',
        'VendorID': 'vendor_id',
        'tpep_pickup_datetime': 'pickup_datetime',
        'tpep_dropoff_datetime': 'dropoff_datetime',
        'RatecodeID': 'ratecode_id',
        'PULocationID': 'pickup_location_id',
        'DOLocationID': 'dropoff_location_id',
    })
    return df

In [22]:
dfc = clean_data(df)

In [24]:
dfc.columns

Index(['original_row_number', 'vendor_id', 'pickup_datetime',
       'dropoff_datetime', 'passenger_count', 'trip_distance', 'ratecode_id',
       'store_and_fwd_flag', 'pickup_location_id', 'dropoff_location_id',
       'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount',
       'tolls_amount', 'improvement_surcharge', 'total_amount'],
      dtype='object')

In [23]:
dfc.head()

Unnamed: 0,original_row_number,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,ratecode_id,store_and_fwd_flag,pickup_location_id,dropoff_location_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,14,2,03/28/2017 02:56:33 PM,03/28/2017 04:14:19 PM,1,17.4,2,N,113,132,1,52.0,0.0,0.5,13.2,0.0,0.3,66.0
1,18,2,03/28/2017 02:56:35 PM,03/28/2017 03:50:06 PM,2,20.02,2,N,141,132,1,52.0,0.0,0.5,10.0,5.76,0.3,68.56
2,63,2,03/28/2017 02:56:45 PM,03/28/2017 03:35:29 PM,1,9.31,1,N,43,138,1,32.0,0.0,0.5,9.84,0.0,0.3,42.64
3,128,2,03/28/2017 02:57:00 PM,03/28/2017 03:27:54 PM,1,10.01,1,N,100,138,1,30.0,0.0,0.5,7.31,5.76,0.3,43.87
4,140,2,03/28/2017 02:57:04 PM,03/28/2017 03:45:30 PM,1,15.81,2,N,170,132,2,52.0,0.0,0.5,0.0,5.76,0.3,58.56


In [8]:
import os
os.remove('nyc-2017-yellow-taxi-trips-to-airport.feather')

PermissionError: [WinError 32] Der Prozess kann nicht auf die Datei zugreifen, da sie von einem anderen Prozess verwendet wird: 'nyc-2017-yellow-taxi-trips-to-airport.feather'

In [None]:
def clean_data(df):
    return dfa