In [None]:
import pandas as pd
import numpy as np

# pandas I/O format names to benchmark; each name is appended to 'to_' and
# 'read_' to look up the DataFrame writer method and the pandas reader function.
FORMATS = ['clipboard', 'csv', 'feather', 'json', 'parquet', 'pickle', 'stata']

def get_dataset(size):
    """Build a fake DataFrame with ``size`` randomly generated rows.

    Columns:
        size: one of 'big', 'medium', 'small'.
        age:  random integer from 1 to 49.
        team: one of 'red', 'blue', 'yellow', 'green'.
        win:  'yes' or 'no'.
        date: random day between 2020-01-01 and 2022-12-31.
        prob: random float drawn uniformly from [0, 1).

    Values come from NumPy's global random state, so seed it with
    ``np.random.seed`` beforehand for reproducible output.
    """
    calendar = pd.date_range('2020-01-01', '2022-12-31')

    # The dict literal is evaluated top to bottom, so the random draws happen
    # in the same order as the columns appear in the resulting frame.
    generated = {
        'size': np.random.choice(['big', 'medium', 'small'], size),
        'age': np.random.randint(1, 50, size),
        'team': np.random.choice(['red', 'blue', 'yellow', 'green'], size),
        'win': np.random.choice(['yes', 'no'], size),
        'date': np.random.choice(calendar, size),
        'prob': np.random.uniform(0, 1, size),
    }

    frame = pd.DataFrame()
    for column, values in generated.items():
        frame[column] = values
    return frame

def set_dtypes(df):
    """Convert the columns of ``df`` to compact dtypes and return it.

    'size' and 'team' become categoricals, 'age' becomes int16, 'win' is
    mapped from 'yes'/'no' to True/False and 'prob' becomes float32.
    The frame is modified in place; the same object is returned.
    """
    yes_no = {'yes': True, 'no': False}

    # (column, converter) pairs, applied in this order.
    converters = (
        ('size', lambda col: col.astype('category')),
        ('team', lambda col: col.astype('category')),
        ('age', lambda col: col.astype('int16')),
        ('win', lambda col: col.map(yes_no)),
        ('prob', lambda col: col.astype('float32')),
    )
    for name, convert in converters:
        df[name] = convert(df[name])
    return df

In [None]:
for dataset_size, size in [['small', 1e2], ['medium', 1e4], ['large', 1e6]]:
    size = int(size)
    print(f"\n---===Testing {dataset_size} dataset, {size} rows===---")
    df = get_dataset(size)
    df = set_dtypes(df)
    for func in FORMATS:
        print('\t', func, 'writing')
        %time getattr(df, 'to_'+func)('dummy_file_name')
        print('\t', func, 'reading')
        %time getattr(pd, 'read_'+func)('dummy_file_name')


### Summary: pickle is the best data format for Python in terms of read/write speed. CSV is not a good choice for fast reading/writing of big files.