# Persistenz
* https://stackoverflow.com/questions/17098654/how-to-store-a-dataframe-using-pandas
* https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#io-perf

* Gut geeignete Verfahren sind Feather und Pickle. Beide sind schnell und bewahren die Datentypen.
* Feather ist manchmal störrisch bei Spaltennamen.
* HDF funktioniert nicht gut mit Strings und produziert immer große Dateien.

In [1]:
# blab init
import blab
# blab_startup() supplies what is, judging by the variable name, the startup
# notebook of blab; the %run magic then executes that file.
startup_notebook = blab.blab_startup()
%run $startup_notebook

dropbox_path = C:\Users\me\Dropbox
Start Time: 13:33:00


In [2]:
import numpy      as np
import pandas     as pd 
import bpyth      as bpy
import pandasklar as pak 

# Short alias for pak.grid; grid(...) is used further down to display results.
grid = pak.grid

time: 4.44 s


In [3]:
import sys
import pickle
import json

time: 999 µs


In [4]:
# Test data set: the result of pak.leute(8000) with the columns
# 'merkmale' and 'history' removed via pak.drop_cols.
df_test = pak.drop_cols(pak.leute(8000), ['merkmale', 'history'])

time: 2.38 s


In [5]:
def compare_datatypes(df, result):
    """Return the rows in which the column analyses of two DataFrames differ.

    Both frames are passed through pak.analyse_cols, the 'mem_usage' column
    is dropped from each analysis, and the two analyses are handed to
    pak.get_different_rows, whose result is returned unchanged.

    Args:
        df: the original DataFrame.
        result: the DataFrame obtained after saving and loading.
    """
    # NOTE(review): 'mem_usage' is presumably excluded because memory usage
    # can differ without any change of datatype -- confirm.
    ignored_cols = ['mem_usage']
    analysis_before = pak.analyse_cols(df)
    analysis_after = pak.analyse_cols(result)
    analysis_before = pak.drop_cols(analysis_before, ignored_cols)
    analysis_after = pak.drop_cols(analysis_after, ignored_cols)
    return pak.get_different_rows(analysis_before, analysis_after)

time: 2 ms


## Speichern und Laden in verschiedenen Formaten

In [6]:
# csv-Format
# csv funktioniert nicht

def csv_save(df, path='persistenz.csv'):
    """Write *df* to a CSV file at *path* without the index column.

    Args:
        df: DataFrame to store.
        path: target file; defaults to 'persistenz.csv' in the working
            directory.
    """
    df.to_csv(path, index=False)

def csv_load(path='persistenz.csv'):
    """Read the CSV file at *path* and return its content as a DataFrame."""
    return pd.read_csv(path)

def csv_save_and_load(df, path='persistenz.csv'):
    """Round-trip *df* through a CSV file and assert nothing changed.

    Raises:
        AssertionError: if pak.check_identical(df, result) is falsy.
    """
    csv_save(df, path)
    result = csv_load(path)
    assert pak.check_identical(df, result)

def csv_save_and_load_dt(df, path='persistenz.csv'):
    """Round-trip *df* through a CSV file and compare the column analyses.

    Returns:
        Whatever compare_datatypes(df, result) returns for the original
        and the reloaded DataFrame.
    """
    csv_save(df, path)
    result = csv_load(path)
    return compare_datatypes(df, result)

time: 6 ms


In [7]:
# json-Format
# json funktioniert nicht

def json_save(df, path='persistenz.json'):
    """Write *df* to a JSON file at *path* using the to_json defaults.

    Args:
        df: DataFrame to store.
        path: target file; defaults to 'persistenz.json' in the working
            directory.
    """
    df.to_json(path)

def json_load(path='persistenz.json'):
    """Read the JSON file at *path* and return its content as a DataFrame."""
    return pd.read_json(path)

def json_save_and_load(df, path='persistenz.json'):
    """Round-trip *df* through a JSON file and assert nothing changed.

    Raises:
        AssertionError: if pak.check_identical(df, result) is falsy.
    """
    json_save(df, path)
    result = json_load(path)
    assert pak.check_identical(df, result)

def json_save_and_load_dt(df, path='persistenz.json'):
    """Round-trip *df* through a JSON file and compare the column analyses.

    Returns:
        Whatever compare_datatypes(df, result) returns for the original
        and the reloaded DataFrame.
    """
    json_save(df, path)
    result = json_load(path)
    return compare_datatypes(df, result)

time: 5 ms


In [8]:
# pickle-Format

def pickle_save(df, path='persistenz.pickle'):
    """Pickle *df* to the file at *path*.

    Args:
        df: DataFrame to store.
        path: target file; defaults to 'persistenz.pickle' in the working
            directory.
    """
    df.to_pickle(path)

def pickle_load(path='persistenz.pickle'):
    """Unpickle the file at *path* and return the stored object.

    NOTE: unpickling can execute arbitrary code. Only load files that were
    written by a trusted source (e.g. by pickle_save in this notebook).
    """
    return pd.read_pickle(path)

def pickle_save_and_load(df, path='persistenz.pickle'):
    """Round-trip *df* through a pickle file and assert nothing changed.

    Raises:
        AssertionError: if pak.check_identical(df, result) is falsy.
    """
    pickle_save(df, path)
    result = pickle_load(path)
    assert pak.check_identical(df, result)

def pickle_save_and_load_dt(df, path='persistenz.pickle'):
    """Round-trip *df* through a pickle file and compare the column analyses.

    Returns:
        Whatever compare_datatypes(df, result) returns for the original
        and the reloaded DataFrame.
    """
    pickle_save(df, path)
    result = pickle_load(path)
    return compare_datatypes(df, result)

time: 5 ms


In [9]:
# feather-Format

def feather_save(df, path='persistenz.feather'):
    """Write *df* to a feather file at *path*.

    Args:
        df: DataFrame to store.
        path: target file; defaults to 'persistenz.feather' in the working
            directory.

    NOTE: according to the remarks at the top of this notebook, feather can
    be picky about column names.
    """
    df.to_feather(path)

def feather_load(path='persistenz.feather'):
    """Read the feather file at *path* and return its content as a DataFrame."""
    return pd.read_feather(path)

def feather_save_and_load(df, path='persistenz.feather'):
    """Round-trip *df* through a feather file and assert nothing changed.

    Raises:
        AssertionError: if pak.check_identical(df, result) is falsy.
    """
    feather_save(df, path)
    result = feather_load(path)
    assert pak.check_identical(df, result)

def feather_save_and_load_dt(df, path='persistenz.feather'):
    """Round-trip *df* through a feather file and compare the column analyses.

    Returns:
        Whatever compare_datatypes(df, result) returns for the original
        and the reloaded DataFrame.
    """
    feather_save(df, path)
    result = feather_load(path)
    return compare_datatypes(df, result)

time: 6 ms


## Geschwindigkeitsvergleich

In [10]:
# Time the complete feather round trip (save, load, identity assert) on df_test.
%timeit feather_save_and_load(df_test)

588 ms ± 121 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
time: 5.66 s


In [11]:
# Time the complete pickle round trip (save, load, identity assert) on df_test.
%timeit pickle_save_and_load(df_test)

340 ms ± 73.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
time: 3.11 s


In [12]:
# Disabled -- presumably because json does not work (see the note in the json cell above).
#%timeit json_save_and_load(df_test)

time: 1 ms


In [13]:
# Disabled -- presumably because csv does not work (see the note in the csv cell above).
#%timeit csv_save_and_load(df_test)

time: 1 ms


## Datatypes Unterschiede

In [14]:
# Round-trip df_test through feather; r holds the rows of the column analysis
# that differ between the original and the reloaded DataFrame.
r = feather_save_and_load_dt(df_test)
grid(r)

'Keine Datensätze'

time: 425 ms


In [15]:
# Round-trip df_test through pickle; r holds the rows of the column analysis
# that differ between the original and the reloaded DataFrame.
r = pickle_save_and_load_dt(df_test)
grid(r)

'Keine Datensätze'

time: 475 ms


In [16]:
# Disabled -- presumably because json does not work (see the note in the json cell above).
#r = json_save_and_load_dt(df_test)
#grid(r)

time: 1.99 ms


In [17]:
# Disabled -- presumably because csv does not work (see the note in the csv cell above).
#r = csv_save_and_load_dt(df_test)
#grid(r)

time: 0 ns
