# Pandas Bamboo Testing

In [1]:
# Enable IPython's autoreload extension in mode 2 (reload all modules before
# running a cell) so edits to imported modules are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [4]:
import sys

import pandas as pd

from bamboo import bamboo as bb

## Data

In [65]:
# Single text column with 10000 identical rows; shared input for the
# accessor benchmarks.
data = {
    'str_col': ['hello world, this is a string line... pandas is stronger with bamboo'] * 10000,
}

In [5]:
# Ten text columns (col0 ... col9) of 3500 identical rows each; input for the
# DataFrame-construction benchmark.
big_data = {
    f'col{i}': ['hello world this is a big string line, pandas is stronger with bamboo'] * 3500
    for i in range(10)
}

### DataFrames

In [66]:
# Build the same single-column dataset once as a pandas frame and once as a
# bamboo frame; the accessor benchmarks below read and write these two objects.
df = pd.DataFrame(data)
bdf = bb.DataFrame(data)

## Create DataFrame

In [67]:
# Baseline: construct a pandas DataFrame from the dict of lists.
# `-o` makes %timeit return its result object so it can be passed to compare().
pandas_time = %timeit -o pd.DataFrame(big_data)

1.43 ms ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [68]:
# Same construction from the same dict, through bamboo's DataFrame.
# `-o` makes %timeit return its result object so it can be passed to compare().
bamboo_time = %timeit -o bb.DataFrame(big_data)

239 µs ± 448 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [69]:
# NOTE: `compare` is defined in a cell further down this notebook, so on a
# fresh kernel that cell has to be run before this one.
compare(pandas_time, bamboo_time)

6.0x faster with bamboo


In [10]:
%%timeit
# Cost of converting an existing pandas DataFrame into a bamboo DataFrame.
bdf = bb.DataFrame(df)

27.3 µs ± 139 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [64]:
def compare(pan, bam):
    """Print how many times faster the bamboo timing is than the pandas timing.

    Both arguments are expected to expose an ``average`` attribute (as the
    result objects returned by ``%timeit -o`` do). The ratio
    ``pan.average / bam.average`` is printed with one decimal place.
    """
    speedup = pan.average / bam.average
    print(f'{speedup:.1f}x faster with bamboo')

# Accessors

## .at
Each benchmark runs in a loop so that many consecutive `.at` gets and sets are measured.

In [70]:
def pandas_run():
    """Read every 'str_col' cell of the global `df` through `.at` and write it back."""
    for i in range(len(df)):
        value = df.at[i, 'str_col']
        # Writing the same value back exercises the setter without changing the data.
        df.at[i, 'str_col'] = value
        
# Keep the %timeit result (-o) so it can be passed to compare().
pandas_time = %timeit -o pandas_run()

146 ms ± 604 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [74]:
def bamboo_run():
    """Read every 'str_col' cell of the global `bdf` through `.at` and write it back."""
    for i in range(len(bdf)):
        value = bdf.at[i, 'str_col']
        # Writing the same value back exercises the setter without changing the data.
        bdf.at[i, 'str_col'] = value
        
# Keep the %timeit result (-o) so it can be passed to compare().
bamboo_time = %timeit -o bamboo_run()

6.98 ms ± 31 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [75]:
# Result: ratio of the pandas `.at` loop time to the bamboo `.at` loop time.
compare(pandas_time, bamboo_time)

20.9x faster with bamboo


### Version 2: chained indexing
With bamboo, chained indexing is allowed for both getting and setting.

In [76]:
# Version 2: chained indexing (column first, then row).
# NOTE: this rebinds the name `bamboo_run`, replacing the `.at` version above.
def bamboo_run():
    """Read every 'str_col' cell of the global `bdf` via `bdf[col][row]` and write it back."""
    for i in range(len(bdf['str_col'])):
        value = bdf['str_col'][i]
        # Writing the same value back exercises chained assignment without changing the data.
        bdf['str_col'][i] = value
        
# Keep the %timeit result (-o) so it can be passed to compare().
bamboo_time = %timeit -o bamboo_run()

2.34 ms ± 5.87 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [77]:
# Result: chained bamboo indexing compared against the pandas `.at` loop
# (`pandas_time` is not re-measured in this section).
compare(pandas_time, bamboo_time)

62.3x faster with bamboo


## .loc

In [78]:
def pandas_run():
    """Read every 'str_col' cell of the global `df` through `.loc` and write it back."""
    for i in range(len(df)):
        value = df.loc[i, 'str_col']
        # Writing the same value back exercises the setter without changing the data.
        df.loc[i, 'str_col'] = value
        
# Keep the %timeit result (-o) so it can be passed to compare().
pandas_time = %timeit -o pandas_run()

339 ms ± 1.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [79]:
def bamboo_run():
    """Read every 'str_col' cell of the global `bdf` through `.loc` and write it back.

    The row count is taken from `bdf` itself, so the benchmark depends only on
    the bamboo frame and visits exactly the rows that frame holds.
    """
    for i in range(len(bdf)):
        value = bdf.loc[i, 'str_col']
        # Writing the same value back exercises the setter without changing the data.
        bdf.loc[i, 'str_col'] = value
    
# Keep the %timeit result (-o) so it can be passed to compare().
bamboo_time = %timeit -o bamboo_run()

7.26 ms ± 16.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [80]:
# Result: ratio of the pandas `.loc` loop time to the bamboo `.loc` loop time.
compare(pandas_time, bamboo_time)

46.6x faster with bamboo


# slicing

In [82]:
# pandas `.loc` slices are label-based and include both endpoints (labels 1 through 100).
pandas_time = %timeit -o df.loc[1:100, 'str_col']

8.48 µs ± 25.9 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [84]:
# NOTE(review): confirm that bamboo's `.loc[1:100]` returns the same rows as
# pandas (which includes the end label) so the two timings are comparable.
bamboo_time = %timeit -o bdf.loc[1:100, 'str_col']

1.13 µs ± 2.71 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [85]:
# Result: ratio of the pandas slice time to the bamboo slice time.
compare(pandas_time, bamboo_time)

7.5x faster with bamboo


# properties

In [80]:
%%timeit
# Cost of reading the `shape` property on the pandas frame.
df.shape

368 ns ± 0.972 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [79]:
%%timeit
# Cost of reading the `shape` property on the bamboo frame.
bdf.shape

128 ns ± 0.225 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [10]:
%%timeit
# Cost of reading the `size` property on the pandas frame.
df.size

3.25 µs ± 30.9 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [12]:
%%timeit
# Cost of reading the `size` property on the bamboo frame.
bdf.size

133 ns ± 0.645 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


# concat
TODO

In [98]:
# NOTE(review): `df2` and `bdf2` are both rebound in the next cell, so this
# cell has no lasting effect and looks like a leftover.
df2 = pd.DataFrame(data)
bdf2 = bb.DataFrame(data)

In [99]:
# Second dataset for the concat benchmark: columns col10 ... col14, 3500 rows each.
data2 = {f'col{i}': ['dataset 2'] * 3500 for i in range(10, 15)}
# Wrap the second dataset in a pandas frame and a bamboo frame for the concat timings.
df2 = pd.DataFrame(data2)
bdf2 = bb.DataFrame(data2)

In [100]:
%%timeit
# Row-wise concat of the two pandas frames.
# NOTE(review): `df` holds only 'str_col' while `df2` holds col10 ... col14, so
# the frames share no column names — confirm this is the case to benchmark.
pd.concat([df, df2], axis=0)

1.25 ms ± 23.1 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [101]:
%%timeit
# Row-wise concat of the two bamboo frames.
# NOTE(review): `bdf` and `bdf2` are built from dicts with no common keys —
# confirm bamboo's result matches what pandas produces for the same inputs.
bb.concat([bdf, bdf2], axis=0)

115 µs ± 1.02 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


# Memory Usage

In [87]:
# Size reported by sys.getsizeof for the pandas frame, converted from bytes to MiB.
sys.getsizeof(df) / 1024**2

1.1922492980957031

In [96]:
# Deep memory usage of the 'str_col' column, selected from the per-column Series.
# NOTE(review): the saved output below lists both Index and str_col, so it
# appears to predate the `.str_col` selection — re-run to refresh it.
df.memory_usage(deep=True).str_col 

Index          132
str_col    1250000
dtype: int64

In [97]:
# todo
# Adds up sys.getsizeof of each item in the bamboo column and converts bytes
# to MiB; only the items are counted, not the object that holds them.
sum(map(sys.getsizeof, bdf['str_col'])) / 1024**2

1.1157989501953125

# R&D
Working zone and TODO stack.

In [None]:
%%timeit
# Substring test over the whole 'str_col' column via the pandas `.str` accessor.
df.str_col.str.contains('us')

In [210]:
%%timeit
# Row filter: keep only the rows whose 'str_col' value contains 'foo'.
df.loc[df.str_col.str.contains('foo')]

610 µs ± 3.85 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
%%timeit
# Applies an identity function to every element, so the timing mostly reflects
# the per-element overhead of Series.apply (the definition is timed as well).
def sameVal(value):
    return value

df.str_col.apply(sameVal)

In [183]:
%%timeit
# Pure-Python filter, variant 1: boolean mask plus an explicit loop over columns.
# NOTE(review): `t` is not defined in any cell shown in this notebook;
# presumably a dict mapping column names to lists — confirm and define it.
found = [True if 'us' in value else False for value in t['str_col']]
subset = {}
for str_col in t.keys():
    # NOTE(review): the row count comes from `df`, not from `t` — confirm both have the same length.
    data =[t[str_col][i] for i in range(len(df)) if found[i]]
    subset[str_col] = data

401 µs ± 2.16 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [203]:
%%timeit
# Pure-Python filter, variant 2: boolean mask plus a dict comprehension over
# the columns of `t` (row count taken from `t` itself).
found = [True if 'foo' in value else False for value in t['str_col']]
{str_col: [t[str_col][i] for i in range(len(t['str_col'])) if found[i]] for str_col in t.keys()}

291 µs ± 1.89 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [211]:
%%timeit
# Pure-Python filter, variant 3: collect the matching row indices once, then
# pick only those rows from every column of `t`.
found = [idx for idx, value in enumerate(t['str_col']) if 'foo' in value]
subset = {str_col: [t[str_col][i] for i in found] for str_col in t.keys()}

173 µs ± 800 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [194]:
# Inspect the 'idx' entry of `t`.
t['idx']

range(0, 3500)

In [199]:
# Plant one value containing 'foo' at row 3499 so the 'foo' filters have a match.
t['str_col'][3499] = 'hello world this is a big string line, googble gobble, one of foo'

In [205]:
# Indices of the rows in t['str_col'] whose value contains 'foo'.
found = [idx for idx, value in enumerate(t['str_col']) if 'foo' in value]

In [206]:
# Show the matching row indices.
found

[3499]

In [87]:
# NOTE: this replaces df['str_col'] with a categorical column in place, so any
# benchmark cell re-run after this one operates on the categorical data.
df['str_col'] = df['str_col'].astype('category')

In [89]:
# Size in bytes reported by sys.getsizeof for `df` after the categorical conversion.
sys.getsizeof(df)

3878

In [17]:
# Scratch check: a list stored as a dict value can be grown in place.
test = dict(col=[1, 2, 3])
test['col'] += [3, 4, 5]

In [18]:
# Show the dict after the in-place extension.
test

{'col': [1, 2, 3, 3, 4, 5]}