# Search Strings Fast
* `fast_startswith`: Searches string columns for matching beginnings.<br>
   Like pandas str.startswith(), but much faster for large amounts of data, and it returns the matching fragment.
* `fast_endswith`: Searches string columns for matching endings.

In [1]:
# blab init
try:
    import blab
except ImportError as e:
    !pip install blab
    import blab    
startup_notebook = blab.blab_startup()
%run $startup_notebook 

In [3]:
import pandas     as pd 
import bpyth      as bpy

# pandasklar
try:
    import pandasklar as pak 
except ImportError as e:
    !pip install pandasklar
    import pandasklar as pak   
    
# verbose
pak.Config.set('VERBOSE', True)

VERBOSE = True
--> setting verbose=True as default for all pandasklar functions

time: 10.8 s


## fast_startswith()

In [3]:
?pak.fast_startswith

time: 75.3 ms (started: 2022-10-31 18:18:41 +01:00)


[0;31mSignature:[0m
[0mpak[0m[0;34m.[0m[0mfast_startswith[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdf[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcol_search[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcol_found[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msearchfor[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfind_longest[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfind_identical[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Searches string columns for matching beginnings.
Like pandas str.startswith(), but much faster for large amounts of data,
and it returns the matching fragment. 
* col_search:     Name of the column to be searched
* col_found:      Names of the column into which the result is to be written
* searchfor:      Series or List of strings to be searched for
* find_longest:   Should the longest substring be given as the result? Otherwise the shorte

In [4]:
# Build the data to search in: a one-column DataFrame ('U') of random strings
# generated with len_min = len_max = 10.
size = 300_000  # REDUCE THIS IF NECESSARY
random_strings = pak.random_series(size, 'string', len_min=10, len_max=10, name='U')
manystrings = pd.DataFrame(random_strings)
print(f'{len(manystrings)} rows to search in')
manystrings.head(3)

300000 rows to search in


Unnamed: 0,U
0,5vie4igg3D
1,vrÜdLwrIKo
2,UvD2ykAFFe


time: 10.2 s (started: 2022-10-31 18:18:41 +01:00)


In [5]:
# Build the strings to search for: one tenth as many random strings as there
# are rows to search in, generated with len_min=2 and len_max=4.
n_search = size // 10
search_me = pak.random_series(n_search, 'string', len_min=2, len_max=4)
print(f'{len(search_me)} strings to search for')
search_me.head(3)

30000 strings to search for


0    äeä
1     Za
2    CZ7
Name: rnd_string, dtype: string

time: 550 ms (started: 2022-10-31 18:18:51 +01:00)


In [6]:
%%time 

# Try built-in startswith
# Wall time: 25s @ size = 300000

mask = manystrings.U.str.startswith(tuple(search_me))
result0 = manystrings[mask] # result of the built-in startswith


print('found', result0.shape[0], 'matching rows')
result0.head(3)

found 280610 matching rows
CPU times: user 29 s, sys: 32.5 ms, total: 29 s
Wall time: 29.5 s


Unnamed: 0,U
0,5vie4igg3D
1,vrÜdLwrIKo
2,UvD2ykAFFe


time: 29.5 s (started: 2022-10-31 18:18:52 +01:00)


In [7]:
%%time

# Try fast_startswith
# Wall time: 779 ms @ size = 300000

df = pak.fast_startswith( manystrings, 'U', 'found', search_me ) 
mask = df.found.notnull()
result1 = df[mask]   # result of fast_startswith

print('found', result1.shape[0], 'matching rows')
assert result0.shape[0] == result1.shape[0]
result1.head(3)

found 280610 matching rows
CPU times: user 1.22 s, sys: 0 ns, total: 1.22 s
Wall time: 1.25 s


Unnamed: 0,U,found
0,5vie4igg3D,5v
1,vrÜdLwrIKo,vr
2,UvD2ykAFFe,Uv


time: 1.29 s (started: 2022-10-31 18:19:21 +01:00)


### find_identical, find_longest

In [8]:
# Build new data to search in: shorter random strings (len_min=2, len_max=4),
# so that some strings can be the beginning of others.
size = 100_000
random_strings = pak.random_series(size, 'string', len_min=2, len_max=4, name='U')
manystrings = pd.DataFrame(random_strings)
print(f'{len(manystrings)} rows to search in')
manystrings.head(3)

100000 rows to search in


Unnamed: 0,U
0,Ale
1,pN9
2,kia


time: 1.76 s (started: 2022-10-31 18:19:22 +01:00)


In [9]:
# Search the column within itself and report the LONGEST matching beginning.
# find_identical=False: presumably keeps a string from simply matching itself -- see the docstring.
df = pak.fast_startswith(
    manystrings,
    'U',
    'found_longest',
    manystrings['U'],
    find_longest=True,
    find_identical=False,
)
mask = df['found_longest'].notna()
self1 = df.loc[mask]
self1

Unnamed: 0,U,found_longest
0,Ale,Al
1,pN9,pN
2,kia,ki
3,crI,cr
5,äFH,äF
...,...,...
99995,t1w,t1
99996,eBA6,eBA
99997,ONÖ2,ON
99998,S1i,S1


time: 767 ms (started: 2022-10-31 18:19:24 +01:00)


In [10]:
# Search the column within itself and report the SHORTEST matching beginning.
# find_identical=False: presumably keeps a string from simply matching itself -- see the docstring.
df = pak.fast_startswith(
    manystrings,
    'U',
    'found_shortest',
    manystrings['U'],
    find_longest=False,
    find_identical=False,
)
mask = df['found_shortest'].notna()
self2 = df.loc[mask]
self2

Unnamed: 0,U,found_shortest
0,Ale,Al
1,pN9,pN
2,kia,ki
3,crI,cr
5,äFH,äF
...,...,...
99995,t1w,t1
99996,eBA6,eB
99997,ONÖ2,ON
99998,S1i,S1


time: 749 ms (started: 2022-10-31 18:19:25 +01:00)


In [11]:
# Put the longest and the shortest match side by side, joined on the searched column 'U'
# (the only column both frames share).
result = self1.merge(self2, how='outer', on='U')
# The outer join must neither add nor lose rows relative to either input
assert len(result) == len(self1)
assert len(result) == len(self2)

# Show only the rows where the two search modes found different fragments
mask = result['found_shortest'].ne(result['found_longest'])
result.loc[mask]

Unnamed: 0,U,found_longest,found_shortest
70,wNÖ0,wNÖ,wN
102,LseQ,Lse,Ls
105,dpiö,dpi,dp
111,Äwu7,Äwu,Äw
113,032v,032,03
...,...,...,...
95353,GX4ö,GX4,GX
95361,UapQ,Uap,Ua
95366,STg3,STg,ST
95368,EHvu,EHv,EH


time: 159 ms (started: 2022-10-31 18:19:26 +01:00)


## fast_endswith()

In [12]:
# Build the data to search in: a one-column DataFrame ('U') of random strings
# generated with len_min = len_max = 10.
size = 100_000  # REDUCE THIS IF NECESSARY
random_strings = pak.random_series(size, 'string', len_min=10, len_max=10, name='U')
manystrings = pd.DataFrame(random_strings)
print(f'{len(manystrings)} rows to search in')
manystrings.head(3)

100000 rows to search in


Unnamed: 0,U
0,saBfxi6ÜjÄ
1,UrPoKeHHtö
2,öI9LÄg1eHJ


time: 3.42 s (started: 2022-10-31 18:19:26 +01:00)


In [13]:
# Build the strings to search for: one tenth as many random strings as there
# are rows to search in, generated with len_min=2 and len_max=4.
n_search = size // 10
search_me = pak.random_series(n_search, 'string', len_min=2, len_max=4)
print(f'{len(search_me)} strings to search for')
search_me.head(3)

10000 strings to search for


0     K1
1    BKÜ
2    8V3
Name: rnd_string, dtype: string

time: 212 ms (started: 2022-10-31 18:19:29 +01:00)


In [14]:
%%time 

# Try built-in endswith
# Wall time: 5.43 s @ size = 100000

mask = manystrings.U.str.endswith(tuple(search_me))
result0 = manystrings[mask] # result of the built-in startswith


print('found', result0.shape[0], 'matching rows')
result0.head(3)

found 57826 matching rows
CPU times: user 7.35 s, sys: 192 µs, total: 7.35 s
Wall time: 7.39 s


Unnamed: 0,U
4,CoOsvuXÖQü
5,üAHJeaCU0d
8,aDFWp1VCza


time: 7.43 s (started: 2022-10-31 18:19:29 +01:00)


In [15]:
%%time

# Try fast_endswith
# Wall time: 307 ms @ size = 100000

df = pak.fast_endswith( manystrings, 'U', 'found', search_me ) 
mask = df.found.notnull()
result1 = df[mask]   # result of fast_startswith

print('found', result1.shape[0], 'matching rows')
assert result0.shape[0] == result1.shape[0]
result1.head(3)

found 57826 matching rows
CPU times: user 481 ms, sys: 63 µs, total: 481 ms
Wall time: 497 ms


Unnamed: 0,U,found
4,CoOsvuXÖQü,Qü
5,üAHJeaCU0d,0d
8,aDFWp1VCza,za


time: 532 ms (started: 2022-10-31 18:19:37 +01:00)
