__[Open and try this file online (Colab)](https://colab.research.google.com/github/djekra/pandasklar/blob/master/jupyter/38_Search_Strings_Fast.ipynb)__

# Search Strings Fast
* `fast_startswith`: Searches string columns for matching beginnings.<br>
   Like pandas str.startswith(), but much faster for large amounts of data, and it returns the matching fragment.
* `fast_endswith`: Searches string columns for matching endings.

In [1]:
# blab init
# Bootstrap the notebook environment: make sure the `blab` helper package is
# importable (installing it with pip on first use), then execute the startup
# notebook that blab points to.
try:
    import blab
except ImportError as e:
    # Not installed yet: install from inside the notebook, then import again.
    !pip install blab
    import blab    
# blab_startup() yields the startup notebook to run (presumably its path -- TODO confirm).
startup_notebook = blab.blab_startup()
# IPython expands $startup_notebook to the value of the Python variable.
%run $startup_notebook 

blab init
environment['dropbox_path'] = /home/me/Data_Linux/Dropbox
environment['lib_path']     = /home/me/Data_Linux/Dropbox/31_Projekte/01_Python/libs
Start Time: 21:55:21


time: 423 ms


In [2]:
import pandas     as pd 
import bpyth      as bpy

# pandasklar
# Import pandasklar under the alias `pak` used throughout this notebook; if it
# is missing, install it with pip from inside the notebook and import again.
try:
    import pandasklar as pak 
except ImportError as e:
    !pip install pandasklar
    import pandasklar as pak   
    
# verbose
# Make verbose=True the default for all pandasklar functions in this session.
pak.Config.set('VERBOSE', True)

VERBOSE = True
--> setting verbose=True as default for all pandasklar functions

time: 270 ms


## fast_startswith()

In [3]:
# Show the signature and docstring of fast_startswith (IPython `?` help).
?pak.fast_startswith

time: 60.2 ms


[0;31mSignature:[0m
[0mpak[0m[0;34m.[0m[0mfast_startswith[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdf[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcol_search[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcol_found[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msearchfor[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfind_longest[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfind_identical[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Searches string columns for matching beginnings.
Like pandas str.startswith(), but much faster for large amounts of data,
and it returns the matching fragment. 
* col_search:     Name of the column to be searched
* col_found:      Names of the column into which the result is to be written
* searchfor:      Series or List of strings to be searched for
* find_longest:   Should the longest substring be given as the result? Otherwise the shorte

In [4]:
# Build the data to search in: a one-column frame 'U' holding `size`
# random strings, each exactly 10 characters long.
size = 300_000  # REDUCE THIS IF NECESSARY
manystrings = pd.DataFrame(
    pak.random_series(size, 'string', len_min=10, len_max=10, name='U')
)
print(f'{len(manystrings)} rows to search in')
manystrings.head(3)

300000 rows to search in


Unnamed: 0,U
0,J5oMYDzufP
1,gbÄiAÄaEo8
2,twPIu2Kj1m


time: 10.2 s


In [5]:
# Build the strings to search for: one tenth as many random strings as there
# are rows to search in, each between 2 and 4 characters long.
search_me = pak.random_series(size // 10, 'string', len_min=2, len_max=4)
print(f'{len(search_me)} strings to search for')
search_me.head(3)

30000 strings to search for


0    yXT
1     Xu
2    Nög
Name: rnd_string, dtype: string

time: 555 ms


In [6]:
%%time 

# Try built-in startswith
# Wall time: 25s @ size = 300000

mask = manystrings.U.str.startswith(tuple(search_me))
result0 = manystrings[mask] # result of the built-in startswith


print('found', result0.shape[0], 'matching rows')
result0.head(3)

found 281714 matching rows
CPU times: user 31.2 s, sys: 54.6 ms, total: 31.3 s
Wall time: 32 s


Unnamed: 0,U
0,J5oMYDzufP
1,gbÄiAÄaEo8
2,twPIu2Kj1m


time: 32 s


In [7]:
%%time

# Try fast_startswith
# Wall time: 779 ms @ size = 300000

df = pak.fast_startswith( manystrings, 'U', 'found', search_me ) 
mask = df.found.notnull()
result1 = df[mask]   # result of fast_startswith

print('found', result1.shape[0], 'matching rows')
assert result0.shape[0] == result1.shape[0]
result1.head(3)

found 281714 matching rows
CPU times: user 1.32 s, sys: 80 µs, total: 1.32 s
Wall time: 1.35 s


Unnamed: 0,U,found
0,J5oMYDzufP,J5
1,gbÄiAÄaEo8,gb
2,twPIu2Kj1m,tw


time: 1.37 s


### find_identical, find_longest

In [8]:
# Build the data to search in: a one-column frame 'U' holding `size`
# random strings of 2 to 4 characters each.
size = 100_000
manystrings = pd.DataFrame(
    pak.random_series(size, 'string', len_min=2, len_max=4, name='U')
)
print(f'{len(manystrings)} rows to search in')
manystrings.head(3)

100000 rows to search in


Unnamed: 0,U
0,bCÜI
1,Giöf
2,4T1


time: 1.87 s


In [9]:
# Search column 'U' against its own values and, where several candidates
# match, report the longest one (find_longest=True).
# find_identical=False presumably excludes a string matching itself -- see
# the fast_startswith docstring to confirm.
df = pak.fast_startswith(
    manystrings,
    'U',
    'found_longest',
    manystrings.U,
    find_identical=False,
    find_longest=True,
)
mask = df['found_longest'].notnull()
self1 = df[mask]
self1

Unnamed: 0,U,found_longest
0,bCÜI,bC
1,Giöf,Gi
2,4T1,4T
3,iODk,iOD
4,S5t,S5
...,...,...
99995,lDi,lD
99996,GiQ3,Gi
99997,mX9,mX
99998,4FS,4F


time: 612 ms


In [10]:
# Search column 'U' against its own values again, this time with
# find_longest=False so the shortest matching candidate is reported.
# find_identical=False presumably excludes a string matching itself -- see
# the fast_startswith docstring to confirm.
df = pak.fast_startswith(
    manystrings,
    'U',
    'found_shortest',
    manystrings.U,
    find_identical=False,
    find_longest=False,
)
mask = df['found_shortest'].notnull()
self2 = df[mask]
self2

Unnamed: 0,U,found_shortest
0,bCÜI,bC
1,Giöf,Gi
2,4T1,4T
3,iODk,iO
4,S5t,S5
...,...,...
99995,lDi,lD
99996,GiQ3,Gi
99997,mX9,mX
99998,4FS,4F


time: 629 ms


In [11]:
# Put both results side by side with an outer join on their shared column(s).
# The asserts check that the joined frame has exactly as many rows as each
# of the two inputs.
result = self1.merge(self2, how='outer')
assert len(self1) == len(result)
assert len(self2) == len(result)

# Keep only the rows where the longest and the shortest match differ.
mask = result.found_longest != result.found_shortest

result[mask]

Unnamed: 0,U,found_longest,found_shortest
3,iODk,iOD,iO
10,xM4p,xM4,xM
11,au3m,au3,au
12,ämlä,äml,äm
17,ÄecJ,Äec,Äe
...,...,...,...
95355,äuwi,äuw,äu
95356,4Jiö,4Ji,4J
95357,auuD,auu,au
95367,pipF,pip,pi


time: 194 ms


## fast_endswith()

In [12]:
# Build the data to search in: a one-column frame 'U' holding `size`
# random strings, each exactly 10 characters long.
size = 100_000  # REDUCE THIS IF NECESSARY
manystrings = pd.DataFrame(
    pak.random_series(size, 'string', len_min=10, len_max=10, name='U')
)
print(f'{len(manystrings)} rows to search in')
manystrings.head(3)

100000 rows to search in


Unnamed: 0,U
0,äwVPwLsTf4
1,NAHgiiiOöÖ
2,eÖdtÜ8CuLk


time: 3.48 s


In [13]:
# Build the strings to search for: one tenth as many random strings as there
# are rows to search in, each between 2 and 4 characters long.
search_me = pak.random_series(size // 10, 'string', len_min=2, len_max=4)
print(f'{len(search_me)} strings to search for')
search_me.head(3)

10000 strings to search for


0    2bZ
1    ijH
2     üQ
Name: rnd_string, dtype: string

time: 232 ms


In [14]:
%%time 

# Try built-in endswith
# Wall time: 5.43 s @ size = 100000

mask = manystrings.U.str.endswith(tuple(search_me))
result0 = manystrings[mask] # result of the built-in startswith


print('found', result0.shape[0], 'matching rows')
result0.head(3)

found 57274 matching rows
CPU times: user 7.51 s, sys: 16 ms, total: 7.52 s
Wall time: 7.62 s


Unnamed: 0,U
2,eÖdtÜ8CuLk
4,nuoeIMEkUC
6,tToüqifeLg


time: 7.63 s


In [15]:
%%time

# Try fast_endswith
# Wall time: 307 ms @ size = 100000

df = pak.fast_endswith( manystrings, 'U', 'found', search_me ) 
mask = df.found.notnull()
result1 = df[mask]   # result of fast_startswith

print('found', result1.shape[0], 'matching rows')
assert result0.shape[0] == result1.shape[0]
result1.head(3)

found 57274 matching rows
CPU times: user 475 ms, sys: 7.94 ms, total: 483 ms
Wall time: 485 ms


Unnamed: 0,U,found
2,eÖdtÜ8CuLk,Lk
4,nuoeIMEkUC,UC
6,tToüqifeLg,Lg


time: 504 ms
