__[Open and try this file online (Colab)](https://colab.research.google.com/github/djekra/pandasklar/blob/master/jupyter/38_Search_Strings_Fast.ipynb)__

# Search Strings Fast
* `fast_startswith`: Searches string columns for matching beginnings.<br>
   Like pandas str.startswith(), but much faster for large amounts of data, and it returns the matching fragment.
* `fast_endswith`: Searches string columns for matching endings, analogous to `fast_startswith`.

In [1]:
# blab init
try:
    import blab
except ImportError as e:
    !pip install blab
    import blab    
startup_notebook = blab.blab_startup()
%run $startup_notebook 

blab init
environment['in_colab']     = False
environment['dropbox_path'] = /home/me/Data_Linux/Dropbox
environment['lib_path']     = /home/me/Data_Linux/Dropbox/31_Projekte/01_Python/libs
Start Time: 22:11:00


In [2]:
import pandas     as pd 
import bpyth      as bpy

# pandasklar
try:
    import pandasklar as pak 
except ImportError as e:
    !pip install pandasklar
    import pandasklar as pak   
    
# verbose
pak.Config.set('VERBOSE', True)

# copy_on_write
pd.set_option("mode.copy_on_write", True)

VERBOSE = True
--> setting verbose=True as default for all pandasklar functions



## fast_startswith()

In [3]:
?pak.fast_startswith

[0;31mSignature:[0m
[0mpak[0m[0;34m.[0m[0mfast_startswith[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdf[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcol_search[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcol_found[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msearchfor[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfind_longest[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfind_identical[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Searches string columns for matching beginnings.
Like pandas str.startswith(), but much faster for large amounts of data,
and it returns the matching fragment. 
* col_search:     Name of the column to be searched
* col_found:      Names of the column into which the result is to be written
* searchfor:      Series or List of strings to be searched for
* find_longest:   Should the longest substring be given as the result? Otherwise the shorte

In [4]:
# Haystack: a one-column DataFrame ('U') of random strings (len_min = len_max = 10).
size = 300_000  # REDUCE THIS IF NECESSARY
manystrings = pd.DataFrame(
    pak.random_series(size, 'string', len_min=10, len_max=10, name='U')
)
print(len(manystrings), 'rows to search in')
manystrings.head(3)

300000 rows to search in


Unnamed: 0,U
0,6ga1ZiaxKy
1,hXBqBIBoHS
2,uTürQGXÜ40


In [5]:
# Needles: the strings to search for -- a tenth as many as there are rows,
# generated with len_min=2 and len_max=4.
search_me = pak.random_series(int(size / 10), 'string', len_min=2, len_max=4)
print(len(search_me), 'strings to search for')
search_me.head(3)

30000 strings to search for


0    E2GI
1    mn8P
2      4O
Name: rnd_string, dtype: string

In [6]:
%%time 

# Try built-in startswith
# Wall time: 25s @ size = 300000

mask = manystrings.U.str.startswith(tuple(search_me))
result0 = manystrings[mask] # result of the built-in startswith


print('found', result0.shape[0], 'matching rows')
result0.head(3)

found 280755 matching rows
CPU times: user 29.4 s, sys: 3.36 ms, total: 29.4 s
Wall time: 29.6 s


Unnamed: 0,U
0,6ga1ZiaxKy
1,hXBqBIBoHS
2,uTürQGXÜ40


In [7]:
%%time

# Try fast_startswith
# Wall time: 779 ms @ size = 300000

df = pak.fast_startswith( manystrings, 'U', 'found', search_me ) 
mask = df.found.notnull()
result1 = df[mask]   # result of fast_startswith

print('found', result1.shape[0], 'matching rows')
assert result0.shape[0] == result1.shape[0]
result1.head(3)

found 280755 matching rows
CPU times: user 1.27 s, sys: 12 ms, total: 1.28 s
Wall time: 1.31 s


Unnamed: 0,U,found
0,6ga1ZiaxKy,6g
1,hXBqBIBoHS,hX
2,uTürQGXÜ40,uT


### find_identical, find_longest

In [8]:
# Haystack for the self-search demo: random strings generated with len_min=2, len_max=4.
size = 100_000
manystrings = pd.DataFrame(
    pak.random_series(size, 'string', len_min=2, len_max=4, name='U')
)
print(len(manystrings), 'rows to search in')
manystrings.head(3)

100000 rows to search in


Unnamed: 0,U
0,cE
1,Hu
2,Jo


In [9]:
# Search column U within itself and report the LONGEST matching beginning.
# find_identical=False: judging by the output, a string is not reported as its own match.
df = pak.fast_startswith(
    manystrings, 'U', 'found_longest', manystrings.U,
    find_identical=False,
    find_longest=True,
)
mask = df['found_longest'].notna()
self1 = df.loc[mask]
self1

Unnamed: 0,U,found_longest
3,äDHp,äD
4,hfo,hf
5,otR,ot
6,ü2W,ü2
7,8kbY,8k
...,...,...
99995,DIä,DI
99996,V41,V4
99997,puxn,pux
99998,1NGI,1N


In [10]:
# Search column U within itself and report the SHORTEST matching beginning.
# find_identical=False: judging by the output, a string is not reported as its own match.
df = pak.fast_startswith(
    manystrings, 'U', 'found_shortest', manystrings.U,
    find_identical=False,
    find_longest=False,
)
mask = df['found_shortest'].notna()
self2 = df.loc[mask]
self2

Unnamed: 0,U,found_shortest
3,äDHp,äD
4,hfo,hf
5,otR,ot
6,ü2W,ü2
7,8kbY,8k
...,...,...
99995,DIä,DI
99996,V41,V4
99997,puxn,pu
99998,1NGI,1N


In [11]:
# Put the longest and the shortest hit side by side, joined on U
# (the only column the two frames have in common).
result = self1.merge(self2, how='outer', on='U')

# The outer join must not add rows: both searches have to cover the same rows.
assert len(self1) == len(result)
assert len(self2) == len(result)

# Show only the rows where the shortest and the longest hit differ.
mask = result['found_shortest'] != result['found_longest']

result[mask]

Unnamed: 0,U,found_longest,found_shortest
34,xWTt,xWT,xW
47,4Wpd,4Wp,4W
58,Dosp,Dos,Do
65,fAün,fAü,fA
73,PwÄY,PwÄ,Pw
...,...,...,...
95342,jeej,jee,je
95350,rIMn,rIM,rI
95353,0PÜI,0PÜ,0P
95354,avyD,avy,av


## fast_endswith()

In [12]:
# Haystack: a one-column DataFrame ('U') of random strings (len_min = len_max = 10).
size = 100_000  # REDUCE THIS IF NECESSARY
manystrings = pd.DataFrame(
    pak.random_series(size, 'string', len_min=10, len_max=10, name='U')
)
print(len(manystrings), 'rows to search in')
manystrings.head(3)

100000 rows to search in


Unnamed: 0,U
0,VV3bflVex0
1,KMDFUKpa9I
2,Y3fUqz6bcV


In [13]:
# Needles: the strings to search for -- a tenth as many as there are rows,
# generated with len_min=2 and len_max=4.
search_me = pak.random_series(int(size / 10), 'string', len_min=2, len_max=4)
print(len(search_me), 'strings to search for')
search_me.head(3)

10000 strings to search for


0    LPIO
1    C99D
2     ong
Name: rnd_string, dtype: string

In [14]:
%%time 

# Try built-in endswith
# Wall time: 5.43 s @ size = 100000

mask = manystrings.U.str.endswith(tuple(search_me))
result0 = manystrings[mask] # result of the built-in startswith


print('found', result0.shape[0], 'matching rows')
result0.head(3)

found 58361 matching rows
CPU times: user 6.7 s, sys: 4.04 ms, total: 6.7 s
Wall time: 6.74 s


Unnamed: 0,U
0,VV3bflVex0
1,KMDFUKpa9I
3,wQg414üpQS


In [15]:
%%time

# Try fast_endswith
# Wall time: 307 ms @ size = 100000

df = pak.fast_endswith( manystrings, 'U', 'found', search_me ) 
mask = df.found.notnull()
result1 = df[mask]   # result of fast_startswith

print('found', result1.shape[0], 'matching rows')
assert result0.shape[0] == result1.shape[0]
result1.head(3)

found 58361 matching rows
CPU times: user 566 ms, sys: 4.02 ms, total: 570 ms
Wall time: 582 ms


Unnamed: 0,U,found
0,VV3bflVex0,x0
1,KMDFUKpa9I,9I
3,wQg414üpQS,QS
