# Fast `startswith` against many prefixes: bisect-based lookup vs. pandas `str.startswith`

In [3]:
import numpy  as np
import pandas as pd 
import random
from pandas._testing import rands_array
from bisect import bisect_right

# create random strings
def zufallsdaten(anz):
    """Create a DataFrame with ``anz`` rows of random alphanumeric test strings.

    Columns:
        string_A: random strings of exactly 10 characters.
        string_B: independent random strings, each cut down to a random
            length between 2 and 5 characters (both inclusive).

    The characters are drawn with ``np.random`` and the cut lengths with the
    ``random`` module, so seed both for reproducible data.

    Parameters:
        anz: number of rows to generate.

    Returns:
        pandas.DataFrame with the columns ``string_A`` and ``string_B``.
    """
    # Local import so this cell does not depend on editing the import cell.
    import string

    laenge = 10
    zeichen = np.array(list(string.ascii_letters + string.digits), dtype=(np.str_, 1))

    def zufallsstrings(anzahl):
        # Draw anzahl * laenge single characters and reinterpret every run of
        # `laenge` characters as one fixed-width string. This replaces the
        # private pandas helper `pd.util.testing.rands_array`, which is no
        # longer available in current pandas releases.
        einzelzeichen = np.random.choice(zeichen, size=laenge * anzahl)
        return einzelzeichen.view((np.str_, laenge)).astype(object)

    def bearbeite_element(skalar):
        # keep only a random-length prefix (2..5 characters)
        return skalar[0:random.randint(2, 5)]

    return pd.DataFrame({
        'string_A': zufallsstrings(anz),
        'string_B': [bearbeite_element(s) for s in zufallsstrings(anz)],
    })

# Haystack: one million random 10-character strings to search in.
manystrings = zufallsdaten(1000000)['string_A'].to_frame()

# Needles: the distinct 2-5 character fragments to search for.
search_me = zufallsdaten(100000)['string_B'].drop_duplicates().to_frame()



In [4]:
# Preview the frame that will be searched; the rendered table is truncated in the middle.
manystrings

Unnamed: 0,string_A
0,Am1wQQQqjx
1,BQNhYNxYRF
2,A6qBjCOpS6
3,8RLaZSFiDc
4,qA8cUP8lGC
...,...
999995,CBA9qMuM3v
999996,uExudwYvPN
999997,hnDE9s5fuc
999998,w6Fu5dQalu


In [5]:
# Fast startswith alternative: finds the longest / shortest matching fragment
# and writes it into the column foundfieldname.
def fast_startswith(df, searchfieldname, foundfieldname, searchseries, find_longest=True, find_identical=True):
    """Mark every row of ``df`` whose string starts with one of the search strings.

    The search strings are grouped by length. Within one length group the only
    possible prefix of a value is the greatest search string that sorts at or
    before it, so a single bisect on the sorted group replaces a scan over
    all search strings.

    Parameters:
        df: DataFrame holding the strings to test. It is not modified.
        searchfieldname: name of the column in ``df`` with the strings to test.
        foundfieldname: name of the column that receives the matching fragment.
        searchseries: the fragments to look for (strings).
        find_longest: if True, the longest matching fragment wins when several
            fragments of different length match; if False, the shortest wins.
        find_identical: if True, a fragment equal to the whole tested string
            counts as a match; if False, such identical pairs are ignored and
            only strictly shorter fragments are reported.

    Returns:
        A copy of ``df`` with the extra column ``foundfieldname`` holding the
        matching fragment, or None where nothing matched.
    """

    # startswith alternative, works only if all strings in searchme have the
    # same length and searchme is sorted. Returns the matching fragment or None.
    def startwiths(data, searchme, find_identical):
        pos = bisect_right(searchme, data)
        if pos == 0:
            # Every candidate sorts after data, so none can be a prefix.
            # Guarding here avoids indexing searchme[-1] by accident.
            return None
        prefix = searchme[pos - 1]
        if (find_identical or data != prefix) and data.startswith(prefix):
            return prefix
        return None

    search = pd.DataFrame(searchseries)
    search.columns = ['searchstring']
    search['len'] = search.searchstring.str.len()
    # Later groups overwrite earlier hits: ascending length lets the longest
    # fragment win, descending length lets the shortest win.
    lengroups = search.groupby('len').agg(list).reset_index().sort_values('len', ascending=find_longest)

    result = df.copy()
    result[foundfieldname] = None
    for _, row in lengroups.iterrows():
        candidates = sorted(row.searchstring)
        matches = result[searchfieldname].apply(startwiths, searchme=candidates, find_identical=find_identical)
        hit = matches.notna().to_numpy()
        if hit.any():
            # Assign through .loc on the frame itself. The former
            # result[col].update(...) was a chained in-place update, which
            # does not write back to the frame under pandas Copy-on-Write.
            result.loc[hit, foundfieldname] = matches.to_numpy()[hit]
    return result
    
    

In [6]:
def fast_startswith2(df, searchfieldname, foundfieldname, searchseries):
    """Mark every row of ``df`` whose string starts with one of the search strings.

    Variant that walks the length groups of the search strings directly.
    Groups are visited in ascending length and later hits overwrite earlier
    ones, so the longest matching fragment is reported. A fragment equal to
    the whole tested string counts as a match.

    Parameters:
        df: DataFrame holding the strings to test. It is not modified.
        searchfieldname: name of the column in ``df`` with the strings to test.
        foundfieldname: name of the column that receives the matching fragment.
        searchseries: the fragments to look for (strings).

    Returns:
        A copy of ``df`` with the extra column ``foundfieldname`` holding the
        matching fragment, or None where nothing matched.
    """

    # startswith alternative, works only if all strings in searchme have the
    # same length and searchme is sorted. Returns the matching fragment or None.
    def startwiths(data, searchme):
        pos = bisect_right(searchme, data)
        if pos == 0:
            # Every candidate sorts after data, so none can be a prefix.
            # Guarding here avoids indexing searchme[-1] by accident.
            return None
        prefix = searchme[pos - 1]
        return prefix if data.startswith(prefix) else None

    search = pd.DataFrame(searchseries)
    search.columns = ['searchstring']
    search['len'] = search.searchstring.str.len()

    result = df.copy()
    result[foundfieldname] = None
    # Iterate the groups explicitly instead of using groupby.apply for its
    # side effects. groupby sorts the keys, so lengths come in ascending order.
    for _, gruppe in search.groupby('len'):
        candidates = sorted(gruppe.searchstring)
        matches = result[searchfieldname].apply(startwiths, searchme=candidates)
        hit = matches.notna().to_numpy()
        if hit.any():
            # Assign through .loc on the frame itself. The former
            # data[col].update(...) was a chained in-place update, which
            # does not write back to the frame under pandas Copy-on-Write.
            result.loc[hit, foundfieldname] = matches.to_numpy()[hit]
    return result



In [7]:
%%time 
mask = manystrings.string_A.str.startswith(tuple(search_me.string_B))
result0 = manystrings[mask]
# result0: built-in startswith
# Wall time: 1min 6s 



CPU times: user 1min 8s, sys: 0 ns, total: 1min 8s
Wall time: 1min 8s


In [8]:
%%time
df = fast_startswith(manystrings, 'string_A', 'found', search_me.string_B) 
mask = df.found.notnull()
result1 = df[mask]   
#print( result0.shape[0],   result1.shape[0])
assert result0.shape[0] == result1.shape[0]

# result1: iterate through groups of strings with same length.
# also returns the matching fragment
# Wall time: 6.33 s



CPU times: user 6.56 s, sys: 10.3 ms, total: 6.57 s
Wall time: 6.56 s


In [9]:
%%time
df = fast_startswith2(manystrings, 'string_A', 'found', search_me.string_B) 
mask = df.found.notnull()
result2 = df[mask]    
#print( result0.shape[0],   result2.shape[0])
assert result0.shape[0] == result2.shape[0]

# result2: apply fast startswith method on groups of strings with same length
# also returns the matching fragment
# Wall time: 5.94 s



CPU times: user 6.2 s, sys: 24 ms, total: 6.23 s
Wall time: 6.22 s


In [10]:
# Do both variants report the same fragment? Differences may occur if
# fast_startswith is called with find_longest=False.
result = result1.merge(result2, on='string_A', suffixes=('_1','_2'))
mask = result['found_1'] != result['found_2']
result.loc[mask]



Unnamed: 0,string_A,found_1,found_2


In [11]:
# Search the fragments within themselves: which fragments start with another,
# non-identical fragment?
df = fast_startswith(search_me, 'string_B', 'found', search_me['string_B'], find_identical=False)
mask = df['found'].notnull()
df.loc[mask]



Unnamed: 0,string_B,found
1,9noCu,9n
3,q7c,q7
5,AYlW,AY
6,8Xk5,8X
7,Pus0,Pu
...,...,...
99995,L2wbM,L2
99996,OyFZ4,Oy
99997,owAN,ow
99998,5sU,5s
