# Cleanup Strings
* `remove_str`: Removes a list of unwanted substrings from a Series of strings.
* `remove_words`: Removes a list of unwanted words from a Series of strings.
* `replace_str`: Replaces substrings from a Series of strings according to a dict.

In [1]:
# blab init
try:
    import blab
except ImportError as e:
    !pip install blab
    import blab    
startup_notebook = blab.blab_startup()
%run $startup_notebook 

In [3]:
import pandas     as pd 
import bpyth      as bpy

# pandasklar
try:
    import pandasklar as pak 
except ImportError as e:
    !pip install pandasklar
    import pandasklar as pak   
    
# verbose
pak.Config.set('VERBOSE', True)

VERBOSE = True
--> setting verbose=True as default for all pandasklar functions

time: 10.8 s


In [3]:
# Create test data to search in.
# Columns A and C hold random 10-character strings, column B holds one of the
# three tokens 'YO', 'UFF', 'HI'.
size = 100000
a = pak.random_series(size, 'string', len_min=10, len_max=10, p_dup=0.9)
b = pak.random_series(size, 'choice', choice=['YO', 'UFF', 'HI'], p_dup=0.9)
c = pak.random_series(size, 'string', len_min=10, len_max=10, p_dup=0.9)
df_test = pak.dataframe([a, b, c])

# Glue the columns together in the order B A B C B, once per separator.
# Every part -- including the first one -- is preceded by the separator.
separators = {'nosep': '', 'space': ' ', 'comma': ','}
for name, sep in separators.items():
    df_test[name] = sep + df_test.B + sep + df_test.A + sep + df_test.B + sep + df_test.C + sep + df_test.B

# Only the glued columns are needed from here on.
df_test = pak.drop_cols(df_test, list('ABC'))
df_test

Unnamed: 0,nosep,space,comma
0,YOgefgegbgfeYOadfacgdafaYO,YO gefgegbgfe YO adfacgdafa YO,",YO,gefgegbgfe,YO,adfacgdafa,YO"
1,HIgaebdfgfccHIfbaaebgcfeHI,HI gaebdfgfcc HI fbaaebgcfe HI,",HI,gaebdfgfcc,HI,fbaaebgcfe,HI"
2,UFFcfgdcfcfbaUFFgceecddfcdUFF,UFF cfgdcfcfba UFF gceecddfcd UFF,",UFF,cfgdcfcfba,UFF,gceecddfcd,UFF"
3,HIgabdgdaefbHIbdgfbccgeeHI,HI gabdgdaefb HI bdgfbccgee HI,",HI,gabdgdaefb,HI,bdgfbccgee,HI"
4,HIcgfcffbgegHIdddebgfffdHI,HI cgfcffbgeg HI dddebgfffd HI,",HI,cgfcffbgeg,HI,dddebgfffd,HI"
...,...,...,...
99995,UFFggafebfdcbUFFefdbcbababUFF,UFF ggafebfdcb UFF efdbcbabab UFF,",UFF,ggafebfdcb,UFF,efdbcbabab,UFF"
99996,UFFgagfacgdbbUFFcfeccdfedfUFF,UFF gagfacgdbb UFF cfeccdfedf UFF,",UFF,gagfacgdbb,UFF,cfeccdfedf,UFF"
99997,UFFgegafebgdaUFFaegdeecdagUFF,UFF gegafebgda UFF aegdeecdag UFF,",UFF,gegafebgda,UFF,aegdeecdag,UFF"
99998,UFFdcecedeaacUFFbadbaaddfgUFF,UFF dcecedeaac UFF badbaaddfg UFF,",UFF,dcecedeaac,UFF,badbaaddfg,UFF"


time: 14.3 s


## remove_str()

In [4]:
# Show signature and docstring of remove_str (IPython help syntax)
?pak.remove_str

time: 83.9 ms


Signature: pak.remove_str(series, remove_list, safemode=True)
Docstring:
Removes a list of unwanted substrings from a Series of strings.
* remove_list: list of substrings to remove
* safemode:    Selects the algorithm.
               safemode=True:  Each substring is removed separately
               safemode=False: Works with one regular expression.
                               Special characters such as asterisks must be backslashed.
File:      /media/me/DATA/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/string.py
Type:      function


In [5]:
# Test data: an independent copy that holds only the separator-free column
cols = ['nosep']
df = df_test.loc[:, cols].copy()
df.head(3)

Unnamed: 0,nosep
0,YOgefgegbgfeYOadfacgdafaYO
1,HIgaebdfgfccHIfbaaebgcfeHI
2,UFFcfgdcfcfbaUFFgceecddfcdUFF


time: 39.3 ms


In [6]:
# compare orig / after remove_str
# Iterate over the fixed list `cols` instead of `df.columns`: the loop adds
# columns to `df`, so re-running this cell would otherwise also process the
# freshly created '*_remove_str' columns and keep appending new ones.
for col in cols:
    df[col + '_remove_str'] = pak.remove_str(df[col], ['YO', 'UFF', 'HI'])
df

Unnamed: 0,nosep,nosep_remove_str
0,YOgefgegbgfeYOadfacgdafaYO,gefgegbgfeadfacgdafa
1,HIgaebdfgfccHIfbaaebgcfeHI,gaebdfgfccfbaaebgcfe
2,UFFcfgdcfcfbaUFFgceecddfcdUFF,cfgdcfcfbagceecddfcd
3,HIgabdgdaefbHIbdgfbccgeeHI,gabdgdaefbbdgfbccgee
4,HIcgfcffbgegHIdddebgfffdHI,cgfcffbgegdddebgfffd
...,...,...
99995,UFFggafebfdcbUFFefdbcbababUFF,ggafebfdcbefdbcbabab
99996,UFFgagfacgdbbUFFcfeccdfedfUFF,gagfacgdbbcfeccdfedf
99997,UFFgegafebgdaUFFaegdeecdagUFF,gegafebgdaaegdeecdag
99998,UFFdcecedeaacUFFbadbaaddfgUFF,dcecedeaacbadbaaddfg


time: 199 ms


## remove_words()

In [7]:
# Show signature and docstring of remove_words (IPython help syntax)
?pak.remove_words

time: 25.4 ms


Signature: pak.remove_words(series, remove_list)
Docstring:
Removes a list of unwanted words from a Series of strings.
Works by regular expression, so special characters such as asterisks must be backslashed.
File:      /media/me/DATA/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/string.py
Type:      function


In [8]:
# Test data: fresh copy of df_test with all three columns (nosep, space, comma),
# so the columns added below do not end up in df_test itself
df = df_test.copy()
df.head(3)

Unnamed: 0,nosep,space,comma
0,YOgefgegbgfeYOadfacgdafaYO,YO gefgegbgfe YO adfacgdafa YO,",YO,gefgegbgfe,YO,adfacgdafa,YO"
1,HIgaebdfgfccHIfbaaebgcfeHI,HI gaebdfgfcc HI fbaaebgcfe HI,",HI,gaebdfgfcc,HI,fbaaebgcfe,HI"
2,UFFcfgdcfcfbaUFFgceecddfcdUFF,UFF cfgdcfcfba UFF gceecddfcd UFF,",UFF,cfgdcfcfba,UFF,gceecddfcd,UFF"


time: 69.4 ms


In [9]:
# compare orig / after remove_str / after remove_words
# Loop over the columns of the untouched df_test, not of df: df grows inside
# the loop, and re-running the cell would otherwise also process the derived
# columns and create '*_str_str', '*_words_str', ... on top of them.
remove_list = ['YO', 'UFF', 'HI']
for col in df_test.columns:
    df[col + '_str']   = pak.remove_str(  df[col], remove_list)
    df[col + '_words'] = pak.remove_words(df[col], remove_list)

pak.move_cols(df).head()

Unnamed: 0,comma,comma_str,comma_words,nosep,nosep_str,nosep_words,space,space_str,space_words
0,",YO,gefgegbgfe,YO,adfacgdafa,YO",",,gefgegbgfe,,adfacgdafa,",",,gefgegbgfe,,adfacgdafa,",YOgefgegbgfeYOadfacgdafaYO,gefgegbgfeadfacgdafa,YOgefgegbgfeYOadfacgdafaYO,YO gefgegbgfe YO adfacgdafa YO,gefgegbgfe adfacgdafa,gefgegbgfe adfacgdafa
1,",HI,gaebdfgfcc,HI,fbaaebgcfe,HI",",,gaebdfgfcc,,fbaaebgcfe,",",,gaebdfgfcc,,fbaaebgcfe,",HIgaebdfgfccHIfbaaebgcfeHI,gaebdfgfccfbaaebgcfe,HIgaebdfgfccHIfbaaebgcfeHI,HI gaebdfgfcc HI fbaaebgcfe HI,gaebdfgfcc fbaaebgcfe,gaebdfgfcc fbaaebgcfe
2,",UFF,cfgdcfcfba,UFF,gceecddfcd,UFF",",,cfgdcfcfba,,gceecddfcd,",",,cfgdcfcfba,,gceecddfcd,",UFFcfgdcfcfbaUFFgceecddfcdUFF,cfgdcfcfbagceecddfcd,UFFcfgdcfcfbaUFFgceecddfcdUFF,UFF cfgdcfcfba UFF gceecddfcd UFF,cfgdcfcfba gceecddfcd,cfgdcfcfba gceecddfcd
3,",HI,gabdgdaefb,HI,bdgfbccgee,HI",",,gabdgdaefb,,bdgfbccgee,",",,gabdgdaefb,,bdgfbccgee,",HIgabdgdaefbHIbdgfbccgeeHI,gabdgdaefbbdgfbccgee,HIgabdgdaefbHIbdgfbccgeeHI,HI gabdgdaefb HI bdgfbccgee HI,gabdgdaefb bdgfbccgee,gabdgdaefb bdgfbccgee
4,",HI,cgfcffbgeg,HI,dddebgfffd,HI",",,cgfcffbgeg,,dddebgfffd,",",,cgfcffbgeg,,dddebgfffd,",HIcgfcffbgegHIdddebgfffdHI,cgfcffbgegdddebgfffd,HIcgfcffbgegHIdddebgfffdHI,HI cgfcffbgeg HI dddebgfffd HI,cgfcffbgeg dddebgfffd,cgfcffbgeg dddebgfffd


time: 949 ms


## replace_str()

In [10]:
# Show signature and docstring of replace_str (IPython help syntax)
?pak.replace_str

time: 27.2 ms


Signature: pak.replace_str(series, replace_dict)
Docstring:
Replaces substrings from a Series of strings according to a dict.
* replace_dict: Example {'President Trump':'Trump', 'HELLO':'Hello'}
File:      /media/me/DATA/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/string.py
Type:      function


In [11]:
# Small example frame: one sentence (column A) and a running number (column B) per row
rows = [
    ('The sun is bright', 1),
    ('The night is dark', 2),
]
df = pak.dataframe(rows)
df

Unnamed: 0,A,B
0,The sun is bright,1
1,The night is dark,2


time: 76.9 ms


In [15]:
# Mapping {substring to find: replacement}. replace_str is documented to take
# a dict, so keep `translate` a dict; previously it was overwritten with a
# list of tuples and the dict was only displayed, never assigned.
translate = {'The': 'THE', 'sun': 'SUN', 'dark': 'DARK'}
translate

{'The': 'THE', 'sun': 'SUN', 'dark': 'DARK'}

time: 27 ms


In [13]:
# Preview only: the result is displayed but not assigned, df stays unchanged.
# NOTE(review): the stored output of this cell ('HI') does not look like the
# result of this call -- re-run the notebook top to bottom to refresh it.
pak.replace_str( df.A, translate)

##


'HI'

time: 35.3 ms


In [14]:
# Write the result of replace_str back into column A and show the frame
df['A'] = pak.replace_str( df.A, translate)
df

##


Unnamed: 0,A,B
0,HI,1
1,HI,2


time: 34.6 ms
