In [1]:
import re

### Examples of Search Patterns

In [2]:
# "^Pl" is anchored to the start of the string and "ion$" to the end;
# "|" accepts whichever alternative matches first while scanning left to right.
m = re.search("^Pl|ion$", "Pleural Effusion")

# re.search returns None when nothing matches, so guard before reading the match
print(m.group(0) if m else None)

Pl


In [3]:
# The string does not start with "Sa", so only the "ion$" alternative can match.
m = re.search("^Sa|ion$", "Pleural Effusion")

# Print the matched text, or None if there was no match
print(m.group(0) if m else None)

ion


In [4]:
# "^" anchors the pattern to the start of the string; "Eff" occurs only in the
# middle of "Pleural Effusion", so there is no match.
m = re.search("^Eff", "Pleural Effusion")

# Print the matched text, or None if there was no match
print(m.group() if m else None)

None


### Characters in a Set [a-zA-Z]

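**Look Behind**: `(?<=...)` matches only where the text just before the current position matches `...`; that preceding text is not included in the returned match.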

In [5]:
# Without a lookbehind, the letter in front of "123" is part of the match.
m = re.search('[a-zA-Z]123', "99C123")
print(m.group(0))

C123


In [6]:
# Match 123 preceded by a letter, exclude the letter from the returned match.
m = re.search('(?<=[a-zA-Z])123', "99C12399")
print(m.group(0))

123


In [7]:
# Without a lookahead, the letter after "123" is part of the match.
m = re.search('123[a-zA-Z]', "99123C99")
print(m.group(0))

123C


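**Look Ahead**: `(?=...)` matches only where the text just after the current position matches `...`; that following text is not included in the returned match.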

In [8]:
# The lookahead requires a letter right after "123" but does not consume it,
# so the returned match is just "123".
m = re.search('123(?=[a-zA-Z])', "99123C99")
print(m.group(0))

123


### String Cleaning.

In [9]:
# Sample sentence with leading whitespace, mixed case, a comma with no space
# after it, repeated spaces, "and/or", a "/" between words and a doubled period.
sentence = (
    "     BIBASILAR OPACITIES,likely representing bilateral pleural effusions "
    "with ATELECTASIS   and/or PNEUMONIA/bronchopneumonia.."
)

In [10]:
# Convert the whole sentence to lowercase
sentence = sentence.lower()
sentence

'     bibasilar opacities,likely representing bilateral pleural effusions with atelectasis   and/or pneumonia/bronchopneumonia..'

In [11]:
# Replace every occurrence of the literal text "and/or" with "or"
sentence = re.sub('and/or', 'or', sentence)
sentence

'     bibasilar opacities,likely representing bilateral pleural effusions with atelectasis   or pneumonia/bronchopneumonia..'

In [12]:
# Replace a "/" that sits directly between two letters with " or ".
# The lookbehind and lookahead only check the neighbouring letters without
# consuming them, so the letters themselves are left in place.
sentence = re.sub('(?<=[a-zA-Z])/(?=[a-zA-Z])', ' or ', sentence)
sentence

'     bibasilar opacities,likely representing bilateral pleural effusions with atelectasis   or pneumonia or bronchopneumonia..'

In [13]:
# Replace .. with . using re.sub (option 1)
# The pattern is a raw string so that "\." reaches the regex engine as an escaped
# (literal) period; in a normal string literal "\." is an invalid escape sequence
# and makes Python emit a warning.
tmp1 = re.sub(r"\.\.", ".", sentence)
print(tmp1)

# Replace .. with . using string.replace (option 2)
# No escaping is needed here because str.replace treats its argument as literal text.
tmp2 = sentence.replace('..','.')
print(tmp2)

     bibasilar opacities,likely representing bilateral pleural effusions with atelectasis   or pneumonia or bronchopneumonia.
     bibasilar opacities,likely representing bilateral pleural effusions with atelectasis   or pneumonia or bronchopneumonia.


In [14]:
# Apply the replacement to the working sentence: ".." becomes "."
sentence = sentence.replace("..", ".")
sentence

'     bibasilar opacities,likely representing bilateral pleural effusions with atelectasis   or pneumonia or bronchopneumonia.'

In [15]:
# Map each character to its replacement: '!' becomes '!!!' and 'z' becomes 's'
translation_dict = {'!': '!!!', 'z': 's'}
print(translation_dict)

# Build the translation table; str.maketrans converts each single-character
# key into its Unicode code point
translation_tbl = str.maketrans(translation_dict)
print(translation_tbl)

{'!': '!!!', 'z': 's'}
{33: '!!!', 122: 's'}


In [16]:
# Choose a string to be translated
tmp_str = "colonization, realization, organization!"
print(tmp_str)

# Translate the string using the translation table:
# each character found in the table is swapped for its replacement,
# every other character is copied unchanged
tmp_str2 = tmp_str.translate(translation_tbl)
print(tmp_str2)

colonization, realization, organization!
colonisation, realisation, organisation!!!


In [17]:
# Create the translation dictionary using a dictionary comprehension:
# each of '.' and ',' maps to itself followed by a single space
translation_dict = {key: f"{key} " for key in ".,"}

# View the translation dictionary
display(translation_dict)

# View the translation dictionary with some formatting for easier reading
# Use vertical bars to help see the whitespace more easily.
for key, val in translation_dict.items():
    print(f"key: |{key}| \tval:|{val}|")

{'.': '. ', ',': ', '}

key: |.| 	val:|. |
key: |,| 	val:|, |


In [18]:
# Create the translation table using the translation dictionary
punctuation_spacer = str.maketrans(translation_dict)

# Apply the translation table to add whitespace after punctuation.
# Every '.' and ',' gets a trailing space, including the final period.
sentence = sentence.translate(punctuation_spacer)
sentence

'     bibasilar opacities, likely representing bilateral pleural effusions with atelectasis   or pneumonia or bronchopneumonia. '

In [19]:
# Split the string using whitespace as the delimiter.
# With no arguments, split() treats any run of whitespace as one separator
# and drops leading and trailing whitespace, so no empty tokens are produced.
sentence_list = sentence.split()
sentence_list

['bibasilar',
 'opacities,',
 'likely',
 'representing',
 'bilateral',
 'pleural',
 'effusions',
 'with',
 'atelectasis',
 'or',
 'pneumonia',
 'or',
 'bronchopneumonia.']

In [20]:
# Join the tokens with a single whitespace.
# This ensures that there is exactly one space between words and
# no whitespace at the start or end of the sentence.
sentence = ' '.join(sentence_list)
sentence

'bibasilar opacities, likely representing bilateral pleural effusions with atelectasis or pneumonia or bronchopneumonia.'

In [21]:
def clean(sentence):
    """Normalize a raw sentence into lowercase, consistently spaced text.

    Steps, applied in order:
      1. Lowercase the text.
      2. Replace "and/or" with "or".
      3. Replace a "/" directly between two letters with " or ".
      4. Collapse any run of two or more periods into a single period.
      5. Append a space after every "." and ",".
      6. Collapse every run of whitespace to one space and strip both ends.

    Args:
        sentence (str): The raw sentence.

    Returns:
        str: The cleaned sentence.

    Note:
        Step 5 applies to every period and comma, including those inside
        numbers, so "1.5" comes out as "1. 5".
    """
    lower_sentence = sentence.lower()
    corrected_sentence = re.sub('and/or', 'or', lower_sentence)
    corrected_sentence = re.sub('(?<=[a-zA-Z])/(?=[a-zA-Z])', ' or ', corrected_sentence)
    # Collapse runs of any length: a plain replace("..", ".") turns "..." into
    # "..", which the spacing step below would then render as ". .".
    clean_sentence = re.sub(r'\.{2,}', '.', corrected_sentence)
    punctuation_spacer = str.maketrans({key: f"{key} " for key in ".,"})
    clean_sentence = clean_sentence.translate(punctuation_spacer)
    # split() + join() squeezes repeated whitespace and trims the ends,
    # including the space appended after a sentence-final period.
    clean_sentence = ' '.join(clean_sentence.split())
    return clean_sentence

# Raw example sentences: mixed case, missing spaces after punctuation,
# repeated spaces, "and/or", "/" between words and doubled periods.
sentences = [
    "     BIBASILAR OPACITIES,likely representing bilateral pleural effusions with ATELECTASIS   and/or PNEUMONIA..",
    "Small left pleural effusion/decreased lung volumes bilaterally.left RetroCardiac Atelectasis.",
    "PA  and lateral views of the chest demonstrate   clear lungs,with NO focal air space opacity and/or pleural effusion.",
    "worrisome nodule in the Right Upper  lobe.CANNOT exclude neoplasm..",
]

# Show each raw sentence next to its cleaned version, numbered from 1
for n, sentence in enumerate(sentences):
    print(
        "\n##########################\n",
        f"Sentence number: {n+1}",
        f"Raw sentence: \n{sentence}",
        f"Cleaned sentence: \n{clean(sentence)}",
        sep="\n",
    )


##########################

Sentence number: 1
Raw sentence: 
     BIBASILAR OPACITIES,likely representing bilateral pleural effusions with ATELECTASIS   and/or PNEUMONIA..
Cleaned sentence: 
bibasilar opacities, likely representing bilateral pleural effusions with atelectasis or pneumonia.

##########################

Sentence number: 2
Raw sentence: 
Small left pleural effusion/decreased lung volumes bilaterally.left RetroCardiac Atelectasis.
Cleaned sentence: 
small left pleural effusion or decreased lung volumes bilaterally. left retrocardiac atelectasis.

##########################

Sentence number: 3
Raw sentence: 
PA  and lateral views of the chest demonstrate   clear lungs,with NO focal air space opacity and/or pleural effusion.
Cleaned sentence: 
pa and lateral views of the chest demonstrate clear lungs, with no focal air space opacity or pleural effusion.

##########################

Sentence number: 4
Raw sentence: 
worrisome nodule in the Right Upper  lobe.CANNOT exclude n