# Text Processing Pipeline

This notebook contains the functions needed for a text cleaning and processing pipeline in NLP problems.
These are ready-to-use functions built on the NLTK and scikit-learn packages.

We first install all required packages:

We now import the NLTK library and download all the supplementary data (note that this may take a fair amount of time!)

In [1]:
import nltk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
#nltk.download('all')

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string as st
import re
from nltk import PorterStemmer, WordNetLemmatizer

# Input data files are available in the read-only "./input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk ./input recursively and print the full path of every file found
for dirname, _, filenames in os.walk('./input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# Read the data. The source is an Excel workbook (.xlsx), not a CSV file.
# sheet_name=None asks pandas for every sheet; the result is consumed below via .values().
dfs= pd.read_excel('../data/loinc_dataset-v2_1.xlsx', sheet_name=None)

# Concatenate all sheets into a single DataFrame, renumbering the index
df = pd.concat(dfs.values(), ignore_index=True)

df

Unnamed: 0,loinc_num,long_common_name,component,system,property
0,1988-5,C reactive protein [Mass/volume] in Serum or P...,C reactive protein,Ser/Plas,MCnc
1,1959-6,Bicarbonate [Moles/volume] in Blood,Bicarbonate,Bld,SCnc
2,10331-7,Rh [Type] in Blood,Rh,Bld,Type
3,18998-5,Trimethoprim+Sulfamethoxazole [Susceptibility],Trimethoprim+Sulfamethoxazole,Isolate,Susc
4,1975-2,Bilirubin.total [Mass/volume] in Serum or Plasma,Bilirubin,Ser/Plas,MCnc
...,...,...,...,...,...
275,6290-1,Complement membrane C3b-C4b cofactor protein [...,Complement membrane C3b-C4b cofactor protein,WBC,PrThr
276,6294-3,Deprecated Complement C3d-C3d+Gg-IC3b receptor...,Complement C3d-C3d+Gg-IC3b receptors,WBC,EntNum
277,4496-6,Deprecated Complement C3d-C3d+Gg-IC3b receptor...,Complement C3d-C3d+Gg-IC3b receptors,WBC,PrThr
278,4525-2,Complement membrane C3b-C4b cofactor protein [...,Complement membrane C3b-C4b cofactor protein,WBC,EntNum


In [4]:
# Inspect the dimensions (rows, columns) of the combined DataFrame
df.shape

(280, 5)

In [5]:
# Count the missing values in each column
df.isna().sum()

loinc_num           0
long_common_name    0
component           0
system              0
property            0
dtype: int64

In [6]:
def join_columns(row):
    """Concatenate the values of ``row`` into one space-separated string.

    Args:
        row: Iterable of strings. In this notebook it is one DataFrame row,
            supplied by ``DataFrame.apply(..., axis=1)``.

    Returns:
        str: The values joined by single spaces, in their original order.
    """
    separator = ' '
    return separator.join(row)

In [7]:
# Merge the four descriptive text fields of every row into one string column
text_columns = ['long_common_name', 'component', 'system', 'property']
df['combined_col'] = df[text_columns].apply(join_columns, axis=1)

In [8]:
# The individual text fields are now contained in combined_col, so drop them
df = df.drop(columns=["long_common_name", "component", "system", "property"])
df.head()

Unnamed: 0,loinc_num,combined_col
0,1988-5,C reactive protein [Mass/volume] in Serum or P...
1,1959-6,Bicarbonate [Moles/volume] in Blood Bicarbonat...
2,10331-7,Rh [Type] in Blood Rh Bld Type
3,18998-5,Trimethoprim+Sulfamethoxazole [Susceptibility]...
4,1975-2,Bilirubin.total [Mass/volume] in Serum or Plas...


In [9]:
# Add three binary columns filled with random 0/1 values.
# The seed is fixed so the same values are drawn on every run; the columns are
# filled in this exact order because each draw advances the global NumPy RNG.
np.random.seed(123)
for label in ('glucose', 'bilirubin', 'white_blood'):
    df[label] = np.random.randint(2, size=len(df))
df.head(20)

Unnamed: 0,loinc_num,combined_col,glucose,bilirubin,white_blood
0,1988-5,C reactive protein [Mass/volume] in Serum or P...,0,0,0
1,1959-6,Bicarbonate [Moles/volume] in Blood Bicarbonat...,1,0,0
2,10331-7,Rh [Type] in Blood Rh Bld Type,0,1,0
3,18998-5,Trimethoprim+Sulfamethoxazole [Susceptibility]...,0,0,0
4,1975-2,Bilirubin.total [Mass/volume] in Serum or Plas...,0,0,1
5,890-4,Blood group antibody screen [Presence] in Seru...,0,0,1
6,20565-8,"Carbon dioxide, total [Moles/volume] in Blood ...",0,1,0
7,18906-8,Ciprofloxacin [Susceptibility] Ciprofloxacin I...,1,0,1
8,2143-6,Cortisol [Mass/volume] in Serum or Plasma Cort...,1,1,1
9,2075-0,Chloride [Moles/volume] in Serum or Plasma Chl...,0,1,1


# Text cleaning and processing steps
* Remove punctuations
* Convert text to tokens
* Remove tokens of length less than or equal to 3
* Remove stopwords using NLTK corpus stopwords list to match
* Apply stemming
* Apply lemmatization
* Convert words to feature vectors

In [9]:
# Remove all punctuations from the text

def remove_punct(text, replacement=""):
    """Strip every ``string.punctuation`` character from ``text``.

    By default punctuation is deleted outright, which fuses the words on
    either side of it (e.g. ``"Mass/volume"`` becomes ``"Massvolume"``).
    Pass ``replacement=" "`` to keep such words apart instead.

    Args:
        text: Input string.
        replacement: String substituted for each punctuation character.
            Defaults to ``""`` (plain removal).

    Returns:
        str: ``text`` with each punctuation character replaced.
    """
    return "".join(replacement if ch in st.punctuation else ch for ch in text)

In [10]:
# Strip punctuation from the combined text of every row
df['removed_punc'] = df['combined_col'].apply(remove_punct)
df.head()

Unnamed: 0,loinc_num,combined_col,removed_punc
0,1988-5,C reactive protein [Mass/volume] in Serum or P...,C reactive protein Massvolume in Serum or Plas...
1,1959-6,Bicarbonate [Moles/volume] in Blood Bicarbonat...,Bicarbonate Molesvolume in Blood Bicarbonate B...
2,10331-7,Rh [Type] in Blood Rh Bld Type,Rh Type in Blood Rh Bld Type
3,18998-5,Trimethoprim+Sulfamethoxazole [Susceptibility]...,TrimethoprimSulfamethoxazole Susceptibility Tr...
4,1975-2,Bilirubin.total [Mass/volume] in Serum or Plas...,Bilirubintotal Massvolume in Serum or Plasma B...


In [11]:
def tokenize(text):
    """Convert text to lower-case tokens.

    The text is split on runs of whitespace. The same approach could split on
    special characters, tabs or any other string that separates tokens.
    Empty strings produced by leading or trailing whitespace are discarded.

    Args:
        text: Input string.

    Returns:
        list[str]: Lower-cased, non-empty tokens in their original order.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    return [token.lower() for token in re.split(r'\s+', text) if token]

In [13]:
# Split the punctuation-free text of every row into lower-case tokens
df['tokens'] = df['removed_punc'].apply(tokenize)
df.head()

Unnamed: 0,loinc_num,combined_col,removed_punc,tokens
0,1988-5,C reactive protein [Mass/volume] in Serum or P...,C reactive protein Massvolume in Serum or Plas...,"[c, reactive, protein, massvolume, in, serum, ..."
1,1959-6,Bicarbonate [Moles/volume] in Blood Bicarbonat...,Bicarbonate Molesvolume in Blood Bicarbonate B...,"[bicarbonate, molesvolume, in, blood, bicarbon..."
2,10331-7,Rh [Type] in Blood Rh Bld Type,Rh Type in Blood Rh Bld Type,"[rh, type, in, blood, rh, bld, type]"
3,18998-5,Trimethoprim+Sulfamethoxazole [Susceptibility]...,TrimethoprimSulfamethoxazole Susceptibility Tr...,"[trimethoprimsulfamethoxazole, susceptibility,..."
4,1975-2,Bilirubin.total [Mass/volume] in Serum or Plas...,Bilirubintotal Massvolume in Serum or Plasma B...,"[bilirubintotal, massvolume, in, serum, or, pl..."


An alternate method to tokenizing that resorts to resources provided by NLTK

In [14]:
from nltk.tokenize import word_tokenize

# Lower-case the text, then tokenize it with NLTK.
# This overwrites the 'tokens' column produced by the regex-based tokenizer.
df['tokens'] = df['removed_punc'].str.lower().apply(word_tokenize)
df.head()

Unnamed: 0,loinc_num,combined_col,removed_punc,tokens
0,1988-5,C reactive protein [Mass/volume] in Serum or P...,C reactive protein Massvolume in Serum or Plas...,"[c, reactive, protein, massvolume, in, serum, ..."
1,1959-6,Bicarbonate [Moles/volume] in Blood Bicarbonat...,Bicarbonate Molesvolume in Blood Bicarbonate B...,"[bicarbonate, molesvolume, in, blood, bicarbon..."
2,10331-7,Rh [Type] in Blood Rh Bld Type,Rh Type in Blood Rh Bld Type,"[rh, type, in, blood, rh, bld, type]"
3,18998-5,Trimethoprim+Sulfamethoxazole [Susceptibility]...,TrimethoprimSulfamethoxazole Susceptibility Tr...,"[trimethoprimsulfamethoxazole, susceptibility,..."
4,1975-2,Bilirubin.total [Mass/volume] in Serum or Plas...,Bilirubintotal Massvolume in Serum or Plasma B...,"[bilirubintotal, massvolume, in, serum, or, pl..."


In [15]:
# Remove short tokens (by default, tokens of length 3 or less)

def remove_small_words(text, min_length=4):
    """Keep only the tokens that are at least ``min_length`` characters long.

    With the default, tokens of length 3 or less are dropped. This also
    removes short abbreviations such as ``rh`` and ``bld``; lower
    ``min_length`` to keep them.

    Args:
        text: Iterable of token strings.
        min_length: Minimum number of characters a token needs to be kept.
            Defaults to 4.

    Returns:
        list[str]: The surviving tokens in their original order.
    """
    return [token for token in text if len(token) >= min_length]

In [16]:
# Discard the short tokens of every row
df['larger_tokens'] = df['tokens'].apply(remove_small_words)
df.head()

Unnamed: 0,loinc_num,combined_col,removed_punc,tokens,larger_tokens
0,1988-5,C reactive protein [Mass/volume] in Serum or P...,C reactive protein Massvolume in Serum or Plas...,"[c, reactive, protein, massvolume, in, serum, ...","[reactive, protein, massvolume, serum, plasma,..."
1,1959-6,Bicarbonate [Moles/volume] in Blood Bicarbonat...,Bicarbonate Molesvolume in Blood Bicarbonate B...,"[bicarbonate, molesvolume, in, blood, bicarbon...","[bicarbonate, molesvolume, blood, bicarbonate,..."
2,10331-7,Rh [Type] in Blood Rh Bld Type,Rh Type in Blood Rh Bld Type,"[rh, type, in, blood, rh, bld, type]","[type, blood, type]"
3,18998-5,Trimethoprim+Sulfamethoxazole [Susceptibility]...,TrimethoprimSulfamethoxazole Susceptibility Tr...,"[trimethoprimsulfamethoxazole, susceptibility,...","[trimethoprimsulfamethoxazole, susceptibility,..."
4,1975-2,Bilirubin.total [Mass/volume] in Serum or Plas...,Bilirubintotal Massvolume in Serum or Plasma B...,"[bilirubintotal, massvolume, in, serum, or, pl...","[bilirubintotal, massvolume, serum, plasma, bi..."


In [17]:
def remove_stopwords(text, stop_words=None):
    """Remove stopwords from a list of tokens.

    By default the NLTK English stopword list is used. A customized,
    user-defined collection can be supplied through ``stop_words`` to control
    which tokens are dropped.

    Args:
        text: Iterable of token strings.
        stop_words: Optional iterable of words to remove. When ``None`` the
            NLTK English stopword corpus is loaded.

    Returns:
        list[str]: The tokens that are not stopwords, in their original order.
    """
    if stop_words is None:
        # Load the corpus once per call rather than once per token.
        stop_words = nltk.corpus.stopwords.words('english')
    # A set gives constant-time membership tests.
    stop_set = frozenset(stop_words)
    return [word for word in text if word not in stop_set]

In [18]:
# Filter the stopwords out of every row's tokens
df['clean_tokens'] = df['larger_tokens'].apply(remove_stopwords)
df.head()

Unnamed: 0,loinc_num,combined_col,removed_punc,tokens,larger_tokens,clean_tokens
0,1988-5,C reactive protein [Mass/volume] in Serum or P...,C reactive protein Massvolume in Serum or Plas...,"[c, reactive, protein, massvolume, in, serum, ...","[reactive, protein, massvolume, serum, plasma,...","[reactive, protein, massvolume, serum, plasma,..."
1,1959-6,Bicarbonate [Moles/volume] in Blood Bicarbonat...,Bicarbonate Molesvolume in Blood Bicarbonate B...,"[bicarbonate, molesvolume, in, blood, bicarbon...","[bicarbonate, molesvolume, blood, bicarbonate,...","[bicarbonate, molesvolume, blood, bicarbonate,..."
2,10331-7,Rh [Type] in Blood Rh Bld Type,Rh Type in Blood Rh Bld Type,"[rh, type, in, blood, rh, bld, type]","[type, blood, type]","[type, blood, type]"
3,18998-5,Trimethoprim+Sulfamethoxazole [Susceptibility]...,TrimethoprimSulfamethoxazole Susceptibility Tr...,"[trimethoprimsulfamethoxazole, susceptibility,...","[trimethoprimsulfamethoxazole, susceptibility,...","[trimethoprimsulfamethoxazole, susceptibility,..."
4,1975-2,Bilirubin.total [Mass/volume] in Serum or Plas...,Bilirubintotal Massvolume in Serum or Plasma B...,"[bilirubintotal, massvolume, in, serum, or, pl...","[bilirubintotal, massvolume, serum, plasma, bi...","[bilirubintotal, massvolume, serum, plasma, bi..."


### Apply stemming to convert tokens to their root form. This is a rule-based process of word form conversion where word-suffixes are truncated irrespective of whether the root word is an actual word in the language dictionary. 
##### Note that this step is optional and depends on problem type.


In [19]:
# Apply stemming to get root words
def stemming(text):
    """Reduce every token in ``text`` to its stem with NLTK's ``PorterStemmer``.

    Args:
        text: Iterable of token strings.

    Returns:
        list: One stemmed token per input token, in the same order.
    """
    stem = PorterStemmer().stem
    return list(map(stem, text))

In [20]:
# Stem the cleaned tokens of every row
df['stem_words'] = df['clean_tokens'].apply(stemming)
df.head()

Unnamed: 0,loinc_num,combined_col,removed_punc,tokens,larger_tokens,clean_tokens,stem_words
0,1988-5,C reactive protein [Mass/volume] in Serum or P...,C reactive protein Massvolume in Serum or Plas...,"[c, reactive, protein, massvolume, in, serum, ...","[reactive, protein, massvolume, serum, plasma,...","[reactive, protein, massvolume, serum, plasma,...","[reactiv, protein, massvolum, serum, plasma, r..."
1,1959-6,Bicarbonate [Moles/volume] in Blood Bicarbonat...,Bicarbonate Molesvolume in Blood Bicarbonate B...,"[bicarbonate, molesvolume, in, blood, bicarbon...","[bicarbonate, molesvolume, blood, bicarbonate,...","[bicarbonate, molesvolume, blood, bicarbonate,...","[bicarbon, molesvolum, blood, bicarbon, scnc]"
2,10331-7,Rh [Type] in Blood Rh Bld Type,Rh Type in Blood Rh Bld Type,"[rh, type, in, blood, rh, bld, type]","[type, blood, type]","[type, blood, type]","[type, blood, type]"
3,18998-5,Trimethoprim+Sulfamethoxazole [Susceptibility]...,TrimethoprimSulfamethoxazole Susceptibility Tr...,"[trimethoprimsulfamethoxazole, susceptibility,...","[trimethoprimsulfamethoxazole, susceptibility,...","[trimethoprimsulfamethoxazole, susceptibility,...","[trimethoprimsulfamethoxazol, suscept, trimeth..."
4,1975-2,Bilirubin.total [Mass/volume] in Serum or Plas...,Bilirubintotal Massvolume in Serum or Plasma B...,"[bilirubintotal, massvolume, in, serum, or, pl...","[bilirubintotal, massvolume, serum, plasma, bi...","[bilirubintotal, massvolume, serum, plasma, bi...","[bilirubintot, massvolum, serum, plasma, bilir..."


### Lemmatization converts a word to its dictionary base form. This process takes the language's grammar and vocabulary into consideration during conversion. Hence, it differs from stemming in that it does not merely truncate suffixes to get the root word.


In [22]:
# Apply lemmatization on tokens
def lemmatize(text):
    """Lemmatize every token in ``text`` with NLTK's ``WordNetLemmatizer``.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` without a
    part-of-speech argument, so the lemmatizer's default is used.

    Args:
        text: Iterable of token strings.

    Returns:
        list: One lemmatized token per input token, in the same order.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    return [to_lemma(token) for token in text]

In [23]:
# Lemmatize the cleaned (unstemmed) tokens of every row
df['lemma_words'] = df['clean_tokens'].apply(lemmatize)
df.head()

Unnamed: 0,loinc_num,combined_col,removed_punc,tokens,larger_tokens,clean_tokens,stem_words,lemma_words
0,1988-5,C reactive protein [Mass/volume] in Serum or P...,C reactive protein Massvolume in Serum or Plas...,"[c, reactive, protein, massvolume, in, serum, ...","[reactive, protein, massvolume, serum, plasma,...","[reactive, protein, massvolume, serum, plasma,...","[reactiv, protein, massvolum, serum, plasma, r...","[reactive, protein, massvolume, serum, plasma,..."
1,1959-6,Bicarbonate [Moles/volume] in Blood Bicarbonat...,Bicarbonate Molesvolume in Blood Bicarbonate B...,"[bicarbonate, molesvolume, in, blood, bicarbon...","[bicarbonate, molesvolume, blood, bicarbonate,...","[bicarbonate, molesvolume, blood, bicarbonate,...","[bicarbon, molesvolum, blood, bicarbon, scnc]","[bicarbonate, molesvolume, blood, bicarbonate,..."
2,10331-7,Rh [Type] in Blood Rh Bld Type,Rh Type in Blood Rh Bld Type,"[rh, type, in, blood, rh, bld, type]","[type, blood, type]","[type, blood, type]","[type, blood, type]","[type, blood, type]"
3,18998-5,Trimethoprim+Sulfamethoxazole [Susceptibility]...,TrimethoprimSulfamethoxazole Susceptibility Tr...,"[trimethoprimsulfamethoxazole, susceptibility,...","[trimethoprimsulfamethoxazole, susceptibility,...","[trimethoprimsulfamethoxazole, susceptibility,...","[trimethoprimsulfamethoxazol, suscept, trimeth...","[trimethoprimsulfamethoxazole, susceptibility,..."
4,1975-2,Bilirubin.total [Mass/volume] in Serum or Plas...,Bilirubintotal Massvolume in Serum or Plasma B...,"[bilirubintotal, massvolume, in, serum, or, pl...","[bilirubintotal, massvolume, serum, plasma, bi...","[bilirubintotal, massvolume, serum, plasma, bi...","[bilirubintot, massvolum, serum, plasma, bilir...","[bilirubintotal, massvolume, serum, plasma, bi..."


Let us now annotate each token in a document with its Part-Of-Speech tag (note that tokenized FULL sentences are required!)

In [24]:
# Annotate each word with its part-of-speech tag

def get_pos_tag(tokenized_sentence):
    """Tag every token of a tokenized sentence with its part of speech.

    Args:
        tokenized_sentence: List of token strings.

    Returns:
        The value returned by ``nltk.pos_tag``. In this notebook's output it
        is a list of ``(token, tag)`` pairs.
    """
    tagged_tokens = nltk.pos_tag(tokenized_sentence)
    return tagged_tokens

In [25]:
# Tag the full token list of every row (before short-token and stopword removal)
df['pos_tag'] = df['tokens'].apply(get_pos_tag)
df.head()

Unnamed: 0,loinc_num,combined_col,removed_punc,tokens,larger_tokens,clean_tokens,stem_words,lemma_words,pos_tag
0,1988-5,C reactive protein [Mass/volume] in Serum or P...,C reactive protein Massvolume in Serum or Plas...,"[c, reactive, protein, massvolume, in, serum, ...","[reactive, protein, massvolume, serum, plasma,...","[reactive, protein, massvolume, serum, plasma,...","[reactiv, protein, massvolum, serum, plasma, r...","[reactive, protein, massvolume, serum, plasma,...","[(c, NNS), (reactive, VBP), (protein, NN), (ma..."
1,1959-6,Bicarbonate [Moles/volume] in Blood Bicarbonat...,Bicarbonate Molesvolume in Blood Bicarbonate B...,"[bicarbonate, molesvolume, in, blood, bicarbon...","[bicarbonate, molesvolume, blood, bicarbonate,...","[bicarbonate, molesvolume, blood, bicarbonate,...","[bicarbon, molesvolum, blood, bicarbon, scnc]","[bicarbonate, molesvolume, blood, bicarbonate,...","[(bicarbonate, NN), (molesvolume, NN), (in, IN..."
2,10331-7,Rh [Type] in Blood Rh Bld Type,Rh Type in Blood Rh Bld Type,"[rh, type, in, blood, rh, bld, type]","[type, blood, type]","[type, blood, type]","[type, blood, type]","[type, blood, type]","[(rh, NN), (type, NN), (in, IN), (blood, NN), ..."
3,18998-5,Trimethoprim+Sulfamethoxazole [Susceptibility]...,TrimethoprimSulfamethoxazole Susceptibility Tr...,"[trimethoprimsulfamethoxazole, susceptibility,...","[trimethoprimsulfamethoxazole, susceptibility,...","[trimethoprimsulfamethoxazole, susceptibility,...","[trimethoprimsulfamethoxazol, suscept, trimeth...","[trimethoprimsulfamethoxazole, susceptibility,...","[(trimethoprimsulfamethoxazole, JJ), (suscepti..."
4,1975-2,Bilirubin.total [Mass/volume] in Serum or Plas...,Bilirubintotal Massvolume in Serum or Plasma B...,"[bilirubintotal, massvolume, in, serum, or, pl...","[bilirubintotal, massvolume, serum, plasma, bi...","[bilirubintotal, massvolume, serum, plasma, bi...","[bilirubintot, massvolum, serum, plasma, bilir...","[bilirubintotal, massvolume, serum, plasma, bi...","[(bilirubintotal, JJ), (massvolume, NN), (in, ..."


In [26]:
# Rebuild a plain-text sentence from tokens, for use as vectorizer input

def return_sentences(tokens):
    """Join ``tokens`` into one string separated by single spaces.

    Args:
        tokens: Iterable of token strings.

    Returns:
        str: The tokens joined by spaces (empty string for no tokens).
    """
    sentence = " ".join(tokens)
    return sentence

In [27]:
# Turn every row's lemmatized tokens back into a single text string
df['clean_text'] = df['lemma_words'].apply(return_sentences)
df.head()

Unnamed: 0,loinc_num,combined_col,removed_punc,tokens,larger_tokens,clean_tokens,stem_words,lemma_words,pos_tag,clean_text
0,1988-5,C reactive protein [Mass/volume] in Serum or P...,C reactive protein Massvolume in Serum or Plas...,"[c, reactive, protein, massvolume, in, serum, ...","[reactive, protein, massvolume, serum, plasma,...","[reactive, protein, massvolume, serum, plasma,...","[reactiv, protein, massvolum, serum, plasma, r...","[reactive, protein, massvolume, serum, plasma,...","[(c, NNS), (reactive, VBP), (protein, NN), (ma...",reactive protein massvolume serum plasma react...
1,1959-6,Bicarbonate [Moles/volume] in Blood Bicarbonat...,Bicarbonate Molesvolume in Blood Bicarbonate B...,"[bicarbonate, molesvolume, in, blood, bicarbon...","[bicarbonate, molesvolume, blood, bicarbonate,...","[bicarbonate, molesvolume, blood, bicarbonate,...","[bicarbon, molesvolum, blood, bicarbon, scnc]","[bicarbonate, molesvolume, blood, bicarbonate,...","[(bicarbonate, NN), (molesvolume, NN), (in, IN...",bicarbonate molesvolume blood bicarbonate scnc
2,10331-7,Rh [Type] in Blood Rh Bld Type,Rh Type in Blood Rh Bld Type,"[rh, type, in, blood, rh, bld, type]","[type, blood, type]","[type, blood, type]","[type, blood, type]","[type, blood, type]","[(rh, NN), (type, NN), (in, IN), (blood, NN), ...",type blood type
3,18998-5,Trimethoprim+Sulfamethoxazole [Susceptibility]...,TrimethoprimSulfamethoxazole Susceptibility Tr...,"[trimethoprimsulfamethoxazole, susceptibility,...","[trimethoprimsulfamethoxazole, susceptibility,...","[trimethoprimsulfamethoxazole, susceptibility,...","[trimethoprimsulfamethoxazol, suscept, trimeth...","[trimethoprimsulfamethoxazole, susceptibility,...","[(trimethoprimsulfamethoxazole, JJ), (suscepti...",trimethoprimsulfamethoxazole susceptibility tr...
4,1975-2,Bilirubin.total [Mass/volume] in Serum or Plas...,Bilirubintotal Massvolume in Serum or Plasma B...,"[bilirubintotal, massvolume, in, serum, or, pl...","[bilirubintotal, massvolume, serum, plasma, bi...","[bilirubintotal, massvolume, serum, plasma, bi...","[bilirubintot, massvolum, serum, plasma, bilir...","[bilirubintotal, massvolume, serum, plasma, bi...","[(bilirubintotal, JJ), (massvolume, NN), (in, ...",bilirubintotal massvolume serum plasma bilirub...


### TF-IDF : Term Frequency - Inverse Document Frequency
#### The term frequency is the number of times a term occurs in a document. Inverse document frequency is an inverse function of the number of documents in which a given word occurs.
#### The product of these two terms gives the tf-idf weight of a word in a document. The more documents a word occurs in, the lower its weight, and vice versa. This gives more weight to rare terms in the corpus and penalizes more commonly occurring terms.
#### Other widely used vectorizer is Count vectorizer which only considers the frequency of occurrence of a word across the corpus.


In [28]:
# Convert lemmatized words to Tf-Idf feature vectors

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the cleaned text and transform it into TF-IDF features
tfidf = TfidfVectorizer()
tfidf_vect = tfidf.fit_transform(df['clean_text'])
# Show the shape of the resulting matrix.
# NOTE(review): the saved output reports 201 rows while df.shape earlier reports 280,
# so the recorded outputs appear to come from different runs. Re-run to confirm.
tfidf_vect.shape

(201, 135)

In [29]:
# Get feature names in the vector.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps the plain-list display this cell produced before.
tfidf.get_feature_names_out().tolist()



['aborh',
 'acnc',
 'activityvolume',
 'alanine',
 'albumin',
 'albuminglobulin',
 'alkaline',
 'aminotransferase',
 'ampicillin',
 'ampicillinsulbactam',
 'amplification',
 'amylase',
 'antibiotic',
 'antibody',
 'antiglobulin',
 'aspartate',
 'aureus',
 'aureusmethicillin',
 'bicarbonate',
 'bilirubin',
 'bilirubinatetotal',
 'bilirubindirect',
 'bilirubinglucuronidatedbilirubinalbumin',
 'bilirubinindirect',
 'bilirubinnonglucuronidated',
 'bilirubintotal',
 'bldbpu',
 'blood',
 'body',
 'bound',
 'calcium',
 'calciumcorrected',
 'calciumionized',
 'calculus',
 'carbon',
 'carcinoembryonic',
 'ccnc',
 'cefazolin',
 'cfst',
 'chloride',
 'cholesterol',
 'cholesterolin',
 'choriogonadotropin',
 'choriogonadotropinbeta',
 'ciprofloxacin',
 'cobalamin',
 'corrected',
 'cortisol',
 'crossmatch',
 'culture',
 'dioxide',
 'disposition',
 'dose',
 'enzymatic',
 'fasting',
 'fluid',
 'fraction',
 'gentamicin',
 'glucose',
 'glucose3h',
 'glucosepost',
 'group',
 'hematocrit',
 'hepatitis',
 