# Text Processing Pipeline

In [None]:
import nltk
import re

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string as st

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Fetch every NLTK corpus and model. This may take a while...
# NOTE(review): the cells visible in this notebook only use the English
# stopword list and the POS tagger, so downloading just those resources would
# likely be enough — confirm the resource names for the installed NLTK version.
nltk.download('all')

In [None]:
# sheet_name=None makes read_excel load every worksheet and return a dict
# that maps each sheet name to its own DataFrame.
dfs = pd.read_excel('../data/loinc_dataset-v2_1.xlsx', sheet_name=None)

# Concatenate all sheets into a single dataframe; ignore_index=True renumbers
# the rows 0..n-1 instead of keeping each sheet's own index.
df = pd.concat(dfs.values(), ignore_index=True)

# Preview the first rows of the combined frame.
df.head()

In [None]:
# (number of rows, number of columns) of the combined frame.
df.shape

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Build one free-text field per row from the descriptive columns.
#
# * Every operand is a real column of ``df``; the earlier version added the
#   list literal ['property'] instead of df['property'].
# * Missing values become '' first, so a single NaN no longer turns the whole
#   combined string into NaN (which would break the string cleaning below).
# * Parts are joined with a space so the last word of one column is not fused
#   with the first word of the next; later tokenization splits on whitespace.
text_columns = ['long_common_name', 'component', 'system', 'property']
text_parts = [df[column].fillna('').astype(str) for column in text_columns]
df['combined_col'] = text_parts[0].str.cat(text_parts[1:], sep=' ')

# Text cleaning and processing steps

In [None]:
# Strip punctuation characters from a piece of text
def remove_punct(text):
    """Return *text* with every character listed in ``string.punctuation`` dropped.

    The remaining characters keep their original order; nothing is inserted in
    place of a removed character.
    """
    kept = []
    for char in text:
        if char in st.punctuation:
            continue
        kept.append(char)
    return ''.join(kept)

In [None]:
# Apply remove_punct to every combined string and store the result in a new column.
df['removed_punc'] = df['combined_col'].apply(remove_punct)
df.head()

In [None]:
# Lower-case the punctuation-free text and split it on runs of whitespace,
# giving one list of tokens per row.
# NOTE(review): text that starts or ends with whitespace would presumably
# yield an empty-string token here — confirm whether that matters downstream.
df['tokens'] = df['removed_punc'].str.lower().str.split(r'\s+')
df.head()

In [None]:
_ENGLISH_STOPWORDS = None  # lazily-filled cache of NLTK's English stopword list


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def remove_stopwords(text, stopwords=None):
    """Return the tokens of *text* that are not stopwords.

    Args:
        text: Iterable of tokens (e.g. the list produced by tokenization).
        stopwords: Optional iterable of words to filter out. When omitted, the
            NLTK English stopword corpus is used. Pass a customized,
            user-defined collection to limit the matches in the input text.

    Returns:
        A list with the tokens of *text*, in order, minus the stopwords.
    """
    # Look the stopwords up once per call (and load the NLTK list only once
    # overall) rather than re-reading the corpus for every single token.
    if stopwords is None:
        stop = _english_stopwords()
    else:
        stop = frozenset(stopwords)
    return [word for word in text if word not in stop]

In [None]:
# Filter the stopwords out of every row's token list.
df['clean_tokens'] = df['tokens'].apply(remove_stopwords)
df.head()

Let us now annotate each token in a document with its part-of-speech (POS) tag. Note that the tagger requires tokenized FULL sentences, so it is applied to `tokens` (before stopword removal) rather than to `clean_tokens`.

In [None]:
# Tag the unfiltered token lists (stopwords still present); nltk.pos_tag
# returns a list of (token, tag) pairs for each row.
df['pos_tag'] = df['tokens'].apply(nltk.pos_tag)
df.head()

In [None]:
# Rebuild plain text from a token list so it can be used as vectorizer input
def return_sentences(tokens):
    """Join *tokens* into a single string, separated by single spaces."""
    separator = " "
    return separator.join(tokens)

In [None]:
# Rebuild one string per row from the stopword-free tokens, prefixed with the
# row's loinc_num and a space.
# NOTE(review): the identifier therefore becomes part of the text that is
# vectorized below — confirm this is intended.
df['clean_text'] = df['loinc_num'] + ' ' + df['clean_tokens'].apply(return_sentences)
df.head()

### TF-IDF : Term Frequency - Inverse Document Frequency

In [None]:
# Convert the cleaned text to Tf-Idf feature vectors.
# NOTE(review): no lemmatization step is visible in this part of the notebook;
# the input is the stopword-filtered text built above.

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vocabulary on the whole corpus and transform it in one step.
tfidf = TfidfVectorizer()
tfidf_vect = tfidf.fit_transform(df['clean_text'])
# (number of documents, vocabulary size)
tfidf_vect.shape

In [None]:
# Keep a single row per loinc_num (the first occurrence is retained by default).
df_d = df.drop_duplicates(['loinc_num'])
# Number of unique rows; used below as the size of each per-query block.
coef = df_d.shape[0]

In [None]:
# Stack one copy of the de-duplicated frame per query, then label each
# consecutive block of `coef` rows with the query it belongs to.
queries = ('glucose in blood', 'bilirubin in plasma', 'white blood cells count')
df_t = pd.concat([df_d] * len(queries), ignore_index=True)
df_t['query'] = ''
for block_no, query_text in enumerate(queries):
    # Rows block_no*coef .. (block_no+1)*coef - 1 form this query's copy.
    in_block = df_t.index.isin(range(block_no * coef, (block_no + 1) * coef))
    df_t.loc[in_block, 'query'] = query_text

In [None]:
# Sanity check: number of rows assigned to each query string.
df_t['query'].value_counts()

In [None]:
# Fill the 'rank' column with a random 0/1 value for every row.
# A fixed seed keeps the generated toy dataset identical between notebook
# runs, so the CSV written afterwards is reproducible.
rng = np.random.default_rng(seed=42)
df_t['rank'] = rng.integers(2, size=len(df_t))

In [None]:
# Persist the toy dataset as CSV.
# NOTE(review): the index is not disabled here, so it is written as the first
# (unnamed) column — confirm that downstream readers expect it.
df_t.to_csv('../data/preprocessed_toy_dataset.csv')