# DATA EXPLORATION

Import libraries

In [54]:
import pandas as pd
import nltk

# For stop word filtering
#nltk.download("stopwords")
from nltk.corpus import stopwords

from nltk.tokenize import RegexpTokenizer, word_tokenize
#nltk.download('punkt_tab')

from nltk.stem import WordNetLemmatizer

Load input files

In [28]:
# Input file locations. These live under their own names so that a path
# string is never shadowed by the DataFrame that is loaded from it.
TRAIN_PATH = "../data/medical_tc_train.csv"
TEST_PATH = "../data/medical_tc_test.csv"
LABELS_PATH = "../data/medical_tc_labels.csv"

# Load the train data, the test data and the label lookup table.
# NOTE(review): the rendered preview shows an "Unnamed: 0" column, which
# suggests the CSVs were written with their index; consider index_col=0 if
# that column is not needed downstream - confirm before changing.
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
labels_df = pd.read_csv(LABELS_PATH)

# Preview the first 10 rows of the test data
test_df.head(10)

Unnamed: 0,condition_label,medical_abstract
0,3,Obstructive sleep apnea following topical orop...
1,5,Neutrophil function and pyogenic infections in...
2,5,A phase II study of combined methotrexate and ...
3,1,Flow cytometric DNA analysis of parathyroid tu...
4,4,Paraneoplastic vasculitic neuropathy: a treata...
5,1,Treatment of childhood angiomatous diseases wi...
6,1,Expression of major histocompatibility complex...
7,1,Questionable role of CNS radioprophylaxis in t...
8,5,Reversibility of hepatic fibrosis in experimen...
9,2,Current status of duplex Doppler ultrasound in...


Add the label names to the train data for later visualization

In [35]:
# Attach the human-readable condition name to every training row.
# The guard keeps this cell idempotent: without it, re-running the cell would
# merge a second time and produce suffixed duplicates
# (condition_name_x / condition_name_y).
if "condition_name" not in train_df.columns:
    train_df = pd.merge(train_df, labels_df, how = "left", on = "condition_label")

# Visualize top rows
train_df.head()

Unnamed: 0,condition_label,medical_abstract,condition_name
0,5,Tissue changes around loose prostheses. A cani...,general pathological conditions
1,1,Neuropeptide Y and neuron-specific enolase lev...,neoplasms
2,2,"Sexually transmitted diseases of the colon, re...",digestive system diseases
3,1,Lipolytic factors associated with murine and h...,neoplasms
4,3,Does carotid restenosis predict an increased r...,nervous system diseases


## 1. Text Processing
### 1.1. Lower text and remove stopwords / punctuation
Stop words appear frequently in all kinds of text since they contribute to sentence structure and meaning. However, they should be removed (depending on the analysis) to reduce the noise they might introduce. Removing them also reduces the amount of text to process (data size reduction) and improves performance. In addition, to make the text more homogeneous, it is necessary to lowercase all the text and remove punctuation.

In [None]:
# English stop-word vocabulary, stored as a set for fast membership tests
stop_words = {word for word in stopwords.words("english")}

Define function to remove stop words from data

In [55]:
def remove_stopwords_punctutation(data_df):
    """Return a copy of ``data_df`` with a cleaned "medical_abstract" column.

    Each string cell of the "medical_abstract" column is lower-cased, split
    into word tokens with the regular expression ``\\w+`` (which discards
    punctuation), filtered against the module-level ``stop_words`` collection
    and re-joined with single spaces. Cells that are not strings (for example
    missing values) are left unchanged.

    The input frame is NOT modified: the cleaning is applied to a copy, so the
    raw text stays available to the caller and re-running the calling cell
    always starts from the same input.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame containing a "medical_abstract" column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``data_df`` in which
        the "medical_abstract" column holds the cleaned text.

    Raises
    ------
    KeyError
        If ``data_df`` has no "medical_abstract" column.
    """
    # Define tokenizer (built once and reused for every row)
    tokenizer = RegexpTokenizer(r"\w+")

    def _clean_text(text):
        """Lower-case, tokenize, drop stop words and re-join one cell."""
        # Leave non-text cells (e.g. NaN) exactly as they are
        if not isinstance(text, str):
            return text

        # Lower and tokenize text
        tokens = tokenizer.tokenize(text.lower())

        # Remove stop words and detokenize
        return " ".join(token for token in tokens if token not in stop_words)

    # Work on a copy so the caller's DataFrame is never mutated
    cleaned_df = data_df.copy()

    # Element-wise map instead of iterrows + .loc: faster, and it does not
    # write to several rows at once when index labels are duplicated
    cleaned_df["medical_abstract"] = cleaned_df["medical_abstract"].map(_clean_text)

    # Return cleaned data
    return cleaned_df

Clean text

In [57]:
# Clean the training abstracts
train_clean_df = remove_stopwords_punctutation(train_df)

# Clean the test abstracts; the label column is dropped first
test_clean_df = remove_stopwords_punctutation(
    test_df.drop("condition_label", axis = "columns")
)

# Preview the cleaned test data
test_clean_df.head()

Unnamed: 0,medical_abstract
0,obstructive sleep apnea following topical orop...
1,neutrophil function pyogenic infections bone m...
2,phase ii study combined methotrexate teniposid...
3,flow cytometric dna analysis parathyroid tumor...
4,paraneoplastic vasculitic neuropathy treatable...


### 1.2. Lemmatize
Lemmatizing reduce the words to their core meaning. For example, for words such as "do", "doing" and "done", lemmatizing will substitute them by "do", their core word. This process again reduces the amount of text to process (amount of total unique words), mak