# Part 0: Import, Directory and Preference settings

In [1]:
# Import modules

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import random

In [2]:
# Set options

%matplotlib inline
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 30)
pd.options.display.max_columns=100
pd.options.display.max_rows=100

np.random.seed(42)

In [3]:
# ignore deprecation warnings in sklearn

import warnings
warnings.filterwarnings("ignore")

In [4]:
# Specify data directory

data_dir = os.path.join(os.path.dirname(os.getcwd()),'Data')

In [5]:
# Set data paths

data_path = os.path.join(data_dir, 'Data')

train_path = os.path.join(data_dir, 'train.csv')

holdout_path = os.path.join(data_dir, 'test.csv')

In [6]:
# Set model path

model_dir = os.path.join(os.path.dirname(os.getcwd()), 'Model')

In [7]:
# Read the dataset

train = pd.read_csv(train_path)
holdout = pd.read_csv(holdout_path)

# Part 1: Basic Information

In [8]:
# Print the dimensions of both sets and the split percentage of training set

print(train.shape)
print(holdout.shape)
print(len(train)/(len(holdout) + len(train)))

(5279, 4)
(2924, 3)
0.6435450444959161


In [9]:
# Print the info columns of both sets

print(train.info())
print(holdout.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5279 entries, 0 to 5278
Data columns (total 4 columns):
unique_hash    5279 non-null object
text           5279 non-null object
drug           5279 non-null object
sentiment      5279 non-null int64
dtypes: int64(1), object(3)
memory usage: 165.0+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2924 entries, 0 to 2923
Data columns (total 3 columns):
unique_hash    2924 non-null object
text           2924 non-null object
drug           2924 non-null object
dtypes: object(3)
memory usage: 68.6+ KB
None


In [10]:
# View 5 first rows in the train set

print(train.head())

                                unique_hash  \
0  2e180be4c9214c1f5ab51fd8cc32bc80c9f612e0   
1  9eba8f80e7e20f3a2f48685530748fbfa95943e4   
2  fe809672251f6bd0d986e00380f48d047c7e7b76   
3  bd22104dfa9ec80db4099523e03fae7a52735eb6   
4  b227688381f9b25e5b65109dd00f7f895e838249   

                                                text        drug  sentiment  
0  Autoimmune diseases tend to come in clusters. ...     gilenya          2  
1  I can completely understand why you’d want to ...     gilenya          2  
2  Interesting that it only targets S1P-1/5 recep...  fingolimod          2  
3  Very interesting, grand merci. Now I wonder wh...     ocrevus          2  
4  Hi everybody, My latest MRI results for Brain ...     gilenya          1  


In [11]:
# Consider the missingness of the data

print(np.sum(train.isna()))
print(np.sum(holdout.isna()))

unique_hash    0
text           0
drug           0
sentiment      0
dtype: int64
unique_hash    0
text           0
drug           0
dtype: int64


In [12]:
# Consider the classes of the values in each set

print(train.nunique())
print(holdout.nunique())

unique_hash    5279
text           5181
drug            102
sentiment         3
dtype: int64
unique_hash    2924
text           2721
drug             95
dtype: int64


# Part 2: Preprocessing Function

<div class="span5 alert alert-success">
We are most interested in the sentiment classes (our label) and the drug classes. It seems there are 102 drugs being reviewed about and 3 sentiments placed on them (positive, negative or neutral)
</div>

In [13]:
# Next, we must clean the data. First, let's take a random review

sample_text = train.text[random.randint(1,len(train))]
print(sample_text)

This is an update of the Cochrane review "Teriflunomide for multiple sclerosis" (first published in The Cochrane Library 2012, Issue 12).Multiple sclerosis (MS) is a chronic immune-mediated disease of the central nervous system. It is clinically characterized by recurrent relapses or progression, or both, often leading to severe neurological disability and a serious decline in quality of life. Disease-modifying therapies (DMTs) for MS aim to prevent occurrence of relapses and disability progression. Teriflunomide is a pyrimidine synthesis inhibitor approved by both the US Food and Drug Administration (FDA) and the European Medicines Agency (EMA) as a DMT for adults with relapsing-remitting MS (RRMS).  OBJECTIVES: To assess the absolute and comparative effectiveness and safety of teriflunomide as monotherapy or combination therapy versus placebo or other disease-modifying drugs (DMDs) (interferon beta (IFNβ), glatiramer acetate, natalizumab, mitoxantrone, fingolimod, dimethyl fumarate, 

In [14]:
# Define cleaning modules and cleaning functions

import re
import nltk

from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet
from custom_function.contractions import CONTRACTION_MAP
from unicodedata import normalize

# Import nltk resources
resources = ["wordnet", "stopwords", "punkt", \
             "averaged_perceptron_tagger", "maxent_treebank_pos_tagger"]

for resource in resources:
    try:
        nltk.data.find("tokenizers/" + resource)
    except LookupError:
        nltk.download(resource)

# Create stopwords list        
STOPWORDS = set(stopwords.words('english'))

# Define lemmatizing functions
def lemmatize_doc(document):
    """ 
    Conduct pre-processing, tag words then returns sentence with lemmatized words
    """
    
    # Create an empty list of lemmatized tokens
    lemmatized_list = []
    
    # Tokenize the sentences
    tokenized_sent = sent_tokenize(document)
    
    # Iterate over sentences to conduct lemmatization
    for sentence in tokenized_sent:
        
        # Tokenize the words in the sentence
        tokenized_word = word_tokenize(sentence)
        
        # Tag the pos of the tokens
        tagged_token = pos_tag(tokenized_word)
        
        # Initialize a empty list of lemmatized words
        root = []

        # Create Lemmatizer object
        lemma = WordNetLemmatizer()

        # iterate over the tagged sentences to 
        for token in tagged_token:

            # assign tag and actual word of the token
            tag = token[1][0]
            word = token[0]

            # Lemmatize the token based on tags
            if tag.startswith('J'):
                root.append(lemma.lemmatize(word, wordnet.ADJ))
            elif tag.startswith('V'):
                root.append(lemma.lemmatize(word, wordnet.VERB))
            elif tag.startswith('N'):
                root.append(lemma.lemmatize(word, wordnet.NOUN))
            elif tag.startswith('R'):
                root.append(lemma.lemmatize(word, wordnet.ADV))
            else:          
                root.append(word)

        # Add the lemmatized word into our list
        lemmatized_list.extend(root)
        
    return " ".join(lemmatized_list)

def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """
    Expand the contractions form to create cohenrent extractions
    """
    
    # Substitute quotation marks with apostrophes
    text = re.sub("’", "'", text)
    
    # define the contraction pattern with custom contraction mappings
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    
    # Define function to expand contraction matches
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

# Define main text cleaning function
def clean_text(text):
    """
    Return a processed version of the text given
    """
    # Turn all text into lower case
    text = text.lower()
    
    # Expand all contractions
    text = expand_contractions(text)
    
    # Remove all links
    text = re.sub(r'www.[^ ]+', '', text)
    text = re.sub(r'https?://[A-Za-z0-9./]+', '', text)
    
    # Remove stopwords
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    
    # Remove all punctuations, except hyphens
    text = re.sub(r"[%;$`“”\,.!?():\[\]\/]", ' ', text)

    # Remove all shortened words (like " d " from "you'd")
    #no_shorten = re.sub(r"\s[dtms]\s", ' ', no_punctuation)
    #no_shorten = re.sub(r"\sre\s", ' ', no_shorten)

    # Remove all numerics stands by itself
    text = re.sub(r"(?<=\s)\d+(?=\s)", ' ', text)

    # Lemmatize text
    text = lemmatize_doc(text)
    
    # Remove stand-alone hyphens
    text = re.sub(r"\s-\s", ' ', text)
    
    # Removing Extra spaces
    text = re.sub(r'[\s]+', ' ', text)
    
    # Convert 
    return text
    

[nltk_data] Downloading package wordnet to C:\Users\Zach
[nltk_data]     Nguyen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Zach
[nltk_data]     Nguyen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Zach Nguyen\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     C:\Users\Zach Nguyen\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_treebank_pos_tagger is already up-to-
[nltk_data]       date!


In [15]:
# Check the sample text after cleaning
clean_text(sample_text)

"update cochrane review `` teriflunomide multiple sclerosis '' first publish cochrane library issue multiple sclerosis m chronic immune-mediated disease central nervous system clinically characterize recurrent relapse progression both often lead severe neurological disability serious decline quality life disease-modifying therapy dmts ms aim prevent occurrence relapse disability progression teriflunomide pyrimidine synthesis inhibitor approve us food drug administration fda european medicine agency ema dmt adult relapsing-remitting m rrms objectives assess absolute comparative effectiveness safety teriflunomide monotherapy combination therapy versus placebo disease-modifying drug dmds interferon beta ifnβ glatiramer acetate natalizumab mitoxantrone fingolimod dimethyl fumarate alemtuzumab modify disease course people ms search method search cochrane multiple sclerosis rare diseases cns group specialise trial register september checked reference list publish review retrieve article sear

In [16]:
%%time
# Apply the pre-processing algorithm into a new dataframe: train_processed

train_processed = train[['text', 'drug', 'sentiment']]
holdout_processed = holdout[['text', 'drug']]

train_processed.text = train_processed.text.apply(clean_text)
holdout_processed.text = holdout_processed.text.apply(clean_text)

Wall time: 4min 58s


In [25]:
train_processed = train_processed.sort_index()

In [28]:
train_processed.head()

Unnamed: 0,text,drug,sentiment
0,autoimmune disease tend come cluster gilenya –...,gilenya,2
1,completely understand would want try it but re...,gilenya,2
2,interest target s1p-1 receptor rather 1-5 like...,fingolimod,2
3,interesting grand merci wonder lemtrada ocrevu...,ocrevus,2
4,hi everybody late mri result brain cervical co...,gilenya,1


In [26]:
train_processed.to_json(os.path.join(data_dir, 'interim', 'train_preprocessed.txt'))
holdout_processed.to_json(os.path.join(data_dir, 'interim', 'holdout_preprocessed.txt'))