<a href="https://colab.research.google.com/github/drwitt/NLP_IDS_690-03/blob/master/Viggy_RQE_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# RQE Analysis
# Viggy Kumaresan

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xml.etree.ElementTree as ET
import nltk
import re

Before starting, first download XML files from Github repository (https://github.com/abachaa/RQE_Data_AMIA2016) and upload to Colab. 

In [0]:
# Import Train and Text XML files
def parse_XML(xml_file, df_cols): 
    """Parse the input XML file and store the result in a pandas 
    DataFrame with the given columns. 
    
    The first element of df_cols is supposed to be the identifier 
    variable, which is an attribute of each node element in the 
    XML data; other features will be parsed from the text content 
    of each sub-element. 
    """
    
    xtree = ET.parse(xml_file)
    xroot = xtree.getroot()
    rows = []
    
    for node in xroot: 
        res = []
        res.append(node.attrib.get(df_cols[0]))
        res.append(node.attrib.get(df_cols[1]))
        res.append(node.attrib.get(df_cols[2]))
        for el in df_cols[3:]: 
            if node is not None and node.find(el) is not None:
                res.append(node.find(el).text)
            else: 
                res.append(None)
        rows.append({df_cols[i]: res[i] 
                     for i, _ in enumerate(df_cols)})
    
    out_df = pd.DataFrame(rows, columns=df_cols)
        
    return out_df

In [27]:
from google.colab import files
uploaded = files.upload()

Saving RQE_Test_302_pairs_AMIA2016.xml to RQE_Test_302_pairs_AMIA2016.xml


In [0]:
train = parse_XML('/content/RQE_Train_8588_AMIA2016.xml', ['pid', 'type', 'value', 'chq', 'faq'])

In [19]:
train.head()

Unnamed: 0,pid,type,value,chq,faq
0,1,originalQ-shortQ,True,\n How should I treat polymenorrhea in a 14-...,\n How should I treat polymenorrhea in a 14-...
1,2,originalQ-shortQ,True,\n Have there been any studies with low mole...,\n Can I use low molecular weight heparin in...
2,3,originalQ-shortRandQ,False,\n Have there been any studies with low mole...,\n What are the side effects of Florinef? C...
3,4,originalQ-shortQ,True,\n Let's give these immunizations. That's r...,\n Let's give these immunizations. That's r...
4,5,originalQ-shortRandQ,False,\n Let's give these immunizations. That's r...,\n Is there more support we can provide pati...


In [0]:
test = parse_XML('/content/RQE_Test_302_pairs_AMIA2016.xml', ['pid', 'type', 'value', 'chq', 'faq'])

In [30]:
test.head()

Unnamed: 0,pid,type,value,chq,faq
0,1,part1,False,High Blood Pressure. I know you may not answer...,What is High Blood Pressure?
1,2,part1,False,Arrhythmia. can arrhythmia occurs after ablati...,What is an Arrhythmia?
2,3,part1,False,medicine and allied. I LIKE TO KNOW RECENT THE...,What is an Arrhythmia?
3,4,part1,False,EAR LOBE CREASES. Are ear lobe creases always ...,What is Coronary Heart Disease?
4,5,part1,False,sleep apnea. I was diagnosed with sleep apnea ...,What is Sleep Apnea?


In [31]:
# Binarize outcome variable
train['outcome'] = np.where(train['value'] == 'true', 1, 0)
test['outcome'] = np.where(test['value'] == 'true', 1, 0)
train.head()

Unnamed: 0,pid,type,value,chq,faq,outcome
0,1,originalQ-shortQ,True,\n How should I treat polymenorrhea in a 14-...,\n How should I treat polymenorrhea in a 14-...,1
1,2,originalQ-shortQ,True,\n Have there been any studies with low mole...,\n Can I use low molecular weight heparin in...,1
2,3,originalQ-shortRandQ,False,\n Have there been any studies with low mole...,\n What are the side effects of Florinef? C...,0
3,4,originalQ-shortQ,True,\n Let's give these immunizations. That's r...,\n Let's give these immunizations. That's r...,1
4,5,originalQ-shortRandQ,False,\n Let's give these immunizations. That's r...,\n Is there more support we can provide pati...,0


## Text Preprocessing


In [32]:
# Remove punctuation and non-necessary characters
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer

def preprocess_text(text):
  text = re.sub('[^A-Za-z]', ' ', text)

  # Convert all to lowercase
  text = text.lower()

  # Tokenize
  tokenized_text = word_tokenize(text)

  # Remove stopwords
  for word in tokenized_text:
      if word in stopwords.words('english'):
          tokenized_text.remove(word)

  # Stem
  stemmer = PorterStemmer()
  for i in range(len(tokenized_text)):
      tokenized_text[i] = stemmer.stem(tokenized_text[i])

  # List of words
  p_text = " ".join(tokenized_text)

  return p_text

train_processed = train.copy()
X_train = train_processed[['chq', 'faq']]
y_train = train_processed['outcome']

X_train['chq'] = train.apply(lambda x: preprocess_text(x['chq']), axis=1)
X_train['faq'] = train.apply(lambda x: preprocess_text(x['faq']), axis=1)

test_processed = test.copy()
X_test = test_processed[['chq', 'faq']]
y_test = test_processed['outcome']

X_test['chq'] = test.apply(lambda x: preprocess_text(x['chq']), axis=1)
X_test['faq'] = test.apply(lambda x: preprocess_text(x['faq']), axis=1)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [33]:
X_train.head()

Unnamed: 0,chq,faq
0,should treat polymenorrhea a year old girl,should treat polymenorrhea a year old girl
1,there ani studi low molecular weight heparin p...,i use low molecular weight heparin pregnanc pa...
2,there ani studi low molecular weight heparin p...,are side effect florinef could caus headach
3,let give immun s right t,let give immun s right t
4,let give immun s right t,there support can provid patient macular degener


In [34]:
X_test.head()

Unnamed: 0,chq,faq
0,high blood pressur know may answer but blood p...,is high blood pressur
1,arrhythmia arrhythmia occur ablat is success r...,is arrhythmia
2,medicin alli like know recent therapi arrhythm...,is arrhythmia
3,ear lobe creas ear lobe creas alway heart dise...,is coronari heart diseas
4,sleep apnea wa diagnos sleep apnea prolli year...,is sleep apnea


## BoW

In [35]:
# chq text
from sklearn.feature_extraction.text import CountVectorizer
X_train_chq = X_train.chq
matrix = CountVectorizer(min_df=5, max_df=0.7).fit(X_train_chq)
X_train_chq = pd.DataFrame(matrix.transform(X_train_chq).todense(), columns=matrix.get_feature_names())
X_train_chq.shape 

(8588, 2417)

In [36]:
# transform X_test (DON'T FIT)
X_test_chq = X_test.chq
X_test_chq = pd.DataFrame(matrix.transform(X_test_chq).todense(), columns=matrix.get_feature_names())
X_test_chq.shape 

(302, 2417)

In [37]:
# faq text
X_train_faq = X_train.faq
matrix = CountVectorizer(min_df=5, max_df=0.7).fit(X_train_faq)
X_train_faq = pd.DataFrame(matrix.transform(X_train_faq).todense(), columns=matrix.get_feature_names())
X_test_faq = X_test.faq
X_test_faq = pd.DataFrame(matrix.transform(X_test_faq).todense(), columns=matrix.get_feature_names())

# concatenate
X_train_bow = pd.concat([X_train_chq, X_train_faq], axis=1)
X_test_bow = pd.concat([X_test_chq, X_test_faq], axis=1)
print(X_train.shape)
print(X_test.shape)

(8588, 2)
(302, 2)


In [38]:
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB

#Create a Gaussian Classifier
gnb = GaussianNB()

#Train the model using the training sets
gnb.fit(X_train_bow, y_train)

#Predict the response for test dataset
y_pred = gnb.predict(X_test_bow)

#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.4900662251655629


## Fuzzy Matching

In [39]:
!pip3 install fuzzywuzzy[speedup]
from fuzzywuzzy import fuzz

def get_ratio(row):
    chq = row['chq']
    faq = row['faq']
    return fuzz.token_set_ratio(chq, faq)
  
X_train_fuzzy = X_train.copy()
X_test_fuzzy = X_test.copy()

X_train_fuzzy['fuzzy_ratio'] = X_train_fuzzy.apply(get_ratio, axis=1)
X_train_fuzzy.head()

Collecting fuzzywuzzy[speedup]
  Downloading https://files.pythonhosted.org/packages/d8/f1/5a267addb30ab7eaa1beab2b9323073815da4551076554ecc890a3595ec9/fuzzywuzzy-0.17.0-py2.py3-none-any.whl
Collecting python-levenshtein>=0.12; extra == "speedup" (from fuzzywuzzy[speedup])
[?25l  Downloading https://files.pythonhosted.org/packages/42/a9/d1785c85ebf9b7dfacd08938dd028209c34a0ea3b1bcdb895208bd40a67d/python-Levenshtein-0.12.0.tar.gz (48kB)
[K     |████████████████████████████████| 51kB 3.8MB/s 
Building wheels for collected packages: python-levenshtein
  Building wheel for python-levenshtein (setup.py) ... [?25l[?25hdone
  Created wheel for python-levenshtein: filename=python_Levenshtein-0.12.0-cp36-cp36m-linux_x86_64.whl size=144674 sha256=9b10498c5e5691d7dfaa28b37232fbb5773ff453238b9cfc89c7d7a443819450
  Stored in directory: /root/.cache/pip/wheels/de/c2/93/660fd5f7559049268ad2dc6d81c4e39e9e36518766eaf7e342
Successfully built python-levenshtein
Installing collected packages: python-l

Unnamed: 0,chq,faq,fuzzy_ratio
0,should treat polymenorrhea a year old girl,should treat polymenorrhea a year old girl,100
1,there ani studi low molecular weight heparin p...,i use low molecular weight heparin pregnanc pa...,100
2,there ani studi low molecular weight heparin p...,are side effect florinef could caus headach,32
3,let give immun s right t,let give immun s right t,100
4,let give immun s right t,there support can provid patient macular degener,42


In [0]:
X_test_fuzzy['fuzzy_ratio'] = X_test_fuzzy.apply(get_ratio, axis=1)

In [0]:
# just use fuzzy ratio as predictor
X_train_fuzzy = np.array(X_train_fuzzy.fuzzy_ratio).reshape(-1, 1)
X_test_fuzzy = np.array(X_test_fuzzy.fuzzy_ratio).reshape(-1, 1)

In [42]:
print(X_train_fuzzy.shape)
print(X_test_fuzzy.shape)
print(y_train.shape)
print(y_test.shape)

(8588, 1)
(302, 1)
(8588,)
(302,)


In [43]:
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB

#Create a Gaussian Classifier
gnb = GaussianNB()

#Train the model using the training sets
gnb.fit(X_train_fuzzy, y_train)

#Predict the response for test dataset
y_pred = gnb.predict(X_test_fuzzy)

#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6920529801324503


In [44]:
unique, counts = np.unique(y_pred, return_counts=True)
dict(zip(unique, counts))

{0: 192, 1: 110}

### Accuracy Metrics
Ratio: 0.583

Partial Ratio: 0.656

Token Sort Ratio (ignores word order): 0.589

Token Set Ratio (ignores duplicate words): 0.692


## Rules Based Hypothesis Testing

•	Take the RQE data: https://raw.githubusercontent.com/abachaa/RQE_Data_AMIA2016/master/RQE_Train_8588_AMIA2016.xml 

•	Divide randomly  this data into training and testing (test data set should have at least 1500 data points). Do not use the original test data.

•	Using only regular expressions, counting, fraction and other simple arithmetic, create a collection of  if-then-else clauses to build a classifier that significantly improves on the majority classifier.


•	I suggest each person independently should come with such rules, and note the reason for them, and then the group combines it intelligently into a tree of rules. 

•	Only use your test data to get accuracy numbers; do not look at them to modify your rules. 


In [45]:
new_data = parse_XML('/content/RQE_Train_8588_AMIA2016.xml', ['pid', 'type', 'value', 'chq', 'faq'])
new_data.head()

Unnamed: 0,pid,type,value,chq,faq
0,1,originalQ-shortQ,True,\n How should I treat polymenorrhea in a 14-...,\n How should I treat polymenorrhea in a 14-...
1,2,originalQ-shortQ,True,\n Have there been any studies with low mole...,\n Can I use low molecular weight heparin in...
2,3,originalQ-shortRandQ,False,\n Have there been any studies with low mole...,\n What are the side effects of Florinef? C...
3,4,originalQ-shortQ,True,\n Let's give these immunizations. That's r...,\n Let's give these immunizations. That's r...
4,5,originalQ-shortRandQ,False,\n Let's give these immunizations. That's r...,\n Is there more support we can provide pati...


In [46]:
print(new_data.shape)

(8588, 5)


In [47]:
1500/8588

0.17466231951560315

In [48]:
# clean up text

def preprocess_text(text):
  text = re.sub('[^A-Za-z]', ' ', text)

  # Convert all to lowercase
  text = text.lower()

  return text


new_data['chq'] = new_data.apply(lambda x: preprocess_text(x['chq']), axis=1)
new_data['faq'] = new_data.apply(lambda x: preprocess_text(x['faq']), axis=1)

new_data.head()

Unnamed: 0,pid,type,value,chq,faq
0,1,originalQ-shortQ,True,how should i treat polymenorrhea in a y...,how should i treat polymenorrhea in a y...
1,2,originalQ-shortQ,True,have there been any studies with low molec...,can i use low molecular weight heparin in ...
2,3,originalQ-shortRandQ,False,have there been any studies with low molec...,what are the side effects of florinef co...
3,4,originalQ-shortQ,True,let s give these immunizations that s ri...,let s give these immunizations that s ri...
4,5,originalQ-shortRandQ,False,let s give these immunizations that s ri...,is there more support we can provide patie...


In [49]:
# Binarize outcome variable
y = pd.DataFrame()
y['outcome'] = np.where(new_data['value'] == 'true', 1, 0)
y.head()

Unnamed: 0,outcome
0,1
1,1
2,0
3,1
4,0


In [0]:
# Divide train data into train and test (validation)
# 80-20 split
from sklearn.model_selection import train_test_split
x_train,x_test, y_train, y_test = train_test_split(new_data, y, test_size=0.2, random_state=42)

In [0]:
x_train.shape

(6870, 5)

In [51]:
x_train['outcome'] = np.where(x_train['value'] == 'true', 1, 0)
x_test['outcome'] = np.where(x_test['value'] == 'true', 1, 0)
x_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,pid,type,value,chq,faq,outcome
6569,6570,originalQ-shortQ,True,year old woman presented yesterday with...,should a patient with influenza symptoms b...,1
4205,4206,originalQ-shortRandQ,False,is a history of pulmonary embolus a contra...,should we switch to ancef for this diabeti...,0
3889,3890,originalQ-shortQ,True,should you treat a year old with a posi...,should you treat a year old with a posi...,1
2357,2358,originalQ-shortRandQ,False,do we need to worry about thrombocytopenia...,how do you inject the bicipital tendon,0
58,59,originalQ-shortQ,True,in this patient with tias transient ische...,in this patient with transient ischemic at...,1


In [52]:
# majority classifier - training
x_train['outcome'].value_counts()

1    3739
0    3131
Name: outcome, dtype: int64

In [53]:
print('Majority Classifier train accuracy:', 3739/(3739+3131))

Majority Classifier train accuracy: 0.5442503639010189


In [55]:
x_test.shape

(1718, 6)

In [56]:
# majority classifier
y_test['outcome'].value_counts()

1    916
0    802
Name: outcome, dtype: int64

In [57]:
print('Majority Classifier test accuracy:', 916/(916+802))

Majority Classifier test accuracy: 0.5331781140861467


# Rule 1

Rule #1: Find words that exist in both the chq and faq. If this number is greater than certain threshold, then predict 1. Else, predict 0.

In [58]:
x_train.head()

Unnamed: 0,pid,type,value,chq,faq,outcome
6569,6570,originalQ-shortQ,True,year old woman presented yesterday with...,should a patient with influenza symptoms b...,1
4205,4206,originalQ-shortRandQ,False,is a history of pulmonary embolus a contra...,should we switch to ancef for this diabeti...,0
3889,3890,originalQ-shortQ,True,should you treat a year old with a posi...,should you treat a year old with a posi...,1
2357,2358,originalQ-shortRandQ,False,do we need to worry about thrombocytopenia...,how do you inject the bicipital tendon,0
58,59,originalQ-shortQ,True,in this patient with tias transient ische...,in this patient with transient ischemic at...,1


In [59]:
x_train['exists_both'] = [set(x[3].split()) & set(x[4].split()) for x in x_train.values]
x_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,pid,type,value,chq,faq,outcome,exists_both
6569,6570,originalQ-shortQ,True,year old woman presented yesterday with...,should a patient with influenza symptoms b...,1,"{influenza, a, with}"
4205,4206,originalQ-shortRandQ,False,is a history of pulmonary embolus a contra...,should we switch to ancef for this diabeti...,0,{to}
3889,3890,originalQ-shortQ,True,should you treat a year old with a posi...,should you treat a year old with a posi...,1,"{year, negative, purified, old, of, protein, c..."
2357,2358,originalQ-shortRandQ,False,do we need to worry about thrombocytopenia...,how do you inject the bicipital tendon,0,{do}
58,59,originalQ-shortQ,True,in this patient with tias transient ische...,in this patient with transient ischemic at...,1,"{this, echocardiogram, patient, attacks, trans..."


In [60]:
x_train['exists_both_len'] = x_train['exists_both'].apply(len)
x_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,pid,type,value,chq,faq,outcome,exists_both,exists_both_len
6569,6570,originalQ-shortQ,True,year old woman presented yesterday with...,should a patient with influenza symptoms b...,1,"{influenza, a, with}",3
4205,4206,originalQ-shortRandQ,False,is a history of pulmonary embolus a contra...,should we switch to ancef for this diabeti...,0,{to},1
3889,3890,originalQ-shortQ,True,should you treat a year old with a posi...,should you treat a year old with a posi...,1,"{year, negative, purified, old, of, protein, c...",20
2357,2358,originalQ-shortRandQ,False,do we need to worry about thrombocytopenia...,how do you inject the bicipital tendon,0,{do},1
58,59,originalQ-shortQ,True,in this patient with tias transient ische...,in this patient with transient ischemic at...,1,"{this, echocardiogram, patient, attacks, trans...",16


In [61]:
x_train['exists_both_len'].describe()

count    6870.000000
mean        5.736827
std         5.117859
min         0.000000
25%         1.000000
50%         4.500000
75%        10.000000
max        30.000000
Name: exists_both_len, dtype: float64

In [62]:
x_train['exist_feature'] = np.where(x_train['exists_both_len'] > 3, 1, 0)
x_train['exist_feature'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


1    3738
0    3132
Name: exist_feature, dtype: int64

In [63]:
train_pred = x_train[x_train['exist_feature'] == x_train['outcome']]
train_pred.shape

(6451, 9)

In [65]:
train_pred.head()

Unnamed: 0,pid,type,value,chq,faq,outcome,exists_both,exists_both_len,exist_feature
4205,4206,originalQ-shortRandQ,False,is a history of pulmonary embolus a contra...,should we switch to ancef for this diabeti...,0,{to},1,0
3889,3890,originalQ-shortQ,True,should you treat a year old with a posi...,should you treat a year old with a posi...,1,"{year, negative, purified, old, of, protein, c...",20,1
2357,2358,originalQ-shortRandQ,False,do we need to worry about thrombocytopenia...,how do you inject the bicipital tendon,0,{do},1,0
58,59,originalQ-shortQ,True,in this patient with tias transient ische...,in this patient with transient ischemic at...,1,"{this, echocardiogram, patient, attacks, trans...",16,1
2303,2304,originalQ-shortQ,True,what is the reference for the article on a...,what is the reference to the article on ar...,1,"{article, arthrocentesis, on, technique, refer...",8,1


In [66]:
incorrect_train_pred = x_train[x_train['exist_feature'] != x_train['outcome']]
incorrect_train_pred.shape

(419, 9)

In [67]:
incorrect_train_pred.head()

Unnamed: 0,pid,type,value,chq,faq,outcome,exists_both,exists_both_len,exist_feature
6569,6570,originalQ-shortQ,True,year old woman presented yesterday with...,should a patient with influenza symptoms b...,1,"{influenza, a, with}",3,0
156,157,originalQ-shortRandQ,False,how soon should you ambulate a patient wit...,can i use low molecular weight heparin in ...,0,"{thrombosis, vein, patient, deep, with}",5,1
2057,2058,originalQ-shortRandQ,False,what are the causes of and how do you work...,what is the incubation period of influenza...,0,"{a, of, the, what}",4,1
6885,6886,originalQ-shortRandQ,False,year old woman complains that her heart...,how should i treat polymenorrhea in a y...,0,"{year, old, a, i, in}",5,1
6400,6401,originalQ-shortRandQ,False,i had a guy year old man with subclav...,what is the cause and treatment of this ol...,0,"{man, old, the, and}",4,1


In [68]:
print('Majority Classifier Train Accuracy:',3739/(3739+3131))
print('Exact Match > 0 Classifier Train Accuracy:',6451/(3739+3131))

Majority Classifier Train Accuracy: 0.5442503639010189
Exact Match > 0 Classifier Train Accuracy: 0.9390101892285299


In [69]:
# Test classifier
x_test['exists_both'] = [set(x[3].split()) & set(x[4].split()) for x in x_test.values]
x_test['exists_both_len'] = x_test['exists_both'].apply(len)
x_test['exist_feature'] = np.where(x_test['exists_both_len'] > 3, 1, 0)
pred = x_test[x_test['exist_feature'] == x_test['outcome']]
pred.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


(1609, 9)

In [70]:
incorrect_pred = x_test[x_test['exist_feature'] != x_test['outcome']]
incorrect_pred.shape

(109, 9)

In [71]:
x_test['outcome'].value_counts()

1    916
0    802
Name: outcome, dtype: int64

In [72]:
print('Majority Classifier Test Accuracy:',916/(916+802))
print('Exact Match > 0 Classifier Test Accuracy:',1609/(916+802))

Majority Classifier Test Accuracy: 0.5331781140861467
Exact Match > 0 Classifier Test Accuracy: 0.9365541327124564


Accuracy improved from ~53% -> ~93% by applying the rule:

(number of exact words matching > 3) == 1, else 0

# Rule 2

Let's look at our incorrect predictions to then improve our classifier with our next rule.

In [0]:
incorrect_train_pred.head(20)

Unnamed: 0,pid,type,value,chq,faq,outcome,exists_both,exists_both_len,exist_feature
6569,6570,originalQ-shortQ,True,year old woman presented yesterday with...,should a patient with influenza symptoms b...,1,"{influenza, a, with}",3,0
156,157,originalQ-shortRandQ,False,how soon should you ambulate a patient wit...,can i use low molecular weight heparin in ...,0,"{deep, thrombosis, vein, with, patient}",5,1
2057,2058,originalQ-shortRandQ,False,what are the causes of and how do you work...,what is the incubation period of influenza...,0,"{a, what, the, of}",4,1
6885,6886,originalQ-shortRandQ,False,year old woman complains that her heart...,how should i treat polymenorrhea in a y...,0,"{i, year, in, old, a}",5,1
6400,6401,originalQ-shortRandQ,False,i had a guy year old man with subclav...,what is the cause and treatment of this ol...,0,"{the, and, man, old}",4,1
333,334,originalQ-shortRandQ,False,it s not crystal clear what s going on wit...,how should i treat polymenorrhea in a y...,0,"{a, i, in, old, treat, year}",6,1
752,753,originalQ-shortRandQ,False,what is the significance of haemophilus ae...,what is the dose of sporanox,0,"{is, what, the, of}",4,1
351,352,originalQ-shortQ,True,what is legatrin,what is legatrin,1,"{is, legatrin, what}",3,0
2996,2997,originalQ-shortRandQ,False,what is the upper limit of normal of small...,what is that new drug like prilosec it h...,0,"{is, what, the, of}",4,1
1412,1413,originalQ-shortRandQ,False,is keflex the drug of choice for this pati...,is serzone okay to give to with a partial ...,0,"{is, to, a, with}",4,1


In [0]:
incorrect_train_pred['outcome'].value_counts()

1    210
0    209
Name: outcome, dtype: int64

It seems that a lot of our incorrect predictions are due to the fact that non-relevant words are being matched. We can try to improve this by removing stopwords.

In [73]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize


def tokenize_and_remove_stopwords(text):
  # Tokenize
  tokenized_text = word_tokenize(text)

  # Remove stopwords
  for word in tokenized_text:
      if word in stopwords.words('english'):
          tokenized_text.remove(word)
  return tokenized_text

x_train['chq'] = x_train.apply(lambda x: tokenize_and_remove_stopwords(x['chq']), axis=1)
x_train['faq'] = x_train.apply(lambda x: tokenize_and_remove_stopwords(x['faq']), axis=1)

x_train.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,pid,type,value,chq,faq,outcome,exists_both,exists_both_len,exist_feature
6569,6570,originalQ-shortQ,True,"[year, old, woman, presented, yesterday, a, pe...","[a, patient, influenza, symptoms, treated, the...",1,"{influenza, a, with}",3,0
4205,4206,originalQ-shortRandQ,False,"[history, pulmonary, embolus, a, contraindicat...","[we, switch, ancef, this, diabetic, foot, ulce...",0,{to},1,0
3889,3890,originalQ-shortQ,True,"[you, treat, year, old, a, positive, ppd, puri...","[you, treat, year, old, a, positive, ppd, puri...",1,"{year, negative, purified, old, of, protein, c...",20,1
2357,2358,originalQ-shortRandQ,False,"[we, need, worry, thrombocytopenia, other, sid...","[do, inject, bicipital, tendon]",0,{do},1,0
58,59,originalQ-shortQ,True,"[this, patient, tias, transient, ischemic, att...","[this, patient, transient, ischemic, attacks, ...",1,"{this, echocardiogram, patient, attacks, trans...",16,1


In [74]:
# Re-calculate common text feature
x_train['exists_both'] = [set(x[3]) & set(x[4]) for x in x_train.values]
x_train['exists_both_len'] = x_train['exists_both'].apply(len)
x_train['exist_feature'] = np.where(x_train['exists_both_len'] > 1, 1, 0)

train_pred_2 = x_train[x_train['exist_feature'] == x_train['outcome']]
train_pred_2.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


(6582, 9)

In [75]:
incorrect_train_pred_2 = x_train[x_train['exist_feature'] != x_train['outcome']]
incorrect_train_pred_2.shape

(288, 9)

In [76]:
print('Majority Classifier Train Accuracy:',3739/(3739+3131))
print('Exact Match w/no stopwords > 1 Classifier Train Accuracy:',6582/(3739+3131))

Majority Classifier Train Accuracy: 0.5442503639010189
Exact Match w/no stopwords > 1 Classifier Train Accuracy: 0.9580786026200874


In [77]:
# clean test data
x_test['chq'] = x_test.apply(lambda x: tokenize_and_remove_stopwords(x['chq']), axis=1)
x_test['faq'] = x_test.apply(lambda x: tokenize_and_remove_stopwords(x['faq']), axis=1)

# Test classifier
x_test['exists_both'] = [set(x[3]) & set(x[4]) for x in x_test.values]
x_test['exists_both_len'] = x_test['exists_both'].apply(len)
x_test['exist_feature'] = np.where(x_test['exists_both_len'] > 1, 1, 0)
pred_2 = x_test[x_test['exist_feature'] == x_test['outcome']]
pred_2.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stab

(1648, 9)

In [78]:
incorrect_pred_2 = x_test[x_test['exist_feature'] != x_test['outcome']]
incorrect_pred_2.shape

(70, 9)

In [79]:
print('Majority Classifier Test Accuracy:',916/(916+802))
print('Exact Match w/no stopwords > 1 Classifier Test Accuracy:',1648/(916+802))

Majority Classifier Test Accuracy: 0.5331781140861467
Exact Match w/no stopwords > 1 Classifier Test Accuracy: 0.959254947613504


Rule #2 (removing stopwords and tokenizing) further improved our test accuracy from ~93% -> ~95%

Look at incorrect predictions to further improve rules

In [80]:
incorrect_train_pred_2.head(20)

Unnamed: 0,pid,type,value,chq,faq,outcome,exists_both,exists_both_len,exist_feature
96,97,originalQ-shortRandQ,False,"[do, do, a, lead, level, in, month, old, is, b...","[should, treat, polymenorrhea, a, year, old, g...",0,"{a, old}",2,1
156,157,originalQ-shortRandQ,False,"[soon, you, ambulate, patient, a, deep, vein, ...","[i, use, low, molecular, weight, heparin, preg...",0,"{thrombosis, deep, vein, patient}",4,1
6885,6886,originalQ-shortRandQ,False,"[year, old, woman, complains, heart, feels, li...","[should, treat, polymenorrhea, a, year, old, g...",0,"{year, old}",2,1
6400,6401,originalQ-shortRandQ,False,"[guy, year, old, man, subclavian, steal, syndr...","[is, cause, treatment, this, old, man, stomati...",0,"{man, old}",2,1
8230,8231,originalQ-shortRandQ,False,"[is, differential, diagnosis, a, patient, cons...","[wonder, this, patient, could, a, rotator, cuf...",0,"{a, patient}",2,1
6395,6396,originalQ-shortQ,True,"[have, home, problem, children, one, the, kids...","[is, treatment, a, human, bite]",1,{human},1,0
333,334,originalQ-shortRandQ,False,"[s, crystal, clear, s, going, with, may, treat...","[should, treat, polymenorrhea, a, year, old, g...",0,"{year, a, old, treat}",4,1
7288,7289,originalQ-shortQ,True,"[month, old, intoeing, had, look, intoeing, we...","[is, approach, intoeing, children]",1,{intoeing},1,0
378,379,originalQ-shortRandQ,False,"[is, dose, imipramine, a, year, old, boy]","[should, treat, polymenorrhea, a, year, old, g...",0,"{year, a, old}",3,1
6715,6716,originalQ-shortQ,True,"[year, old, woman, complaining, excess, sweati...","[is, sweating]",1,{sweating},1,0


In [81]:
incorrect_train_pred_2['outcome'].value_counts()

0    183
1    105
Name: outcome, dtype: int64

In [82]:
incorrect_train_pred_2.exists_both = incorrect_train_pred_2.exists_both.apply(list)
incorrect_train_pred_2.exists_both = incorrect_train_pred_2.exists_both.apply(', '.join)
incorrect_train_pred_2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


Unnamed: 0,pid,type,value,chq,faq,outcome,exists_both,exists_both_len,exist_feature
96,97,originalQ-shortRandQ,False,"[do, do, a, lead, level, in, month, old, is, b...","[should, treat, polymenorrhea, a, year, old, g...",0,"a, old",2,1
156,157,originalQ-shortRandQ,False,"[soon, you, ambulate, patient, a, deep, vein, ...","[i, use, low, molecular, weight, heparin, preg...",0,"thrombosis, deep, vein, patient",4,1
6885,6886,originalQ-shortRandQ,False,"[year, old, woman, complains, heart, feels, li...","[should, treat, polymenorrhea, a, year, old, g...",0,"year, old",2,1
6400,6401,originalQ-shortRandQ,False,"[guy, year, old, man, subclavian, steal, syndr...","[is, cause, treatment, this, old, man, stomati...",0,"man, old",2,1
8230,8231,originalQ-shortRandQ,False,"[is, differential, diagnosis, a, patient, cons...","[wonder, this, patient, could, a, rotator, cuf...",0,"a, patient",2,1


Now it seems we are hitting a point where the sentences are extremely similar, so we need to develop a more nuanced rule.

Most of our incorrect predictions for this classifier were false positives (predicted 1 but is actually 0), so we'll build our next rule to guard against that.

There seems to be words that are frequently found in both chq and faq, but don't have anything to do with the meaning (ex. a, old). Let's build a brief dictionary of these terms and see if we can exclude that in our classifier.


In [83]:
Counter(incorrect_train_pred_2.exists_both).most_common(100)

NameError: ignored

In [0]:
nonrel_words = ['year', 'old', 'a', 'years', 'patient', 'girl', 'i']

In [0]:
# Re-calculate common text feature
x_train['exists_both'] = [set(x[3]) & set(x[4]) for x in x_train.values]
# iterate over the dataframe row by row
for index_label, row_series in x_train.iterrows():
   # For each row update the exists_both variable
   for word in nonrel_words:
    if word in x_train.at[index_label, 'exists_both']:
      x_train.at[index_label , 'exists_both'].remove(word)
x_train['exists_both_len'] = x_train['exists_both'].apply(len)
x_train['exist_feature'] = np.where(x_train['exists_both_len'] > 1, 1, 0)

train_pred_3 = x_train[x_train['exist_feature'] == x_train['outcome']]
train_pred_3.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


(6703, 9)

In [0]:
incorrect_train_pred_3 = x_train[x_train['exist_feature'] != x_train['outcome']]
incorrect_train_pred_3.shape

(167, 9)

In [0]:
print('Majority Classifier Train Accuracy:',3739/(3739+3131))
print('Exact Match w/no stopwords or nonrelevant words > 1 Classifier Train Accuracy:',6703/(3739+3131))

Majority Classifier Train Accuracy: 0.5442503639010189
Exact Match w/no stopwords or nonrelevant words > 1 Classifier Train Accuracy: 0.9756914119359534


In [0]:
# Test classifier
# Re-calculate common text feature
x_test['exists_both'] = [set(x[3]) & set(x[4]) for x in x_test.values]
# iterate over the dataframe row by row
for index_label, row_series in x_test.iterrows():
   # For each row update the exists_both variable
   for word in nonrel_words:
    if word in x_test.at[index_label, 'exists_both']:
      x_test.at[index_label , 'exists_both'].remove(word)
x_test['exists_both'] = [set(x[3]) & set(x[4]) for x in x_test.values]
x_test['exists_both_len'] = x_test['exists_both'].apply(len)
x_test['exist_feature'] = np.where(x_test['exists_both_len'] > 1, 1, 0)
pred_3 = x_test[x_test['exist_feature'] == x_test['outcome']]
pred_3.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pyda

(1648, 9)

In [0]:
incorrect_pred_3 = x_test[x_test['exist_feature'] != x_test['outcome']]
incorrect_pred_3.shape

(70, 9)

In [0]:
print('Majority Classifier Test Accuracy:',916/(916+802))
print('Exact Match w/no stopwords or nonrelevant words > 1 Classifier Test Accuracy:',1648/(916+802))

Majority Classifier Test Accuracy: 0.5331781140861467
Exact Match w/no stopwords or nonrelevant words > 1 Classifier Test Accuracy: 0.959254947613504


Train accuracy improved from ~95% -> ~97%, but test accuracy was exact same. 

# Danny Witt Rule:

In [97]:
x_train.head()

Unnamed: 0,pid,type,value,chq,faq,outcome,exists_both,exists_both_len,exist_feature
6569,6570,originalQ-shortQ,True,"[year, old, woman, presented, yesterday, a, pe...","[a, patient, influenza, symptoms, treated, the...",1,"{influenza, a}",2,1
4205,4206,originalQ-shortRandQ,False,"[history, pulmonary, embolus, a, contraindicat...","[we, switch, ancef, this, diabetic, foot, ulce...",0,{},0,0
3889,3890,originalQ-shortQ,True,"[you, treat, year, old, a, positive, ppd, puri...","[you, treat, year, old, a, positive, ppd, puri...",1,"{year, negative, purified, old, protein, chest...",16,1
2357,2358,originalQ-shortRandQ,False,"[we, need, worry, thrombocytopenia, other, sid...","[do, inject, bicipital, tendon]",0,{},0,0
58,59,originalQ-shortQ,True,"[this, patient, tias, transient, ischemic, att...","[this, patient, transient, ischemic, attacks, ...",1,"{this, echocardiogram, patient, attacks, trans...",11,1


In [86]:
x_train.shape

(6870, 9)

In [87]:
x_test.head()

Unnamed: 0,pid,type,value,chq,faq,outcome,exists_both,exists_both_len,exist_feature
4956,4957,originalQ-shortRandQ,False,"[tinea, pedis, tinea, manus, fungal, skin, inf...","[should, treat, polymenorrhea, a, year, old, g...",0,{},0,0
1061,1062,originalQ-shortQ,True,"[test, culdocentesis, pelvic, ultrasound, woul...","[test, culdocentesis, pelvic, ultrasound, best...",1,"{culdocentesis, ultrasound, ovarian, test, cys...",7,1
5057,5058,originalQ-shortRandQ,False,"[nonsteroidal, anti, inflammatory, drugs, nsai...","[do, inject, bicipital, tendon]",0,{},0,0
222,223,originalQ-shortQ,True,"[patient, dr, x, saw, day, said, friend, told,...","[plendil, cause, adverse, reaction, sun, expos...",1,"{plendil, sun}",2,1
7519,7520,originalQ-shortQ,True,"[is, maximum, dose, zoloft]","[is, maximum, dose, zoloft]",1,"{maximum, is, zoloft, dose}",4,1


In [88]:
x_test.shape

(1718, 9)

In [89]:
y_train.head()

Unnamed: 0,outcome
6569,1
4205,0
3889,1
2357,0
58,1


In [92]:
y_train.shape

(6870, 1)

In [93]:
y_test.head()

Unnamed: 0,outcome
4956,0
1061,1
5057,0
222,1
7519,1


In [94]:
y_test.shape

(1718, 1)

Collecting SmoothingFunction
[31m  ERROR: Could not find a version that satisfies the requirement SmoothingFunction (from versions: none)[0m
[31mERROR: No matching distribution found for SmoothingFunction[0m


In [143]:
# Calculate new feature: BLEU score

def bleu(reference, predict):
    """Compute sentence-level bleu score.

    Args:
        reference (list[str])
        predict (list[str])
    """
    from nltk.translate import bleu_score, SmoothingFunction

    if len(predict) == 0:
        if len(reference) == 0:
            return 1.0
        else:
            return 0.0
          
    # use a maximum of 4-grams. If 4-grams aren't present, use only lower n-grams.
    n = min(4, len(reference), len(predict))
    weights = tuple([1. / n] * n)  # uniform weight on n-gram precisions
    smoothing_function = SmoothingFunction
    return bleu_score.sentence_bleu([reference], predict, weights, SmoothingFunction) 
  

x_train['bleu_score'] = x_train.apply(lambda x: bleu(x.chq, x.faq), axis=1)

x_train['bleu_feature'] = np.where(x_train['bleu_score'] > 0.05, 1, 0)

train_pred_new = x_train[x_train['bleu_feature'] == x_train['outcome']]

train_pred_new.shape

x_train.head(100)

ImportError: ignored

In [140]:
print('Majority Classifier Train Accuracy:',3739/(3739+3131))
num_correct = train_pred_new.shape[0]
print('Exact BLEU > 0.1 Classifier Train Accuracy:',num_correct/(3739+3131))

Majority Classifier Train Accuracy: 0.5442503639010189
Exact BLEU > 0.1 Classifier Train Accuracy: 0.8420669577874818
