# Thesis Code Part 2: Classic Machine Learning

## Import Packages

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

## Import Train and Test Data

In [None]:
# Read the training and test CSVs (paths are relative to the working directory).
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

## Preprocess

In [None]:
import scispacy
import spacy
from negspacy.negation import Negex
from nltk.corpus import stopwords
import string
import nltk
# NLTK resources downloaded for this notebook.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# SpaCy model for biomedical processing, with the "negex" component added to
# the pipeline so that entities expose the `ent._.negex` flag read by preprocess().
nlp = spacy.load("en_core_sci_md")
nlp.add_pipe("negex")

In [None]:
# function from Chen & Sohn
# https://colab.research.google.com/drive/1jp8Oi2s13g2B34SPjX5074FDBlhmUdgn?usp=sharing#scrollTo=MIA9a7rckKil
def preprocess(nlp_model, input_text):
    """Tokenize ``input_text`` and tag tokens that belong to negated entities.

    Args:
        nlp_model: Callable that turns a string into a spaCy-style document.
            Its entities must expose ``start``, ``end`` and the ``_.negex``
            flag (set by the "negex" pipeline component).
        input_text: Raw text to process; leading/trailing whitespace is
            stripped before it is handed to ``nlp_model``.

    Returns:
        A list of lower-cased token strings. English stop words, single
        punctuation characters and the "xxxx" placeholder are dropped, and
        tokens inside a negated entity are prefixed with ``"NEGEX_"``.
    """
    doc = nlp_model(input_text.strip())
    stop = _stop_tokens()

    # Positions of every token covered by an entity that negex marked as negated.
    negated_positions = set()
    for ent in doc.ents:
        if ent._.negex:
            negated_positions.update(range(ent.start, ent.end))

    tokens = []
    for i, token in enumerate(doc):
        lowered = str(token).lower()
        if lowered in stop:
            continue
        tokens.append("NEGEX_" + lowered if i in negated_positions else lowered)

    return tokens


# Cache for _stop_tokens(); filled on first use so the NLTK stop-word corpus
# is read once rather than once per document.
_STOP_TOKENS = None


def _stop_tokens():
    """Return the cached set of lower-cased tokens that preprocess() discards."""
    global _STOP_TOKENS
    if _STOP_TOKENS is None:
        _STOP_TOKENS = (
            frozenset(stopwords.words('english'))
            | frozenset(string.punctuation)
            # Tokens are lower-cased before the membership test, so the
            # placeholder has to be stored in lower case; the previous
            # upper-case "XXXX" entry could never match.
            # NOTE(review): presumably a de-identification marker in the
            # reports - confirm against the dataset.
            | {"xxxx"}
        )
    return _STOP_TOKENS

In [None]:
from gensim.corpora import Dictionary

# Values of the training set's "Findings" column as a plain Python list;
# this is the raw text that gets tokenized below.
train_text = train_df["Findings"].to_list()

def token_generator(text_list):
    """Lazily yield the preprocessed token list for each text in ``text_list``.

    Each text is passed through ``preprocess`` with the module-level ``nlp``
    pipeline only when the generator is advanced. The result can be consumed
    once; call this function again to make another pass over the texts.
    """
    yield from (preprocess(nlp, raw_text) for raw_text in text_list)

# Build the gensim vocabulary from the training token stream. The generator
# is consumed by this call and must be recreated before it is used again.
train_tokens = token_generator(train_text)
train_vocab_dict = Dictionary(documents=train_tokens)

In [None]:
# Values of the test set's "Findings" column as a plain Python list.
test_text = test_df["Findings"].to_list()
# Lazy generator: no text is preprocessed until it is iterated.
test_tokens = token_generator(test_text)

## Modeling - Naive Bayes

In [None]:
def sparse_vector_create(tuple_list, vocab_len):
    """Expand ``(index, value)`` pairs into a 1-D array of length ``vocab_len``.

    Args:
        tuple_list: Iterable of ``(index, value)`` pairs, shaped like the
            output of gensim's ``Dictionary.doc2bow``.
        vocab_len: Length of the returned vector.

    Returns:
        A NumPy array of zeros with ``value`` written at each ``index``; if an
        index appears more than once, the last value wins.
    """
    dense = np.zeros(vocab_len)
    for token_id, count in tuple_list:
        dense[token_id] = count
    return dense

def sparse_vector_generator(tokens, vocab_dict, vocab_len):
    """Lazily yield one count vector per token list in ``tokens``.

    Each token list is converted to ``(id, frequency)`` pairs with
    ``vocab_dict.doc2bow`` and then expanded to a length-``vocab_len`` array
    by ``sparse_vector_create``.
    """
    for doc_tokens in tokens:
        bow_pairs = vocab_dict.doc2bow(doc_tokens)
        yield sparse_vector_create(bow_pairs, vocab_len)

# A generator can only be consumed once, and train_tokens was used up while
# building the vocabulary, so create a fresh one; otherwise the list below
# would come out empty.
train_tokens = token_generator(train_text)
x_train_sparse = list(
    sparse_vector_generator(train_tokens, train_vocab_dict, len(train_vocab_dict))
)

# Target labels for both splits.
y_train = train_df['label'].to_list()
y_test = test_df['label'].to_list()

In [None]:
from sklearn import naive_bayes

# Multinomial Naive Bayes on the raw term counts, with additive smoothing alpha=1.0.
nb_classifier = naive_bayes.MultinomialNB(alpha=1.0)
nb_classifier.fit(X=x_train_sparse, y=y_train)

In [None]:
# Classify the test reports one at a time, encoding each with the vocabulary
# learned from the training set. A fresh generator is needed for every pass.
test_tokens = token_generator(test_text)
nb_predictions = []

for token in test_tokens:
    bag_of_words = train_vocab_dict.doc2bow(token)
    test_sparse_vector = sparse_vector_create(bag_of_words, len(train_vocab_dict))
    # predict() is given a single-row 2-D array and returns a one-element result.
    single_row = test_sparse_vector.reshape(1, -1)
    nb_predictions.append(nb_classifier.predict(single_row)[0])

## Modeling - Support Vector Machine (SVM)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the training term counts with TF-IDF (smoothed IDF, sublinear TF).
# The transformer is fitted on the training data only and reused for the test data.
tfidf = TfidfTransformer(sublinear_tf=True, smooth_idf=True, use_idf=True)
x_train_tfidf = tfidf.fit_transform(x_train_sparse)

In [None]:
from sklearn import svm

# Linear-kernel support vector classifier (C=1.0) trained on the TF-IDF features.
SVM = svm.SVC(kernel='linear', C=1.0)
SVM.fit(x_train_tfidf, y_train)

In [None]:
# Classify the test reports one at a time: encode with the training
# vocabulary, apply the TF-IDF weights fitted on the training set, then predict.
test_tokens = token_generator(test_text)
svm_predictions = []

for token in test_tokens:
    bag_of_words = train_vocab_dict.doc2bow(token)
    test_sparse_vector = sparse_vector_create(bag_of_words, len(train_vocab_dict))
    # transform() is given a single-row 2-D array built from the count vector.
    single_row = test_sparse_vector.reshape(1, -1)
    x_test_tfidf = tfidf.transform(single_row)
    svm_predictions.append(SVM.predict(x_test_tfidf)[0])

## Evaluation

In [None]:
from sklearn import metrics

class metric_calc:
    """Binary-classification metrics derived from a 2x2 confusion matrix.

    The matrix comes from ``metrics.confusion_matrix(y, y_hat)``: rows are true
    labels, columns are predicted labels, and labels are in sorted order. The
    first (smaller-sorting) label is therefore treated as the negative class
    and the second as the positive class.
    """

    def __init__(self, y, y_hat):
        """Store the labels and unpack the confusion-matrix counts.

        Args:
            y: True labels.
            y_hat: Predicted labels, aligned with ``y``.

        Raises:
            ValueError: If ``y`` and ``y_hat`` together do not contain exactly
                two distinct classes, since the TN/FP/FN/TP layout below is
                only meaningful for a 2x2 matrix.
        """
        self.y_hat = y_hat
        self.y = y
        conf_matrix = metrics.confusion_matrix(y, y_hat)
        if conf_matrix.shape != (2, 2):
            raise ValueError(
                "metric_calc expects exactly two classes across y and y_hat; "
                f"got a confusion matrix of shape {conf_matrix.shape}"
            )

        self.true_neg = conf_matrix[0][0]
        self.false_pos = conf_matrix[0][1]
        self.false_neg = conf_matrix[1][0]
        self.true_pos = conf_matrix[1][1]

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator``, or NaN when the denominator is zero."""
        return numerator / denominator if denominator else float("nan")

    def conf_matrix_values(self):
        """Print the four confusion-matrix counts on one line."""
        print(f"TN: {self.true_neg}, FP: {self.false_pos}, FN: {self.false_neg} TP: {self.true_pos}")

    def sens_spec(self):
        """Print sensitivity, specificity and F1-score to four decimals.

        A metric whose denominator is zero (for example, sensitivity when
        there are no positive examples) is reported as ``nan`` instead of
        raising ``ZeroDivisionError``.
        """
        sens = self._ratio(self.true_pos, self.true_pos + self.false_neg)
        spec = self._ratio(self.true_neg, self.true_neg + self.false_pos)
        print(f"Sensitivity (aka recall) is {sens:.4f}")
        print(f"Specificity is {spec:.4f}")

        precision = self._ratio(self.true_pos, self.true_pos + self.false_pos)
        recall = sens  # recall and sensitivity are the same quantity
        f1 = self._ratio(2 * (precision * recall), precision + recall)
        print(f"F1-Score is {f1:.4f}")

    def incorrect_index(self):
        """Return the positions at which the predicted label differs from the true label."""
        return [
            i
            for i, (truth, guess) in enumerate(zip(self.y, self.y_hat))
            if truth != guess
        ]

In [None]:
# Report the confusion-matrix counts and derived metrics for each model on the
# test labels.
print("Naive Bayes")
nb_metric = metric_calc(y=y_test, y_hat=nb_predictions)
nb_metric.conf_matrix_values()
nb_metric.sens_spec()

print("SVM")
svm_metric = metric_calc(y=y_test, y_hat=svm_predictions)
svm_metric.conf_matrix_values()
svm_metric.sens_spec()