<a href="https://colab.research.google.com/github/cornelius152/Text-Classification---Data-Science-Internship/blob/main/Bio_ClinicalBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import Libraries & Packages**

In [1]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m75.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m89.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.1


In [3]:
# Importing global libraries
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import matplotlib.pyplot as plt
%matplotlib inline

# Import language libraries and packages
import nltk
nltk.download('punkt') # A pre-trained unsupervised machine learning model for tokenizing text into individual words or sentences
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4') # Open Multilingual WordNet (OMW)

from nltk.corpus import stopwords
nltk.download('stopwords') # For 'English' stopwords

from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet') # For lemmatization 

pd.options.mode.chained_assignment = None  # default='warn' and I am disabling it

import warnings
warnings.filterwarnings('ignore', message='.*', category=UserWarning, module='sklearn') # To suppress precision score warning

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


# **Load Bert Models**

In [4]:
# Load Bio_ClinicalBERT pre-trained word embeddings
biob_model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model.get_input_embeddings() returns the input embeddings of the BERT model.

.weight accesses the weights of the input embeddings, which are the word embeddings that have been learned by the model.

.detach() detaches the tensor so it can be used as a standalone tensor.

.cpu() moves the tensor from the GPU to the CPU, so that it can be used by the numpy library.

.numpy() converts the tensor to a numpy array, which is the format that the gensim library expects for the initial weights of word2vec

In [5]:
# Grab pre-trained word embeddings
biob_word_vectors = biob_model.get_input_embeddings().weight.detach().cpu().numpy()

In [6]:
# Looking at shape of vectors
print(biob_word_vectors.shape)

(28996, 768)


## Loading Tokenizer

In [7]:
# Bio_ClinicalBERT Tokenizer
bio_tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Load Data**

In [9]:
# Import files

# Unclassified (uc) data
uc_data = pd.read_csv('/content/drive/MyDrive/OneHealth Data Science Internship/Colab Notebooks/Data copy/unique_unclassified_questions.csv')
uc_data = uc_data.rename(columns={'question': 'Question'}) # Rename question column to match the question columns name in other datasets

# Classified (c) data
c_data = pd.read_excel('/content/drive/MyDrive/OneHealth Data Science Internship/Colab Notebooks/Data copy/categorized_questions.xlsx')

# CDC dataset
cdc_data = pd.read_csv('/content/drive/MyDrive/OneHealth Data Science Internship/Colab Notebooks/Data copy/Behavioral_Risk_Factor_Surveillance_System__BRFSS__Historical_Questions.csv')

# All classified data (c_data + cdc_data)
class_data = pd.concat([c_data, cdc_data])

# **Processor Function**

In [10]:
# Processor function to do everything in one go

# Create stopword variable holding all stopwords in English
stopwords = stopwords.words('english')

# Create Porter- and Lancaster- stemmer objects
porter = PorterStemmer()
lancaster = LancasterStemmer()

# Create lemmatizer object
wordnet_lemmatizer = WordNetLemmatizer()

def processor(data, feature, tokens='tokens'):
    
    # Drop duplicate rows based on a specified feature, but keep first instance of duplicate and reset index
    data = data.drop_duplicates(subset=feature, keep='first', ignore_index=True)

    # Tokenize feature of textual data, and create new column with tokenized text using BioBERT tokenizer
    data['bio_'+tokens] = data[feature].apply(lambda x: bio_tokenizer.tokenize(x, add_special_tokens=True))

    # Convert tokens into ids using BioBERT tokenizer
    data['bio_ids_'+tokens] = data['bio_'+tokens].apply(lambda x: bio_tokenizer.convert_tokens_to_ids(x))

    # Lowercasing and removing stop words, and making sure all textual data is alphanumeric for BioBERT
    data[tokens] = data['bio_'+tokens].apply(lambda x: 
                                      [word.lower() for word in x if word not in (stopwords) and word.isalnum()])
    
    # Remove empty lists
    data = data[data[tokens].map(len) > 0]

    # Processing tokens into string and creating a separate column of this for later use
    data['str tokens'] = data[tokens].apply(lambda x: ' '.join(x))
        
    # Applying bigrams
    data['bigrams'] = data[tokens].apply(lambda x: list(nltk.bigrams(x)))

    # Applying porter stem
    data['porter stem'] = data[tokens].apply(lambda x: [porter.stem(word) for word in x])
    
    # Applying lancaster stem
    data['lancaster stem'] = data[tokens].apply(lambda x: [lancaster.stem(word) for word in x])
    
    # Lemmatization
    data['lemma'] = data[tokens].apply(lambda x: [wordnet_lemmatizer.lemmatize(word) for word in x])
    
    # Pos-tagging
    data['postag'] = data[tokens].apply(lambda x: nltk.pos_tag(x))
    
    return data

### Tokens Processed

In [11]:
# Applying processor function too the uc, c, and cdc datasets, and only grabbing the TOKENS feature
tokens_uc_data = pd.DataFrame(processor(uc_data, 'Question')['tokens'])
tokens_c_data = pd.DataFrame(processor(c_data, 'Question')['tokens'])
tokens_cdc_data = pd.DataFrame(processor(cdc_data, 'Question')['tokens'])

# Data in index 1, axis 0 for uc dataset was an empty list, so manually dropping it
tokens_uc_data = tokens_uc_data.drop(index=1)

# Concatenating all three dataframes into one
tokens_all_data = pd.concat([tokens_uc_data, tokens_c_data, tokens_cdc_data])

# Concatenating classified datasets: classified OH and CDC datasets
tokens_classified_data = pd.concat([tokens_c_data, tokens_cdc_data])

In [12]:
tokens_c_data.head(3)

Unnamed: 0,tokens
0,"[age, group]"
1,[gender]
2,"[thinking, discomfort, consuming, wheat, whole..."


### Bigrams Processed

In [13]:
# Applying processor function too the uc, c, and cdc datasets, and only grabbing the Bigrams feature
bigram_uc_data = pd.DataFrame(processor(uc_data, 'Question')['bigrams'])
bigram_c_data = pd.DataFrame(processor(c_data, 'Question')['bigrams'])
bigram_cdc_data = pd.DataFrame(processor(cdc_data, 'Question')['bigrams'])

# Dropping empty lists in dataframes
bigram_uc_data_2 = bigram_uc_data[bigram_uc_data['bigrams'].map(len) > 0]
bigram_c_data_2 = bigram_c_data[bigram_c_data['bigrams'].map(len) > 0]
bigram_cdc_data_2 = bigram_cdc_data[bigram_cdc_data['bigrams'].map(len) > 0]

# Concatenating classified datasets: classified OH and CDC datasets
bigram_classified_data = pd.concat([bigram_c_data_2, bigram_cdc_data_2])

In [14]:
bigram_c_data_2.head(3)

Unnamed: 0,bigrams
0,"[(age, group)]"
2,"[(thinking, discomfort), (discomfort, consumin..."
3,"[(think, diagnosed), (diagnosed, doctor), (doc..."


### Lemma Processed


In [15]:
# Applying processor function too the uc, c, and cdc datasets, and only grabbing the Lemma feature
lemma_uc_data = pd.DataFrame(processor(uc_data, 'Question')['lemma'])
lemma_c_data = pd.DataFrame(processor(c_data, 'Question')['lemma'])
lemma_cdc_data = pd.DataFrame(processor(cdc_data, 'Question')['lemma'])

# Data in index 1, axis 0 for uc dataset was an empty list, so manually dropping it
lemma_uc_data = lemma_uc_data.drop(index=1)

# Concatenating classified datasets: classified OH and CDC datasets
lemma_classified_data = pd.concat([lemma_c_data, lemma_cdc_data])

In [16]:
lemma_c_data.head(3)

Unnamed: 0,lemma
0,"[age, group]"
1,[gender]
2,"[thinking, discomfort, consuming, wheat, whole..."


### Porter Processed

In [17]:
# Applying processor function too the uc, c, and cdc datasets, and only grabbing the Porter Stem feature
porter_uc_data = pd.DataFrame(processor(uc_data, 'Question')['porter stem'])
porter_c_data = pd.DataFrame(processor(c_data, 'Question')['porter stem'])
porter_cdc_data = pd.DataFrame(processor(cdc_data, 'Question')['porter stem'])

# Data in index 1, axis 0 for uc dataset was an empty list, so manually dropping it
porter_uc_data = porter_uc_data.drop(index=1)

# Concatenating classified datasets: classified OH and CDC datasets
porter_classified_data = pd.concat([porter_c_data, porter_cdc_data])

In [18]:
porter_c_data.head(3)

Unnamed: 0,porter stem
0,"[age, group]"
1,[gender]
2,"[think, discomfort, consum, wheat, whole, grai..."


### Lancaster Processed

In [19]:
# Applying processor function too the uc, c, and cdc datasets, and only grabbing the Lancaster Stem feature
lancaster_uc_data = pd.DataFrame(processor(uc_data, 'Question')['lancaster stem'])
lancaster_c_data = pd.DataFrame(processor(c_data, 'Question')['lancaster stem'])
lancaster_cdc_data = pd.DataFrame(processor(cdc_data, 'Question')['lancaster stem'])

# Data in index 1, axis 0 for uc dataset was an empty list, so manually dropping it
lancaster_uc_data = lancaster_uc_data.drop(index=1)

# Concatenating classified datasets: classified OH and CDC datasets
lancaster_classified_data = pd.concat([lancaster_c_data, lancaster_cdc_data])

In [20]:
lancaster_c_data.head(3)

Unnamed: 0,lancaster stem
0,"[ag, group]"
1,[gend]
2,"[think, discomfort, consum, whe, whol, grain, ..."


### Postag Processed

In [21]:
# Applying processor function too the uc, c, and cdc datasets, and only grabbing the Postag feature
postag_uc_data = pd.DataFrame(processor(uc_data, 'Question')['postag'])
postag_c_data = pd.DataFrame(processor(c_data, 'Question')['postag'])
postag_cdc_data = pd.DataFrame(processor(cdc_data, 'Question')['postag'])

# Data in index 1, axis 0 for uc dataset was an empty list, so manually dropping it
postag_uc_data = postag_uc_data.drop(index=1)

# Concatenating classified datasets: classified OH and CDC datasets
postag_classified_data = pd.concat([postag_c_data, postag_cdc_data])

In [22]:
postag_c_data.head(3)

Unnamed: 0,postag
0,"[(age, NN), (group, NN)]"
1,"[(gender, NN)]"
2,"[(thinking, VBG), (discomfort, NN), (consuming..."


# **List Builder**

In [23]:
# Function to create one long list of separate sentences that are all of type string so that I can use them as inputs for my model

# Function takes in two parameters: data as a dataframe; feature as the column of the dataframe
def list_builder(data, feature):
    data = data[feature] # Variable to hold the specific feature of the dataframe
    data_words = [] # Creating two empty lists to hold the words and sentences from the previous dataframe
    data_sents = []

    for lists in data: # Running a for loop through each row of the column of dataframe
        if len(lists) == 0: # Skip empty lists
            continue
        elif isinstance(lists[0], tuple):  # Check if list is a list of tuples - mainly doing this for postags
            words = ['_'.join(word) for word in lists]  # Join bigrams with a space
        else:
            words = lists  # Else, just use the list as is
        string = ' '.join(words) # Each row contains a list of tokens; so here I am joining words in the list and building one string, separated by a blank space (' ')
        data_words.append(string) # Appending the words to the empty data_words list

    data_sents = [word for word in data_words if word] # Going through each sentence in data_words, making sure it is a word, and appending that to the empty data_sents list
    
    return data_sents

## Tokens as List

In [24]:
# c_data
tokens_c_list = list_builder(tokens_c_data, 'tokens')

# uc_data
tokens_uc_list = list_builder(tokens_uc_data, 'tokens')

# cdc_data
tokens_cdc_list = list_builder(tokens_cdc_data, 'tokens')

# classified_data
tokens_classified_list = list_builder(tokens_classified_data, 'tokens')

## Bigrams as List

In [25]:
# c_data
bigram_c_list = list_builder(bigram_c_data, 'bigrams')

# uc_data
bigram_uc_list = list_builder(bigram_uc_data, 'bigrams')

# cdc_data
bigram_cdc_list = list_builder(bigram_cdc_data, 'bigrams')

# classified_data
bigram_classified_list = list_builder(bigram_classified_data, 'bigrams')

## Lemma as List

In [26]:
# c_data
lemma_c_list = list_builder(lemma_c_data, 'lemma')

# uc_data
lemma_uc_list = list_builder(lemma_uc_data, 'lemma')

# cdc_data
lemma_cdc_list = list_builder(lemma_cdc_data, 'lemma')

# classified_data
lemma_classified_list = list_builder(lemma_classified_data, 'lemma')

## Porter as List

In [27]:
# c_data
porter_c_list = list_builder(porter_c_data, 'porter stem')

# uc_data
porter_uc_list = list_builder(porter_uc_data, 'porter stem')

# cdc_data
porter_cdc_list = list_builder(porter_cdc_data, 'porter stem')

# classified_data
porter_classified_list = list_builder(porter_classified_data, 'porter stem')

## Lancaster as List

In [28]:
# c_data
lancaster_c_list = list_builder(lancaster_c_data, 'lancaster stem')

# uc_data
lancaster_uc_list = list_builder(lancaster_uc_data, 'lancaster stem')

# cdc_data
lancaster_cdc_list = list_builder(lancaster_cdc_data, 'lancaster stem')

# classified_data
lancaster_classified_list = list_builder(lancaster_classified_data, 'lancaster stem')

## Postag as List

In [29]:
# c_data
postag_c_list = list_builder(postag_c_data, 'postag')

# uc_data
postag_uc_list = list_builder(postag_uc_data, 'postag')

# cdc_data
postag_cdc_list = list_builder(postag_cdc_data, 'postag')

# classified_data
postag_classified_list = list_builder(postag_classified_data, 'postag')

# **BERT Embeddings**

In [30]:
# Function to build BERT Embeddings

def bert_embeddings(data_list, bio_tokenizer=bio_tokenizer, biob_model=biob_model, max_length=100):
    # Encode inputs ---------------------
    
    # Encode the tokenized questions using Bio BERT tokenizer
    bio_encoded_inputs = bio_tokenizer(data_list, return_tensors='pt', padding=True, truncation=True, max_length=max_length)

    # Extract output (hidden states and final representation) ---------------------

    # Pass the encoded inputs through the Bio BERT model
    with torch.no_grad():
        bio_model_output = biob_model(**bio_encoded_inputs)

    # Extract sentence-level embeddings ---------------------

    # Get the Bio BERT sentence-level embeddings (CLS (classification) token)
    bio_embeddings = bio_model_output.last_hidden_state[:, 0, :].numpy()

    # Convert embeddings to a list to feed to ML models
    bio_embeddings_list = bio_embeddings.tolist()

    return bio_embeddings_list

model_output.last_hidden_state is a tensor of shape (batch_size, sequence_length, hidden_size), where batch_size is the number of input sequences, sequence_length is the length of the longest input sequence, and hidden_size is the size of the output embeddings.

[:, 0, :] selects the first token of each input sequence from the output tensor, which corresponds to the [CLS] token. The [CLS] token is a special token added at the beginning of the input sequence and its corresponding output vector is often used as a summary or representation of the entire input sequence.

## Tokens BERT Embeddings

In [31]:
# c_data embeddings
tokens_c_biob_embeddings = bert_embeddings(tokens_c_list)

# uc_data embeddings
tokens_uc_biob_embeddings = bert_embeddings(tokens_uc_list)

# cdc_data embeddings
tokens_cdc_biob_embeddings = bert_embeddings(tokens_cdc_list)

# class_data embeddings
tokens_classified_biob_embeddings = bert_embeddings(tokens_classified_list)

## Bigrams BERT Embeddings

In [32]:
# c_data embeddings
bigram_c_biob_embeddings = bert_embeddings(bigram_c_list)

# uc_data embeddings
bigram_uc_biob_embeddings = bert_embeddings(bigram_uc_list)

# cdc_data embeddings
bigram_cdc_biob_embeddings = bert_embeddings(bigram_cdc_list)

# class_data embeddings
bigram_classified_biob_embeddings = bert_embeddings(bigram_classified_list)

## Lemma BERT Embeddings

In [33]:
# c_data embeddings
lemma_c_biob_embeddings = bert_embeddings(lemma_c_list)

# uc_data embeddings
lemma_uc_biob_embeddings = bert_embeddings(lemma_uc_list)

# cdc_data embeddings
lemma_cdc_biob_embeddings = bert_embeddings(lemma_cdc_list)

# class_data embeddings
lemma_classified_biob_embeddings = bert_embeddings(lemma_classified_list)

## Porter BERT Embeddings

In [34]:
# c_data embeddings
porter_c_biob_embeddings = bert_embeddings(porter_c_list)

# uc_data embeddings
porter_uc_biob_embeddings = bert_embeddings(porter_uc_list)

# cdc_data embeddings
porter_cdc_biob_embeddings = bert_embeddings(porter_cdc_list)

# class_data embeddings
porter_classified_biob_embeddings = bert_embeddings(porter_classified_list)

## Lancaster BERT Embeddings

In [35]:
# c_data embeddings
lancaster_c_biob_embeddings = bert_embeddings(lancaster_c_list)

# uc_data embeddings
lancaster_uc_biob_embeddings = bert_embeddings(lancaster_uc_list)

# cdc_data embeddings
lancaster_cdc_biob_embeddings = bert_embeddings(lancaster_cdc_list)

# class_data embeddings
lancaster_classified_biob_embeddings = bert_embeddings(lancaster_classified_list)

## Postag BERT Embeddings

In [36]:
# c_data embeddings
postag_c_biob_embeddings = bert_embeddings(postag_c_list)

# uc_data embeddings
postag_uc_biob_embeddings = bert_embeddings(postag_uc_list)

# cdc_data embeddings
postag_cdc_biob_embeddings = bert_embeddings(postag_cdc_list)

# class_data embeddings
postag_classified_biob_embeddings = bert_embeddings(postag_classified_list)

## Split Function for All Embeddings

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder # # Calling labelencoder to encode truth labels from string to numerical values

In [38]:
# Building train-test split function

def split(embeddings_data, original_data, test_size=0.2, random_state=42):
    # Encoding labels ----------------,

    # Create label encoder object
    label_encoder = LabelEncoder()

    # Grabbing only topic labels
    topics_df = processor(original_data, 'Question')['Topic']

    # Encoding topic labels from classified dataset
    topics_encoded = label_encoder.fit_transform(topics_df)

    # Convert each list to a numpy array of fixed length and pad
    max_len = max(len(lst) for lst in embeddings_data)
    x_padded = [np.pad(lst, (0, max_len - len(lst)), 'constant') for lst in embeddings_data]

    # Stack into 2D numpy array
    x_arr = np.vstack(x_padded)

    # Creating y variable
    y = topics_encoded
    y_arr = np.array(y)

    # Perform train-test split
    x_train, x_test, y_train, y_test = train_test_split(x_arr, y_arr, test_size=test_size, random_state=random_state)

    # Return the split data
    return x_train, x_test, y_train, y_test

## Tokens Split

In [39]:
# Classified OH data
x_train_tokens_c_bio, x_test_tokens_c_bio, y_train_tokens_c_bio, y_test_tokens_c_bio = split(tokens_c_biob_embeddings, c_data)

# CDC data
x_train_tokens_cdc_bio, x_test_tokens_cdc_bio, y_train_tokens_cdc_bio, y_test_tokens_cdc_bio = split(tokens_cdc_biob_embeddings, cdc_data)

# All classified data
x_train_tokens_class_bio, x_test_tokens_class_bio, y_train_tokens_class_bio, y_test_tokens_class_bio = split(tokens_classified_biob_embeddings, class_data)

## Bigram Split

In [40]:
# Classified OH data
# x_train_tokens_c_bio, x_test_tokens_c_bio, y_train_tokens_c_bio, y_test_tokens_c_bio = split(bigram_c_biob_embeddings, c_data)

# CDC data
# x_train_tokens_cdc_bio, x_test_tokens_cdc_bio, y_train_tokens_cdc_bio, y_test_tokens_cdc_bio = split(bigram_cdc_biob_embeddings, cdc_data)

# All classified data
# x_train_tokens_class_bio, x_test_tokens_class_bio, y_train_tokens_class_bio, y_test_tokens_class_bio = split(bigram_classified_biob_embeddings, class_data)


## Lemma Split

In [41]:
# Classified OH data
x_train_lemma_c_bio, x_test_lemma_c_bio, y_train_lemma_c_bio, y_test_lemma_c_bio = split(lemma_c_biob_embeddings, c_data)

# CDC data
x_train_lemma_cdc_bio, x_test_lemma_cdc_bio, y_train_lemma_cdc_bio, y_test_lemma_cdc_bio = split(lemma_cdc_biob_embeddings, cdc_data)

# All classified data
x_train_lemma_class_bio, x_test_lemma_class_bio, y_train_lemma_class_bio, y_test_lemma_class_bio = split(lemma_classified_biob_embeddings, class_data)

## Porter Split

In [42]:
# Classified OH data
x_train_porter_c_bio, x_test_porter_c_bio, y_train_porter_c_bio, y_test_porter_c_bio = split(porter_c_biob_embeddings, c_data)

# CDC data
x_train_porter_cdc_bio, x_test_porter_cdc_bio, y_train_porter_cdc_bio, y_test_porter_cdc_bio = split(porter_cdc_biob_embeddings, cdc_data)

# All classified data
x_train_porter_class_bio, x_test_porter_class_bio, y_train_porter_class_bio, y_test_porter_class_bio = split(porter_classified_biob_embeddings, class_data)

## Postag Split

In [43]:
# Classified OH data
x_train_lancaster_c_bio, x_test_lancaster_c_bio, y_train_lancaster_c_bio, y_test_lancaster_c_bio = split(lancaster_c_biob_embeddings, c_data)

# CDC data
x_train_lancaster_cdc_bio, x_test_lancaster_cdc_bio, y_train_lancaster_cdc_bio, y_test_lancaster_cdc_bio = split(lancaster_cdc_biob_embeddings, cdc_data)

# All classified data
x_train_lancaster_class_bio, x_test_lancaster_class_bio, y_train_lancaster_class_bio, y_test_lancaster_class_bio = split(lancaster_classified_biob_embeddings, class_data)

## Postag Split

In [44]:
# Classified OH data
x_train_postag_c_bio, x_test_postag_c_bio, y_train_postag_c_bio, y_test_postag_c_bio = split(postag_c_biob_embeddings, c_data)

# CDC data
x_train_postag_cdc_bio, x_test_postag_cdc_bio, y_train_postag_cdc_bio, y_test_postag_cdc_bio = split(postag_cdc_biob_embeddings, cdc_data)

# All classified data
x_train_postag_class_bio, x_test_postag_class_bio, y_train_postag_class_bio, y_test_postag_class_bio = split(postag_classified_biob_embeddings, class_data)

# **Supervised Learning**

# **Linear Regression**

In [45]:
# Linear Regression
from sklearn.linear_model import LinearRegression

# Evaluation metrics
from sklearn.metrics import r2_score, mean_absolute_error

In [46]:
# Building Linear Regression function
def linear_regression(x_train, x_test, y_train, y_test):
    # Create linear regression object and fit model
    model = LinearRegression().fit(x_train, y_train)

    # Prediction
    y_pred = model.predict(x_test)

    # Calculate R2 and MAE
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    print(f' r2 value: {r2}')
    print(f'mae value: {mae}')

## Tokens Linear Regression

In [47]:
# Classified OH data
print('Linear Regression evaluation scores for c_data -------')
linear_regression(x_train_tokens_c_bio, x_test_tokens_c_bio, y_train_tokens_c_bio, y_test_tokens_c_bio)
print()

# CDC data
print('Linear Regression evaluation scores for cdc_data -------')
linear_regression(x_train_tokens_cdc_bio, x_test_tokens_cdc_bio, y_train_tokens_cdc_bio, y_test_tokens_cdc_bio)
print()

# Classified OH data
print('Linear Regression evaluation scores for class_data -------')
linear_regression(x_train_tokens_class_bio, x_test_tokens_class_bio, y_train_tokens_class_bio, y_test_tokens_class_bio)

Linear Regression evaluation scores for c_data -------
 r2 value: 0.42877898985481744
mae value: 2.7099050351982394

Linear Regression evaluation scores for cdc_data -------
 r2 value: -0.3999210446642243
mae value: 35.70573328435985

Linear Regression evaluation scores for class_data -------
 r2 value: -0.3231079953702565
mae value: 39.550417052535614


## Lemma Linear Regression

In [48]:
# Classified OH data
print('Linear Regression evaluation scores for c_data -------')
linear_regression(x_train_lemma_c_bio, x_test_lemma_c_bio, y_train_lemma_c_bio, y_test_lemma_c_bio)
print()

# CDC data
print('Linear Regression evaluation scores for cdc_data -------')
linear_regression(x_train_lemma_cdc_bio, x_test_lemma_cdc_bio, y_train_lemma_cdc_bio, y_test_lemma_cdc_bio)
print()

# Classified OH data
print('Linear Regression evaluation scores for class_data -------')
linear_regression(x_train_lemma_class_bio, x_test_lemma_class_bio, y_train_lemma_class_bio, y_test_lemma_class_bio)

Linear Regression evaluation scores for c_data -------
 r2 value: 0.4193519880539943
mae value: 2.624288453906208

Linear Regression evaluation scores for cdc_data -------
 r2 value: -0.4978639978806423
mae value: 36.83823505138809

Linear Regression evaluation scores for class_data -------
 r2 value: -0.3152948629094712
mae value: 40.07003395190212


## Porter Linear Regression

In [49]:
# Classified OH data
print('Linear Regression evaluation scores for c_data -------')
linear_regression(x_train_porter_c_bio, x_test_porter_c_bio, y_train_porter_c_bio, y_test_porter_c_bio)
print()

# CDC data
print('Linear Regression evaluation scores for cdc_data -------')
linear_regression(x_train_porter_cdc_bio, x_test_porter_cdc_bio, y_train_porter_cdc_bio, y_test_porter_cdc_bio)
print()

# Classified OH data
print('Linear Regression evaluation scores for class_data -------')
linear_regression(x_train_porter_class_bio, x_test_porter_class_bio, y_train_porter_class_bio, y_test_porter_class_bio)

Linear Regression evaluation scores for c_data -------
 r2 value: 0.429027922426093
mae value: 2.4194163989741897

Linear Regression evaluation scores for cdc_data -------
 r2 value: -0.7051310098558696
mae value: 39.57697949407792

Linear Regression evaluation scores for class_data -------
 r2 value: -0.38311769891404546
mae value: 41.33928699798512


## Lancaster Linear Regression

In [50]:
# Classified OH data
print('Linear Regression evaluation scores for c_data -------')
linear_regression(x_train_lancaster_c_bio, x_test_lancaster_c_bio, y_train_lancaster_c_bio, y_test_lancaster_c_bio)
print()

# CDC data
print('Linear Regression evaluation scores for cdc_data -------')
linear_regression(x_train_lancaster_cdc_bio, x_test_lancaster_cdc_bio, y_train_lancaster_cdc_bio, y_test_lancaster_cdc_bio)
print()

# Classified OH data
print('Linear Regression evaluation scores for class_data -------')
linear_regression(x_train_lancaster_class_bio, x_test_lancaster_class_bio, y_train_lancaster_class_bio, y_test_lancaster_class_bio)

Linear Regression evaluation scores for c_data -------
 r2 value: 0.5764312521949913
mae value: 2.384223913467877

Linear Regression evaluation scores for cdc_data -------
 r2 value: -0.3327440818121363
mae value: 35.617635538177844

Linear Regression evaluation scores for class_data -------
 r2 value: -0.08838811244002232
mae value: 36.844201725153646


## Postag Linear Regression

In [51]:
# Classified OH data
print('Linear Regression evaluation scores for c_data -------')
linear_regression(x_train_postag_c_bio, x_test_postag_c_bio, y_train_postag_c_bio, y_test_postag_c_bio)
print()

# CDC data
print('Linear Regression evaluation scores for cdc_data -------')
linear_regression(x_train_postag_cdc_bio, x_test_postag_cdc_bio, y_train_postag_cdc_bio, y_test_postag_cdc_bio)
print()

# Classified OH data
print('Linear Regression evaluation scores for class_data -------')
linear_regression(x_train_postag_class_bio, x_test_postag_class_bio, y_train_postag_class_bio, y_test_postag_class_bio)

Linear Regression evaluation scores for c_data -------
 r2 value: 0.5345626004768232
mae value: 2.3294621223423304

Linear Regression evaluation scores for cdc_data -------
 r2 value: -1.1999738502850996
mae value: 44.3858032902745

Linear Regression evaluation scores for class_data -------
 r2 value: -0.6445503775176071
mae value: 45.71421070894695


# **Logistic Regression**

In [55]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

In [56]:
# Building logistic regression function

def logistic_regression(x_train, x_test, y_train, y_test):
    # Create logistic regression object and fit model    
    lr_model = LogisticRegression(random_state=42, max_iter=10000).fit(x_train, y_train)

    # Make predictions
    y_pred = lr_model.predict(x_test)

    # Evaluate model
    accuracy = accuracy_score(y_test, y_pred)
    # average='macro' as our data is imbalanced and it ensures that the performance of the model on each class is given equal weight in score calculation
    f1 = f1_score(y_test, np.round(y_pred), average='macro')
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')

    print(f'Accuracy: {accuracy * 100.0:.2f}%')
    print(f'F1: {f1 * 100.0:.2f}%')
    print(f'precision: {precision * 100.0:.2f}%')
    print(f'Recall: {precision * 100.0:.2f}%')

## Tokens Logistic Regression

In [57]:
# Classified OH data
print('logistic Regression evaluation scores for c_data -------')
logistic_regression(x_train_tokens_c_bio, x_test_tokens_c_bio, y_train_tokens_c_bio, y_test_tokens_c_bio)
print()

# CDC data
print('logistic Regression evaluation scores for cdc_data -------')
logistic_regression(x_train_tokens_cdc_bio, x_test_tokens_cdc_bio, y_train_tokens_cdc_bio, y_test_tokens_cdc_bio)
print()

# Classified OH data
print('logistic Regression evaluation scores for class_data -------')
logistic_regression(x_train_tokens_class_bio, x_test_tokens_class_bio, y_train_tokens_class_bio, y_test_tokens_class_bio)

logistic Regression evaluation scores for c_data -------
Accuracy: 53.85%
F1: 30.21%
precision: 31.25%
Recall: 31.25%

logistic Regression evaluation scores for cdc_data -------
Accuracy: 63.64%
F1: 42.15%
precision: 44.08%
Recall: 44.08%

logistic Regression evaluation scores for class_data -------
Accuracy: 66.08%
F1: 44.69%
precision: 47.64%
Recall: 47.64%


## Lemma Logistic Regression

In [58]:
# Classified OH data
print('logistic Regression evaluation scores for c_data -------')
logistic_regression(x_train_lemma_c_bio, x_test_lemma_c_bio, y_train_lemma_c_bio, y_test_lemma_c_bio)
print()

# CDC data
print('logistic Regression evaluation scores for cdc_data -------')
logistic_regression(x_train_lemma_cdc_bio, x_test_lemma_cdc_bio, y_train_lemma_cdc_bio, y_test_lemma_cdc_bio)
print()

# Classified OH data
print('logistic Regression evaluation scores for class_data -------')
logistic_regression(x_train_lemma_class_bio, x_test_lemma_class_bio, y_train_lemma_class_bio, y_test_lemma_class_bio)

logistic Regression evaluation scores for c_data -------
Accuracy: 53.85%
F1: 30.21%
precision: 31.25%
Recall: 31.25%

logistic Regression evaluation scores for cdc_data -------
Accuracy: 63.33%
F1: 41.97%
precision: 43.55%
Recall: 43.55%

logistic Regression evaluation scores for class_data -------
Accuracy: 66.67%
F1: 44.30%
precision: 47.88%
Recall: 47.88%


## Porter Logistic Regression

In [59]:
# Classified OH data
print('logistic Regression evaluation scores for c_data -------')
logistic_regression(x_train_porter_c_bio, x_test_porter_c_bio, y_train_porter_c_bio, y_test_porter_c_bio)
print()

# CDC data
print('logistic Regression evaluation scores for cdc_data -------')
logistic_regression(x_train_porter_cdc_bio, x_test_porter_cdc_bio, y_train_porter_cdc_bio, y_test_porter_cdc_bio)
print()

# Classified OH data
print('logistic Regression evaluation scores for class_data -------')
logistic_regression(x_train_porter_class_bio, x_test_porter_class_bio, y_train_porter_class_bio, y_test_porter_class_bio)

logistic Regression evaluation scores for c_data -------
Accuracy: 53.85%
F1: 26.07%
precision: 27.50%
Recall: 27.50%

logistic Regression evaluation scores for cdc_data -------
Accuracy: 64.24%
F1: 42.37%
precision: 43.39%
Recall: 43.39%

logistic Regression evaluation scores for class_data -------
Accuracy: 64.04%
F1: 41.68%
precision: 46.24%
Recall: 46.24%


## Lancaster Logistic Regression

In [60]:
# Classified OH data
print('logistic Regression evaluation scores for c_data -------')
logistic_regression(x_train_lancaster_c_bio, x_test_lancaster_c_bio, y_train_lancaster_c_bio, y_test_lancaster_c_bio)
print()

# CDC data
print('logistic Regression evaluation scores for cdc_data -------')
logistic_regression(x_train_lancaster_cdc_bio, x_test_lancaster_cdc_bio, y_train_lancaster_cdc_bio, y_test_lancaster_cdc_bio)
print()

# Classified OH data
print('logistic Regression evaluation scores for class_data -------')
logistic_regression(x_train_lancaster_class_bio, x_test_lancaster_class_bio, y_train_lancaster_class_bio, y_test_lancaster_class_bio)

logistic Regression evaluation scores for c_data -------
Accuracy: 38.46%
F1: 15.62%
precision: 15.83%
Recall: 15.83%

logistic Regression evaluation scores for cdc_data -------
Accuracy: 62.73%
F1: 41.09%
precision: 43.33%
Recall: 43.33%

logistic Regression evaluation scores for class_data -------
Accuracy: 64.04%
F1: 43.73%
precision: 45.95%
Recall: 45.95%


## Postag Logistic Regression

In [61]:
# Classified OH data
print('logistic Regression evaluation scores for c_data -------')
logistic_regression(x_train_postag_c_bio, x_test_postag_c_bio, y_train_postag_c_bio, y_test_postag_c_bio)
print()

# CDC data
print('logistic Regression evaluation scores for cdc_data -------')
logistic_regression(x_train_postag_cdc_bio, x_test_postag_cdc_bio, y_train_postag_cdc_bio, y_test_postag_cdc_bio)
print()

# Classified OH data
print('logistic Regression evaluation scores for class_data -------')
logistic_regression(x_train_postag_class_bio, x_test_postag_class_bio, y_train_postag_class_bio, y_test_postag_class_bio)

logistic Regression evaluation scores for c_data -------
Accuracy: 53.85%
F1: 29.17%
precision: 28.12%
Recall: 28.12%

logistic Regression evaluation scores for cdc_data -------
Accuracy: 58.18%
F1: 38.35%
precision: 42.11%
Recall: 42.11%

logistic Regression evaluation scores for class_data -------
Accuracy: 57.02%
F1: 35.22%
precision: 38.56%
Recall: 38.56%


# **Decision Tree**

In [62]:
from sklearn.tree import DecisionTreeClassifier

In [63]:
def decision_tree(x_train, x_test, y_train, y_test):
    # Create a decision tree object and fit classifer
    clf = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)

    # Make predictions
    y_pred = clf.predict(x_test)

    # Evaluate model
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, np.round(y_pred), average='macro')
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')

    print(f'Accuracy: {accuracy * 100.0:.2f}%')
    print(f'F1: {f1 * 100.0:.2f}%')
    print(f'precision: {precision * 100.0:.2f}%')
    print(f'Recall: {precision * 100.0:.2f}%')

## Tokens Decision Tree

In [64]:
# Classified OH data
print('Decision Tree evaluation scores for c_data -------')
decision_tree(x_train_tokens_c_bio, x_test_tokens_c_bio, y_train_tokens_c_bio, y_test_tokens_c_bio)
print()

# CDC data
print('Decision Tree evaluation scores for cdc_data -------')
decision_tree(x_train_tokens_cdc_bio, x_test_tokens_cdc_bio, y_train_tokens_cdc_bio, y_test_tokens_cdc_bio)
print()

# Classified OH data
print('Decision Tree evaluation scores for class_data -------')
decision_tree(x_train_tokens_class_bio, x_test_tokens_class_bio, y_train_tokens_class_bio, y_test_tokens_class_bio)

Decision Tree evaluation scores for c_data -------
Accuracy: 46.15%
F1: 22.92%
precision: 27.08%
Recall: 27.08%

Decision Tree evaluation scores for cdc_data -------
Accuracy: 19.39%
F1: 9.88%
precision: 11.18%
Recall: 11.18%

Decision Tree evaluation scores for class_data -------
Accuracy: 21.05%
F1: 11.07%
precision: 13.25%
Recall: 13.25%


## Lemma Decision Tree

In [65]:
# Classified OH data
print('Decision Tree evaluation scores for c_data -------')
decision_tree(x_train_lemma_c_bio, x_test_lemma_c_bio, y_train_lemma_c_bio, y_test_lemma_c_bio)
print()

# CDC data
print('Decision Tree evaluation scores for cdc_data -------')
decision_tree(x_train_lemma_cdc_bio, x_test_lemma_cdc_bio, y_train_lemma_cdc_bio, y_test_lemma_cdc_bio)
print()

# Classified OH data
print('Decision Tree evaluation scores for class_data -------')
decision_tree(x_train_lemma_class_bio, x_test_lemma_class_bio, y_train_lemma_class_bio, y_test_lemma_class_bio)

Decision Tree evaluation scores for c_data -------
Accuracy: 38.46%
F1: 13.27%
precision: 14.81%
Recall: 14.81%

Decision Tree evaluation scores for cdc_data -------
Accuracy: 21.52%
F1: 12.11%
precision: 13.20%
Recall: 13.20%

Decision Tree evaluation scores for class_data -------
Accuracy: 22.22%
F1: 13.19%
precision: 13.69%
Recall: 13.69%


## Porter Decision Tree

In [66]:
# Classified OH data
print('Decision Tree evaluation scores for c_data -------')
decision_tree(x_train_porter_c_bio, x_test_porter_c_bio, y_train_porter_c_bio, y_test_porter_c_bio)
print()

# CDC data
print('Decision Tree evaluation scores for cdc_data -------')
decision_tree(x_train_porter_cdc_bio, x_test_porter_cdc_bio, y_train_porter_cdc_bio, y_test_porter_cdc_bio)
print()

# Classified OH data
print('Decision Tree evaluation scores for class_data -------')
decision_tree(x_train_porter_class_bio, x_test_porter_class_bio, y_train_porter_class_bio, y_test_porter_class_bio)

Decision Tree evaluation scores for c_data -------
Accuracy: 38.46%
F1: 12.78%
precision: 11.11%
Recall: 11.11%

Decision Tree evaluation scores for cdc_data -------
Accuracy: 19.09%
F1: 10.75%
precision: 13.06%
Recall: 13.06%

Decision Tree evaluation scores for class_data -------
Accuracy: 23.39%
F1: 11.79%
precision: 12.92%
Recall: 12.92%


## Lancaster Decision Tree

In [67]:
# Classified OH data
print('Decision Tree evaluation scores for c_data -------')
decision_tree(x_train_lancaster_c_bio, x_test_lancaster_c_bio, y_train_lancaster_c_bio, y_test_lancaster_c_bio)
print()

# CDC data
print('Decision Tree evaluation scores for cdc_data -------')
decision_tree(x_train_lancaster_cdc_bio, x_test_lancaster_cdc_bio, y_train_lancaster_cdc_bio, y_test_lancaster_cdc_bio)
print()

# Classified OH data
print('Decision Tree evaluation scores for class_data -------')
decision_tree(x_train_lancaster_class_bio, x_test_lancaster_class_bio, y_train_lancaster_class_bio, y_test_lancaster_class_bio)

Decision Tree evaluation scores for c_data -------
Accuracy: 15.38%
F1: 5.56%
precision: 7.41%
Recall: 7.41%

Decision Tree evaluation scores for cdc_data -------
Accuracy: 19.09%
F1: 9.08%
precision: 9.12%
Recall: 9.12%

Decision Tree evaluation scores for class_data -------
Accuracy: 22.51%
F1: 10.87%
precision: 12.02%
Recall: 12.02%


## Postag Decision Tree

In [68]:
# Classified OH data
print('Decision Tree evaluation scores for c_data -------')
decision_tree(x_train_postag_c_bio, x_test_postag_c_bio, y_train_postag_c_bio, y_test_postag_c_bio)
print()

# CDC data
print('Decision Tree evaluation scores for cdc_data -------')
decision_tree(x_train_postag_cdc_bio, x_test_postag_cdc_bio, y_train_postag_cdc_bio, y_test_postag_cdc_bio)
print()

# Classified OH data
print('Decision Tree evaluation scores for class_data -------')
decision_tree(x_train_postag_class_bio, x_test_postag_class_bio, y_train_postag_class_bio, y_test_postag_class_bio)

Decision Tree evaluation scores for c_data -------
Accuracy: 46.15%
F1: 37.36%
precision: 35.71%
Recall: 35.71%

Decision Tree evaluation scores for cdc_data -------
Accuracy: 20.61%
F1: 11.72%
precision: 11.84%
Recall: 11.84%

Decision Tree evaluation scores for class_data -------
Accuracy: 18.13%
F1: 9.65%
precision: 10.67%
Recall: 10.67%


# **Random Forest**

In [69]:
from sklearn.ensemble import RandomForestClassifier

In [70]:
def random_forest(x_train, x_test, y_train, y_test):
    # Create a random forest object and fit classifier
    rf_clf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42).fit(x_train, y_train)

    # Make predictions
    y_pred = rf_clf.predict(x_test)

    # Evaluate model
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, np.round(y_pred), average='macro')
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')

    print(f'Accuracy: {accuracy * 100.0:.2f}%')
    print(f'F1: {f1 * 100.0:.2f}%')
    print(f'precision: {precision * 100.0:.2f}%')
    print(f'Recall: {recall * 100.0:.2f}%')

## Tokens Random Forest

In [71]:
# Classified OH data
print('Random Forest evaluation scores for c_data -------')
random_forest(x_train_tokens_c_bio, x_test_tokens_c_bio, y_train_tokens_c_bio, y_test_tokens_c_bio)
print()

# CDC data
print('Random Forest evaluation scores for cdc_data -------')
random_forest(x_train_tokens_cdc_bio, x_test_tokens_cdc_bio, y_train_tokens_cdc_bio, y_test_tokens_cdc_bio)
print()

# Classified OH data
print('Random Forest evaluation scores for class_data -------')
random_forest(x_train_tokens_class_bio, x_test_tokens_class_bio, y_train_tokens_class_bio, y_test_tokens_class_bio)

Random Forest evaluation scores for c_data -------
Accuracy: 46.15%
F1: 19.09%
precision: 15.48%
Recall: 26.67%

Random Forest evaluation scores for cdc_data -------
Accuracy: 16.97%
F1: 4.40%
precision: 6.00%
Recall: 5.64%

Random Forest evaluation scores for class_data -------
Accuracy: 14.62%
F1: 3.48%
precision: 4.69%
Recall: 5.14%


## Lemma Random Forest

In [72]:
# Classified OH data
print('Random Forest evaluation scores for c_data -------')
random_forest(x_train_lemma_c_bio, x_test_lemma_c_bio, y_train_lemma_c_bio, y_test_lemma_c_bio)
print()

# CDC data
print('Random Forest evaluation scores for cdc_data -------')
random_forest(x_train_lemma_cdc_bio, x_test_lemma_cdc_bio, y_train_lemma_cdc_bio, y_test_lemma_cdc_bio)
print()

# Classified OH data
print('Random Forest evaluation scores for class_data -------')
random_forest(x_train_lemma_class_bio, x_test_lemma_class_bio, y_train_lemma_class_bio, y_test_lemma_class_bio)

Random Forest evaluation scores for c_data -------
Accuracy: 46.15%
F1: 15.83%
precision: 14.73%
Recall: 20.00%

Random Forest evaluation scores for cdc_data -------
Accuracy: 16.67%
F1: 3.89%
precision: 4.02%
Recall: 5.43%

Random Forest evaluation scores for class_data -------
Accuracy: 13.45%
F1: 3.01%
precision: 4.30%
Recall: 4.52%


## Porter Random Forest

In [73]:
# Classified OH data
print('Random Forest evaluation scores for c_data -------')
random_forest(x_train_porter_c_bio, x_test_porter_c_bio, y_train_porter_c_bio, y_test_porter_c_bio)
print()

# CDC data
print('Random Forest evaluation scores for cdc_data -------')
random_forest(x_train_porter_cdc_bio, x_test_porter_cdc_bio, y_train_porter_cdc_bio, y_test_porter_cdc_bio)
print()

# Classified OH data
print('Random Forest evaluation scores for class_data -------')
random_forest(x_train_porter_class_bio, x_test_porter_class_bio, y_train_porter_class_bio, y_test_porter_class_bio)

Random Forest evaluation scores for c_data -------
Accuracy: 38.46%
F1: 15.31%
precision: 19.05%
Recall: 20.00%

Random Forest evaluation scores for cdc_data -------
Accuracy: 16.06%
F1: 4.07%
precision: 6.45%
Recall: 5.31%

Random Forest evaluation scores for class_data -------
Accuracy: 13.45%
F1: 2.83%
precision: 4.38%
Recall: 4.57%


## Lancaster Random Forest

In [74]:
# Classified OH data
print('Random Forest evaluation scores for c_data -------')
random_forest(x_train_lancaster_c_bio, x_test_lancaster_c_bio, y_train_lancaster_c_bio, y_test_lancaster_c_bio)
print()

# CDC data
print('Random Forest evaluation scores for cdc_data -------')
random_forest(x_train_lancaster_cdc_bio, x_test_lancaster_cdc_bio, y_train_lancaster_cdc_bio, y_test_lancaster_cdc_bio)
print()

# Classified OH data
print('Random Forest evaluation scores for class_data -------')
random_forest(x_train_lancaster_class_bio, x_test_lancaster_class_bio, y_train_lancaster_class_bio, y_test_lancaster_class_bio)

Random Forest evaluation scores for c_data -------
Accuracy: 38.46%
F1: 15.71%
precision: 15.65%
Recall: 20.00%

Random Forest evaluation scores for cdc_data -------
Accuracy: 15.45%
F1: 4.16%
precision: 6.18%
Recall: 5.44%

Random Forest evaluation scores for class_data -------
Accuracy: 13.74%
F1: 3.54%
precision: 5.11%
Recall: 5.18%


## Postag Random Forest

In [75]:
# Classified OH data
print('Random Forest evaluation scores for c_data -------')
random_forest(x_train_postag_c_bio, x_test_postag_c_bio, y_train_postag_c_bio, y_test_postag_c_bio)
print()

# CDC data
print('Random Forest evaluation scores for cdc_data -------')
random_forest(x_train_postag_cdc_bio, x_test_postag_cdc_bio, y_train_postag_cdc_bio, y_test_postag_cdc_bio)
print()

# Classified OH data
print('Random Forest evaluation scores for class_data -------')
random_forest(x_train_postag_class_bio, x_test_postag_class_bio, y_train_postag_class_bio, y_test_postag_class_bio)

Random Forest evaluation scores for c_data -------
Accuracy: 61.54%
F1: 35.24%
precision: 32.86%
Recall: 40.00%

Random Forest evaluation scores for cdc_data -------
Accuracy: 12.12%
F1: 2.26%
precision: 2.59%
Recall: 3.25%

Random Forest evaluation scores for class_data -------
Accuracy: 9.94%
F1: 2.06%
precision: 3.99%
Recall: 3.22%


# **SVM**

In [76]:
from sklearn import svm

In [77]:
def svm_classifier(x_train, x_test, y_train, y_test):
    # Create support vector machine object and fit classifer
    svm_clf = svm.SVC(kernel='linear', random_state=42).fit(x_train, y_train)

    # Make predictions
    y_pred = svm_clf.predict(x_test)

    # Evaluate model
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, np.round(y_pred), average='macro')
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')

    print(f'Accuracy: {accuracy * 100.0:.2f}%')
    print(f'F1: {f1 * 100.0:.2f}%')
    print(f'precision: {precision * 100.0:.2f}%')
    print(f'Recall: {recall * 100.0:.2f}%')

## Tokens SVM

In [78]:
# Classified OH data
print('SVM evaluation scores for c_data -------')
svm_classifier(x_train_tokens_c_bio, x_test_tokens_c_bio, y_train_tokens_c_bio, y_test_tokens_c_bio)
print()

# CDC data
print('SVM evaluation scores for cdc_data -------')
svm_classifier(x_train_tokens_cdc_bio, x_test_tokens_cdc_bio, y_train_tokens_cdc_bio, y_test_tokens_cdc_bio)
print()

# Classified OH data
print('SVM evaluation scores for class_data -------')
svm_classifier(x_train_tokens_class_bio, x_test_tokens_class_bio, y_train_tokens_class_bio, y_test_tokens_class_bio)

SVM evaluation scores for c_data -------
Accuracy: 53.85%
F1: 30.21%
precision: 31.25%
Recall: 32.50%

SVM evaluation scores for cdc_data -------
Accuracy: 62.42%
F1: 40.41%
precision: 41.74%
Recall: 42.79%

SVM evaluation scores for class_data -------
Accuracy: 64.33%
F1: 41.63%
precision: 45.58%
Recall: 42.88%


## Lemma SVM

In [79]:
# Classified OH data
print('SVM evaluation scores for c_data -------')
svm_classifier(x_train_lemma_c_bio, x_test_lemma_c_bio, y_train_lemma_c_bio, y_test_lemma_c_bio)
print()

# CDC data
print('SVM evaluation scores for cdc_data -------')
svm_classifier(x_train_lemma_cdc_bio, x_test_lemma_cdc_bio, y_train_lemma_cdc_bio, y_test_lemma_cdc_bio)
print()

# Classified OH data
print('SVM evaluation scores for class_data -------')
svm_classifier(x_train_lemma_class_bio, x_test_lemma_class_bio, y_train_lemma_class_bio, y_test_lemma_class_bio)

SVM evaluation scores for c_data -------
Accuracy: 53.85%
F1: 29.37%
precision: 30.36%
Recall: 32.50%

SVM evaluation scores for cdc_data -------
Accuracy: 64.85%
F1: 42.73%
precision: 44.11%
Recall: 44.06%

SVM evaluation scores for class_data -------
Accuracy: 64.33%
F1: 43.17%
precision: 48.41%
Recall: 43.37%


## Porter SVM

In [80]:
# Classified OH data
print('SVM evaluation scores for c_data -------')
svm_classifier(x_train_porter_c_bio, x_test_porter_c_bio, y_train_porter_c_bio, y_test_porter_c_bio)
print()

# Classified OH data
print('SVM evaluation scores for c_data -------')
svm_classifier(x_train_porter_c_bio, x_test_porter_c_bio, y_train_porter_c_bio, y_test_porter_c_bio)
print()

# CDC data
print('SVM evaluation scores for cdc_data -------')
svm_classifier(x_train_porter_cdc_bio, x_test_porter_cdc_bio, y_train_porter_cdc_bio, y_test_porter_cdc_bio)

SVM evaluation scores for c_data -------
Accuracy: 53.85%
F1: 26.07%
precision: 27.50%
Recall: 26.00%

SVM evaluation scores for c_data -------
Accuracy: 53.85%
F1: 26.07%
precision: 27.50%
Recall: 26.00%

SVM evaluation scores for cdc_data -------
Accuracy: 63.03%
F1: 39.65%
precision: 41.75%
Recall: 41.69%


## Lancaster SVM

In [81]:
# Classified OH data
print('SVM evaluation scores for c_data -------')
svm_classifier(x_train_lancaster_c_bio, x_test_lancaster_c_bio, y_train_lancaster_c_bio, y_test_lancaster_c_bio)
print()

# CDC data
print('SVM evaluation scores for cdc_data -------')
svm_classifier(x_train_lancaster_cdc_bio, x_test_lancaster_cdc_bio, y_train_lancaster_cdc_bio, y_test_lancaster_cdc_bio)
print()

# Classified OH data
print('SVM evaluation scores for class_data -------')
svm_classifier(x_train_lancaster_class_bio, x_test_lancaster_class_bio, y_train_lancaster_class_bio, y_test_lancaster_class_bio)

SVM evaluation scores for c_data -------
Accuracy: 46.15%
F1: 25.79%
precision: 28.89%
Recall: 26.67%

SVM evaluation scores for cdc_data -------
Accuracy: 59.39%
F1: 38.56%
precision: 40.63%
Recall: 41.06%

SVM evaluation scores for class_data -------
Accuracy: 65.20%
F1: 47.00%
precision: 49.45%
Recall: 48.82%


## Postag SVM

In [82]:
# Classified OH data
print('SVM evaluation scores for c_data -------')
svm_classifier(x_train_postag_c_bio, x_test_postag_c_bio, y_train_postag_c_bio, y_test_postag_c_bio)
print()

# CDC data
print('SVM evaluation scores for cdc_data -------')
svm_classifier(x_train_postag_cdc_bio, x_test_postag_cdc_bio, y_train_postag_cdc_bio, y_test_postag_cdc_bio)
print()

# Classified OH data
print('SVM evaluation scores for class_data -------')
svm_classifier(x_train_postag_class_bio, x_test_postag_class_bio, y_train_postag_class_bio, y_test_postag_class_bio)

SVM evaluation scores for c_data -------
Accuracy: 46.15%
F1: 24.07%
precision: 24.07%
Recall: 26.67%

SVM evaluation scores for cdc_data -------
Accuracy: 55.76%
F1: 34.89%
precision: 36.90%
Recall: 36.63%

SVM evaluation scores for class_data -------
Accuracy: 55.56%
F1: 36.55%
precision: 39.36%
Recall: 37.73%
