In [6]:
import pandas as pd

import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Import and Preprocess/Format Data

In [2]:
# Function which does everything below at once
def open_format_data(filename):
  # Save data from txt into list
  f = open(filename, "r")

  list_data = []
  for x in f:
      list_data.append(f.readline().rstrip().split("\t"))

  # Load list into dataframe
  column_names = ['label', "sentence"]
  df = pd.DataFrame(list_data, columns= column_names)
  df.head(10)

  # Drop any null values
  df = df.dropna()

  # Drop duplicates
  df = df.drop_duplicates()

  # Append number representing label as code
  df.label = pd.Categorical(df.label)
  df['code'] = df.label.cat.codes
  return df

In [3]:
df = open_format_data("train.txt")

In [None]:
# Save data from txt into list
f = open("/content/drive/My Drive/Griffith/Data Mining/train.txt", "r")

list_data = []
for x in f:
    list_data.append(f.readline().rstrip().split("\t"))

print(list_data[:5])

In [None]:
# Load list into dataframe
column_names = ['label', "sentence"]
df = pd.DataFrame(list_data, columns= column_names)
df.head(10)

In [None]:
# Count all null values
df.isna().sum()

In [None]:
# Drop any null values
df = df.dropna()
df.isna().sum()

In [None]:
# Detect duplicates
df.duplicated().sum()

In [None]:
# Drop duplicates
df = df.drop_duplicates()
df.duplicated().sum()

In [None]:
# Append number representing label as code
df.label = pd.Categorical(df.label)
df['code'] = df.label.cat.codes
df.head()

# Data Exploration

In [None]:
df.count()

In [None]:
df.dtypes

In [None]:
df.groupby("label").count()

In [None]:
df.groupby("label")['sentence'].count().plot.bar()
plt.xlabel("Class")
plt.ylabel("Frequency")
plt.show()

# Tensorflow Data Formatting and Encoding

In [None]:
# Load into tensorflow dataset
train_dataset = tf.data.Dataset.from_tensor_slices((df.sentence, df.code))
for b in train_dataset.take(5):
  print(b)

In [None]:
# Generate vocab
tokenizer = tfds.features.text.Tokenizer()

vocabulary_set = set()
for text_tensor, _ in train_dataset:
  some_tokens = tokenizer.tokenize(text_tensor.numpy())
  vocabulary_set.update(some_tokens)

vocab_size = len(vocabulary_set)
# Increase vocab size for padding value (0)
vocab_size += 1
vocab_size

In [None]:
# Setup encoder
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

def encode(text_tensor, label):
  encoded_text = encoder.encode(text_tensor.numpy())
  return encoded_text, label

def encode_map_fn(text, label):
  # py_func doesn't set the shape of the returned tensors.
  encoded_text, label = tf.py_function(encode, 
                                       inp=[text, label], 
                                       Tout=(tf.int64, tf.int8))

  # `tf.data.Datasets` work best if all components have a shape set
  #  so set the shapes manually: 
  encoded_text.set_shape([None])
  label.set_shape([])

  return encoded_text, label

In [None]:
example_text = next(iter(train_dataset))[0].numpy()
example_text = next(iter(train_dataset))[0].numpy()
print(example_text)
encoded_example = encoder.encode(example_text)
print(encoded_example)

In [None]:
# Encode sentences
all_encoded_data = train_dataset.map(encode_map_fn)
for ex in all_encoded_data.take(5):
  print(ex)

In [None]:
# Shuffle into train/test data and add 
BUFFER_SIZE = 50000
BATCH_SIZE = 64
TAKE_SIZE = 2000

train_data = all_encoded_data.skip(TAKE_SIZE).shuffle(BUFFER_SIZE)
train_data = train_data.padded_batch(BATCH_SIZE)

test_data = all_encoded_data.take(TAKE_SIZE)
test_data = test_data.padded_batch(BATCH_SIZE)

In [None]:
sample_text, sample_labels = next(iter(train_data))

sample_text[0], encoder.decode(sample_text[0].numpy()), sample_labels[0]

# Tensorflow Basic Model

In [None]:
# Build model
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 64))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

# One or more dense layers.
# Edit the list in the `for` line to experiment with layer sizes.
for units in [64, 64]:
  model.add(tf.keras.layers.Dense(units, activation='relu'))

# Output layer. The first argument is the number of labels.
model.add(tf.keras.layers.Dense(5))

model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
model.fit(train_data, epochs=3, validation_data=test_data)

In [None]:
eval_loss, eval_acc = model.evaluate(test_data)

print('\nEval loss: {:.3f}, Eval accuracy: {:.3f}'.format(eval_loss, eval_acc))

# Tensorflow Word Embedding

In [None]:
embedding_layer = layers.Embedding(1000, 5)
result = embedding_layer(tf.constant([1,2,3]))
result.numpy()

In [None]:
embedding_dim=16

model = keras.Sequential([
  layers.Embedding(vocab_size, embedding_dim),
  layers.GlobalAveragePooling1D(),
  # layers.Dense(16, activation='relu'),
  layers.Dense(5)
])

model.summary()

In [None]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

history = model.fit(
    train_data,
    epochs=3,
    validation_data=test_data, validation_steps=20)

In [None]:
# Retrive the learned embeddings
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

# Save to disk
import io

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

for num, word in enumerate(encoder.tokens):
  vec = weights[num+1] # skip 0, it's padding.
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
out_v.close()
out_m.close()

# Download to PC
try:
  from google.colab import files
except ImportError:
   pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')

# SciBERT Model For Sentence Embeddings


In [2]:
from transformers import BertModel, BertTokenizer
import torch
import numpy as np

In [5]:
scibert_model = BertModel.from_pretrained("allenai/scibert_scivocab_uncased",
                                  output_hidden_states=True)
scibert_tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

print('scibert_tokenizer is type:', type(scibert_tokenizer))
print('    scibert_model is type:', type(scibert_model))

scibert_tokenizer is type: <class 'transformers.tokenization_bert.BertTokenizer'>
    scibert_model is type: <class 'transformers.modeling_bert.BertModel'>


In [6]:
def get_word_indeces(tokenizer, text, word):
    '''
    Determines the index or indeces of the tokens corresponding to `word`
    within `text`. `word` can consist of multiple words, e.g., "cell biology".
    
    Determining the indeces is tricky because words can be broken into multiple
    tokens. I've solved this with a rather roundabout approach--I replace `word`
    with the correct number of `[MASK]` tokens, and then find these in the 
    tokenized result. 
    '''
    # Tokenize the 'word'--it may be broken into multiple tokens or subwords.
    word_tokens = tokenizer.tokenize(word)

    # Create a sequence of `[MASK]` tokens to put in place of `word`.
    masks_str = ' '.join(['[MASK]']*len(word_tokens))

    # Replace the word with mask tokens.
    text_masked = text.replace(word, masks_str)

    # `encode` performs multiple functions:
    #   1. Tokenizes the text
    #   2. Maps the tokens to their IDs
    #   3. Adds the special [CLS] and [SEP] tokens.
    input_ids = tokenizer.encode(text_masked)

    # Use numpy's `where` function to find all indeces of the [MASK] token.
    mask_token_indeces = np.where(np.array(input_ids) == tokenizer.mask_token_id)[0]

    return mask_token_indeces

In [7]:
def get_embedding(b_model, b_tokenizer, text):
    '''
    Uses the provided model and tokenizer to produce an embedding for the
    provided `text`
    '''

    # Encode the text, adding the (required!) special tokens, and converting to
    # PyTorch tensors.
    encoded_dict = b_tokenizer.encode_plus(
                        text,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        return_tensors = 'pt',     # Return pytorch tensors.
                )

    input_ids = encoded_dict['input_ids']
    
    b_model.eval()

    # Run the text through the model and get the hidden states.
    bert_outputs = b_model(input_ids)
    
    # Run the text through BERT, and collect all of the hidden states produced
    # from all 12 layers. 
    with torch.no_grad():

        outputs = b_model(input_ids)

        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        hidden_states = outputs[2]

    # `hidden_states` has shape [13 x 1 x <sentence length> x 768]

    # Select the embeddings from the second to last layer.
    # `token_vecs` is a tensor with shape [<sent length> x 768]
    token_vecs = hidden_states[-2][0]

    # Calculate the average of all token vectors.
    sentence_embedding = torch.mean(token_vecs, dim=0)

    # Convert to numpy array.
    sentence_embedding = sentence_embedding.detach().numpy()

    return sentence_embedding

In [8]:
text = "hydrogels are hydrophilic polymer networks which may absorb from 10–20% (an arbitrary lower limit) up to thousands of times their dry weight in water."

# Get the embedding for the sentence, as well as an embedding for 'hydrogels'.
sen_emb = get_embedding(scibert_model, scibert_tokenizer, text)

print('Embedding sizes:')
print(sen_emb.shape)

Embedding sizes:
(768,)


Look into using TF cause this is slow


In [10]:
from IPython.display import clear_output
import timeit

embeddings = []
length = len(df['sentence'].tolist())
index = 0

start = timeit.default_timer()
for sentence in df['sentence'].tolist():
    clear_output(wait=True)
    index += 1
    sen_emb = get_embedding(scibert_model, scibert_tokenizer, sentence)
    embeddings.append(sen_emb)

    stop = timeit.default_timer()

    if (index/length*100) < 1:
        expected_time = "Calculating..."

    else:
        time_perc = timeit.default_timer()
        expected_time = np.round( (time_perc-start) /(index/length) /60,2)

    print(index, length)
    print(expected_time)

print(len(embeddings))

88797 88797
139.64
88797


In [11]:
df.head()

Unnamed: 0,label,sentence,code
0,OBJECTIVE,To investigate the efficacy of @ weeks of dail...,3
1,METHODS,Outcome measures included pain reduction and i...,2
2,METHODS,Secondary outcome measures included the Wester...,2
3,RESULTS,There was a clinically relevant reduction in t...,4
4,RESULTS,"Further , there was a clinically relevant redu...",4


In [12]:
df['scibert'] = embeddings
df.head()

Unnamed: 0,label,sentence,code,scibert
0,OBJECTIVE,To investigate the efficacy of @ weeks of dail...,3,"[-0.44510403, -0.31423956, -0.45745727, 0.4200..."
1,METHODS,Outcome measures included pain reduction and i...,2,"[-0.4775299, -0.46893463, -0.22414015, 0.22520..."
2,METHODS,Secondary outcome measures included the Wester...,2,"[-0.61925375, -0.19193889, -0.38404435, 0.2082..."
3,RESULTS,There was a clinically relevant reduction in t...,4,"[-0.5922383, -0.41908544, -0.33248687, 0.55146..."
4,RESULTS,"Further , there was a clinically relevant redu...",4,"[-0.38201377, -0.53494513, 0.1772025, 0.381558..."


In [13]:
df.to_pickle("./df_embeddings.pkl")

# Investigating SciBert Embedded Models

In [12]:
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
df = pd.read_pickle("./df_embeddings.pkl")
df.head(20)

Unnamed: 0,label,sentence,code,scibert
0,OBJECTIVE,To investigate the efficacy of @ weeks of dail...,3,"[-0.44510403, -0.31423956, -0.45745727, 0.4200..."
1,METHODS,Outcome measures included pain reduction and i...,2,"[-0.4775299, -0.46893463, -0.22414015, 0.22520..."
2,METHODS,Secondary outcome measures included the Wester...,2,"[-0.61925375, -0.19193889, -0.38404435, 0.2082..."
3,RESULTS,There was a clinically relevant reduction in t...,4,"[-0.5922383, -0.41908544, -0.33248687, 0.55146..."
4,RESULTS,"Further , there was a clinically relevant redu...",4,"[-0.38201377, -0.53494513, 0.1772025, 0.381558..."
5,RESULTS,The Outcome Measures in Rheumatology Clinical ...,4,"[0.027787112, 0.23189485, -0.3841277, 0.625988..."
7,BACKGROUND,Emotional eating is associated with overeating...,0,"[-0.6055013, -0.14416914, -0.32580143, 0.12642..."
8,OBJECTIVE,The aim of this study was to test if attention...,3,"[-0.37356007, 0.10462127, -0.011478976, -0.169..."
9,METHODS,Participants ( N = @ ) were randomly assigned ...,2,"[0.23434164, 0.32432234, -0.29593608, -0.39225..."
10,METHODS,Self-reported emotional eating was assessed wi...,2,"[-0.65840876, 0.21802892, -0.24623804, -0.1824..."


In [10]:
# Export to weka
export_df = df.drop(['label', 'sentence'], axis=1)[:2500]

def f(x):
    return list(x.tolist())

pd.DataFrame(df['scibert'][:2500].tolist(), index= df.index[:2500])

export_df['scibert'] = pd.DataFrame(export_df['scibert'].tolist(), index= export_df.index)

print(export_df.values)

import arff
arff.dump('filename.arff'
      , export_df.values
      , relation='relation name'
      , names=['code', 'scibert'])

[[3
  list([-0.44510403275489807, -0.3142395615577698, -0.4574572741985321, 0.42003002762794495, 0.17308373749256134, -0.0658547654747963, 0.0610351599752903, 0.7119576334953308, 0.052800457924604416, -0.04166654124855995, 0.25368425250053406, 0.023948971182107925, -0.5013481974601746, 0.3400881886482239, 0.07037530839443207, 0.16391821205615997, 0.12615199387073517, -0.22290515899658203, 0.11534956842660904, 0.14633360505104065, -0.1456236094236374, 0.30225151777267456, -0.13457921147346497, -0.15234455466270447, 0.5951424837112427, -0.4410097301006317, 0.0134934913367033, -0.18890079855918884, 0.07016326487064362, -0.5255907773971558, 0.34734436869621277, -0.17779316008090973, -0.38158607482910156, -0.38548311591148376, -0.40820297598838806, 0.7804317474365234, 0.13236048817634583, -0.20911121368408203, -0.2571874260902405, -0.41668063402175903, -0.10891936719417572, 0.2499328851699829, -0.2740243375301361, 0.034503862261772156, 0.14481700956821442, -0.31543558835983276, 0.2689055800

ValueError: Unknown type: <class 'list'>

In [5]:
# Sample even number from each class
even_df = df.groupby('code').apply(lambda x: x.sample(n=500)).reset_index(drop = True)
even_df.groupby("label").count()

Unnamed: 0_level_0,sentence,code,scibert
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BACKGROUND,500,500,500
CONCLUSIONS,500,500,500
METHODS,500,500,500
OBJECTIVE,500,500,500
RESULTS,500,500,500


In [6]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model_even = KNeighborsClassifier()
# Fit models
model.fit(df['scibert'][:2500].tolist(), df['code'][:2500].tolist())
model_even.fit(even_df['scibert'].tolist(), even_df['code'].tolist())
# Predictions for each model
pred = model.predict(df['scibert'][2500:3500].tolist())
pred_even = model_even.predict(df['scibert'][2500:3500].tolist())
# Evaluate
print("F1 Score")
print(f1_score(df['code'][2500:3500].tolist(), pred, average="macro"))
print(f1_score(df['code'][2500:3500].tolist(), pred_even, average="macro"))
print("MCC")
print(matthews_corrcoef(df['code'][2500:3500].tolist(), pred))
print(matthews_corrcoef(df['code'][2500:3500].tolist(), pred_even))
print("Accuracy")
print(accuracy_score(df['code'][2500:3500].tolist(), pred))
print(accuracy_score(df['code'][2500:3500].tolist(), pred_even))

F1 Score
0.6819527403960083
0.654013853094385
MCC
0.700766156771334
0.652233775495782
Accuracy
0.778
0.733


In [7]:
from sklearn.naive_bayes import BernoulliNB
model = BernoulliNB()
model_even = BernoulliNB()
# Fit models
model.fit(df['scibert'][:2500].tolist(), df['code'][:2500].tolist())
model_even.fit(even_df['scibert'].tolist(), even_df['code'].tolist())
# Predictions for each model
pred = model.predict(df['scibert'][2500:3500].tolist())
pred_even = model_even.predict(df['scibert'][2500:3500].tolist())
# Evaluate
print("F1 Score")
print(f1_score(df['code'][2500:3500].tolist(), pred, average="macro"))
print(f1_score(df['code'][2500:3500].tolist(), pred_even, average="macro"))
print("MCC")
print(matthews_corrcoef(df['code'][2500:3500].tolist(), pred))
print(matthews_corrcoef(df['code'][2500:3500].tolist(), pred_even))
print("Accuracy")
print(accuracy_score(df['code'][2500:3500].tolist(), pred))
print(accuracy_score(df['code'][2500:3500].tolist(), pred_even))

F1 Score
0.6681035538397584
0.6709305896250097
MCC
0.6634952331940656
0.6558606143364742
Accuracy
0.747
0.741


In [8]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model_even = GaussianNB()
# Fit models
model.fit(df['scibert'][:2500].tolist(), df['code'][:2500].tolist())
model_even.fit(even_df['scibert'].tolist(), even_df['code'].tolist())
# Predictions for each model
pred = model.predict(df['scibert'][2500:3500].tolist())
pred_even = model_even.predict(df['scibert'][2500:3500].tolist())
# Evaluate
print("F1 Score")
print(f1_score(df['code'][2500:3500].tolist(), pred, average="macro"))
print(f1_score(df['code'][2500:3500].tolist(), pred_even, average="macro"))
print("MCC")
print(matthews_corrcoef(df['code'][2500:3500].tolist(), pred))
print(matthews_corrcoef(df['code'][2500:3500].tolist(), pred_even))
print("Accuracy")
print(accuracy_score(df['code'][2500:3500].tolist(), pred))
print(accuracy_score(df['code'][2500:3500].tolist(), pred_even))

F1 Score
0.6969765214942665
0.680103550567688
MCC
0.6918251118653097
0.6686883947774092
Accuracy
0.767
0.75


In [9]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model_even = DecisionTreeClassifier()
# Fit models
model.fit(df['scibert'][:2500].tolist(), df['code'][:2500].tolist())
model_even.fit(even_df['scibert'].tolist(), even_df['code'].tolist())
# Predictions for each model
pred = model.predict(df['scibert'][2500:3500].tolist())
pred_even = model_even.predict(df['scibert'][2500:3500].tolist())
# Evaluate
print("F1 Score")
print(f1_score(df['code'][2500:3500].tolist(), pred, average="macro"))
print(f1_score(df['code'][2500:3500].tolist(), pred_even, average="macro"))
print("MCC")
print(matthews_corrcoef(df['code'][2500:3500].tolist(), pred))
print(matthews_corrcoef(df['code'][2500:3500].tolist(), pred_even))
print("Accuracy")
print(accuracy_score(df['code'][2500:3500].tolist(), pred))
print(accuracy_score(df['code'][2500:3500].tolist(), pred_even))

F1 Score
0.46094697692739006
0.4676075892099999
MCC
0.4103338396416756
0.38655254976388786
Accuracy
0.559
0.525


In [10]:
from sklearn.svm import LinearSVC
model = LinearSVC(multi_class="ovr", max_iter=1000)
model_even = LinearSVC(multi_class="ovr", max_iter=1000)
# Fit models
model.fit(df['scibert'][:2500].tolist(), df['code'][:2500].tolist())
model_even.fit(even_df['scibert'].tolist(), even_df['code'].tolist())
# Predictions for each model
pred = model.predict(df['scibert'][2500:3500].tolist())
pred_even = model_even.predict(df['scibert'][2500:3500].tolist())
# Evaluate
print("F1 Score")
print(f1_score(df['code'][2500:3500].tolist(), pred, average="macro"))
print(f1_score(df['code'][2500:3500].tolist(), pred_even, average="macro"))
print("MCC")
print(matthews_corrcoef(df['code'][2500:3500].tolist(), pred))
print(matthews_corrcoef(df['code'][2500:3500].tolist(), pred_even))
print("Accuracy")
print(accuracy_score(df['code'][2500:3500].tolist(), pred))
print(accuracy_score(df['code'][2500:3500].tolist(), pred_even))



F1 Score
0.6931498071426793
0.6671190236890402
MCC
0.7125171448205804
0.6789894636994003
Accuracy
0.787
0.758




In [64]:
from sklearn.svm import SVC
model = SVC(kernel='rbf', gamma=0.02, C=2)
# model_even = SVC()
# Fit models
model.fit(df['scibert'][:2500].tolist(), df['code'][:2500].tolist())
# model_even.fit(even_df['scibert'].tolist(), even_df['code'].tolist())
# Predictions for each model
pred = model.predict(df['scibert'][2500:3500].tolist())
# pred_even = model_even.predict(df['scibert'][2500:3500].tolist())
# Evaluate
print("F1 Score")
print(f1_score(df['code'][2500:3500].tolist(), pred, average="macro"))
print(f1_score(df['code'][2500:3500].tolist(), pred_even, average="macro"))
print("MCC")
print(matthews_corrcoef(df['code'][2500:3500].tolist(), pred))
print(confusion_matrix(df['code'][2500:3500].tolist(), pred))
print(matthews_corrcoef(df['code'][2500:3500].tolist(), pred_even))
print("Accuracy")
print(accuracy_score(df['code'][2500:3500].tolist(), pred))
print(accuracy_score(df['code'][2500:3500].tolist(), pred_even))

F1 Score
0.742674548713269
0.7522561109618942
MCC
0.7866132899786072
[[ 89   8  12  16   2]
 [ 20 117   5   0  17]
 [  4   2 324   2   8]
 [ 31   5   7  22   1]
 [  0   5  13   0 290]]
0.757602626006683
Accuracy
0.842
0.819


In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.utils import shuffle


# >> FEATURE SELECTION << #
def remove_correlated_features(X):
    corr_threshold = 0.9
    corr = X.corr()
    drop_columns = np.full(corr.shape[0], False, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i + 1, corr.shape[0]):
            if corr.iloc[i, j] >= corr_threshold:
                drop_columns[j] = True
    columns_dropped = X.columns[drop_columns]
    X.drop(columns_dropped, axis=1, inplace=True)
    return columns_dropped


def remove_less_significant_features(X, Y):
    sl = 0.05
    regression_ols = None
    columns_dropped = np.array([])
    for itr in range(0, len(X.columns)):
        regression_ols = sm.OLS(Y, X).fit()
        max_col = regression_ols.pvalues.idxmax()
        max_val = regression_ols.pvalues.max()
        if max_val > sl:
            X.drop(max_col, axis='columns', inplace=True)
            columns_dropped = np.append(columns_dropped, [max_col])
        else:
            break
    regression_ols.summary()
    return columns_dropped


##############################


# >> MODEL TRAINING << #
def compute_cost(W, X, Y):
    # calculate hinge loss
    N = X.shape[0]
    distances = 1 - Y * (np.dot(X, W))
    distances[distances < 0] = 0  # equivalent to max(0, distance)
    hinge_loss = regularization_strength * (np.sum(distances) / N)

    # calculate cost
    cost = 1 / 2 * np.dot(W, W) + hinge_loss
    return cost


# I haven't tested it but this same function should work for
# vanilla and mini-batch gradient descent as well
def calculate_cost_gradient(W, X_batch, Y_batch):
    # if only one example is passed (eg. in case of SGD)
    if type(Y_batch) == np.float64:
        Y_batch = np.array([Y_batch])
        X_batch = np.array([X_batch])  # gives multidimensional array

    distance = 1 - (Y_batch * np.dot(X_batch, W))
    dw = np.zeros(len(W))

    for ind, d in enumerate(distance):
        if max(0, d) == 0:
            di = W
        else:
            di = W - (regularization_strength * Y_batch[ind] * X_batch[ind])
        dw += di

    dw = dw/len(Y_batch)  # average
    return dw


def sgd(features, outputs):
    max_epochs = 5000
    weights = np.zeros(features.shape[1])
    nth = 0
    prev_cost = float("inf")
    cost_threshold = 0.01  # in percent
    # stochastic gradient descent
    for epoch in range(1, max_epochs):
        # shuffle to prevent repeating update cycles
        X, Y = shuffle(features, outputs)
        for ind, x in enumerate(X):
            ascent = calculate_cost_gradient(weights, x, Y[ind])
            weights = weights - (learning_rate * ascent)

        # convergence check on 2^nth epoch
        if epoch == 2 ** nth or epoch == max_epochs - 1:
            cost = compute_cost(weights, features, outputs)
            print("Epoch is: {} and Cost is: {}".format(epoch, cost))
            # stoppage criterion
            if abs(prev_cost - cost) < cost_threshold * prev_cost:
                return weights
            prev_cost = cost
            nth += 1
    return weights


########################


def init():
    diag_map = {0: 1.0, 1: -1.0, 2: -1.0, 3: -1.0, 4: -1.0}

    # put features & outputs in different data frames
    Y = pd.DataFrame(df['code'][:2500].map(diag_map))
    X = pd.DataFrame(df['scibert'][:2500].tolist(), index= df.index[:2500])

    # filter features
    remove_correlated_features(X)
    remove_less_significant_features(X, Y)

    # normalize data for better convergence and to prevent overflow
    X_normalized = MinMaxScaler().fit_transform(X.values)
    X = pd.DataFrame(X_normalized)

    # insert 1 in every row for intercept b
    X.insert(loc=len(X.columns), column='intercept', value=1)

    # split data into train and test set
    print("splitting dataset into train and test sets...")
    X_train, X_test, y_train, y_test = tts(X, Y, test_size=0.2, random_state=42)

    # train the model
    print("training started...")
    W = sgd(X_train.to_numpy(), y_train.to_numpy())
    print("training finished.")
    print("weights are: {}".format(W))

    # testing the model
    print("testing the model...")
    y_train_predicted = np.array([])
    for i in range(X_train.shape[0]):
        yp = np.sign(np.dot(X_train.to_numpy()[i], W))
        y_train_predicted = np.append(y_train_predicted, yp)

    y_test_predicted = np.array([])
    for i in range(X_test.shape[0]):
        yp = np.sign(np.dot(X_test.to_numpy()[i], W))
        y_test_predicted = np.append(y_test_predicted, yp)

    print("accuracy on test dataset: {}".format(accuracy_score(y_test, y_test_predicted)))
    print("recall on test dataset: {}".format(recall_score(y_test, y_test_predicted)))
    print("precision on test dataset: {}".format(recall_score(y_test, y_test_predicted)))


# set hyper-parameters and call init
regularization_strength = 10000
learning_rate = 0.000001
init()

In [22]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(multi_class="ovr", max_iter=1000)
model_even = LogisticRegression(multi_class="ovr", max_iter=1000)
# Fit models
model.fit(df['scibert'][:2500].tolist(), df['code'][:2500].tolist())
model_even.fit(even_df['scibert'].tolist(), even_df['code'].tolist())
# Predictions for each model
pred = model.predict(df['scibert'][2500:3500].tolist())
pred_even = model_even.predict(df['scibert'][2500:3500].tolist())
# Evaluate
print("F1 Score")
print(f1_score(df['code'][2500:3500].tolist(), pred, average="macro"))
print(f1_score(df['code'][2500:3500].tolist(), pred_even, average="macro"))
print("MCC")
print(matthews_corrcoef(df['code'][2500:3500].tolist(), pred))
print(matthews_corrcoef(df['code'][2500:3500].tolist(), pred_even))
print("Accuracy")
print(accuracy_score(df['code'][2500:3500].tolist(), pred))
print(accuracy_score(df['code'][2500:3500].tolist(), pred_even))
print(confusion_matrix(df['code'][2500:3500].tolist(), pred_even))

F1 Score
0.7231861816607512
0.7153398986547954
MCC
0.7527761095624504
0.7345170631965806
Accuracy
0.817
0.802
[[ 71  16   3  32   5]
 [ 16 113   3  10  17]
 [ 13   0 309   5  13]
 [ 19   6   5  35   1]
 [  1  14  18   1 274]]


In [16]:
from sklearn.neural_network import MLPClassifier
model = MLPClassifier()
model_even = MLPClassifier()
# Fit models
model.fit(df['scibert'][:2500].tolist(), df['code'][:2500].tolist())
model_even.fit(even_df['scibert'].tolist(), even_df['code'].tolist())
# Predictions for each model
pred = model.predict(df['scibert'][2500:3500].tolist())
pred_even = model_even.predict(df['scibert'][2500:3500].tolist())
# Evaluate
print("F1 Score")
print(f1_score(df['code'][2500:3500].tolist(), pred, average="macro"))
print(f1_score(df['code'][2500:3500].tolist(), pred_even, average="macro"))
print("MCC")
print(matthews_corrcoef(df['code'][2500:3500].tolist(), pred))
print(matthews_corrcoef(df['code'][2500:3500].tolist(), pred_even))
print("Accuracy")
print(accuracy_score(df['code'][2500:3500].tolist(), pred))
print(accuracy_score(df['code'][2500:3500].tolist(), pred_even))

F1 Score
0.7058321377086436
0.7256700391541749
MCC
0.7370274149726393
0.7448034226562301
Accuracy
0.805
0.809


In [17]:
from sklearn.linear_model import Perceptron
model = Perceptron()
model_even = Perceptron()
# Fit models
model.fit(df['scibert'][:2500].tolist(), df['code'][:2500].tolist())
model_even.fit(even_df['scibert'].tolist(), even_df['code'].tolist())
# Predictions for each model
pred = model.predict(df['scibert'][2500:3500].tolist())
pred_even = model_even.predict(df['scibert'][2500:3500].tolist())
# Evaluate
print("F1 Score")
print(f1_score(df['code'][2500:3500].tolist(), pred, average="macro"))
print(f1_score(df['code'][2500:3500].tolist(), pred_even, average="macro"))
print("MCC")
print(matthews_corrcoef(df['code'][2500:3500].tolist(), pred))
print(matthews_corrcoef(df['code'][2500:3500].tolist(), pred_even))
print("Accuracy")
print(accuracy_score(df['code'][2500:3500].tolist(), pred))
print(accuracy_score(df['code'][2500:3500].tolist(), pred_even))

F1 Score
0.6976264130136529
0.7122919005635008
MCC
0.7286235564447606
0.7298053612649003
Accuracy
0.798
0.799


In [140]:
# Tensorflow NN
import numpy as np

# Build model
model = tf.keras.Sequential()

model.add(tf.keras.layers.Dense(768, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(64, activation='relu'))

# Output layer. The first argument is the number of labels.
model.add(tf.keras.layers.Dense(5))

model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

train_em = df['scibert'][:2500]
train_em = np.array(train_em.tolist())
train_label = df['code'][:2500]
train_label = np.array(train_label.tolist())

test_em = df['scibert'][2500:3500]
test_em = np.array(test_em.tolist())
test_label = df['code'][2500:3500]
test_label = np.array(test_label.tolist())

# Fit Model
model.fit(train_em, train_label, epochs=4, validation_data=(test_em, test_label))

eval_loss, eval_acc = model.evaluate(test_em, test_label)
print('\nEval loss: {:.3f}, Eval accuracy: {:.3f}'.format(eval_loss, eval_acc))

pred = model.predict(test_em).argmax(axis=-1)

# Evaluate
print("F1 Score")
print(f1_score(df['code'][2500:3500].tolist(), pred, average="macro"))
print("MCC")
print(matthews_corrcoef(df['code'][2500:3500].tolist(), pred))
print("Accuracy")
print(accuracy_score(df['code'][2500:3500].tolist(), pred))
print("Confusion Matrix")
print(confusion_matrix(df['code'][2500:3500].tolist(), pred))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

Eval loss: 0.517, Eval accuracy: 0.822
F1 Score
0.7312273609053561
MCC
0.7599122337675305
Accuracy
0.822
Confusion Matrix
[[ 86  12   9  16   4]
 [ 21 111   5   3  19]
 [ 10   1 319   3   7]
 [ 29   5   5  26   1]
 [  0   7  21   0 280]]


# Distillibert


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [None]:
batch_1 = df[:256]

In [None]:
batch_1.head()

In [None]:
batch_1['code'].value_counts()

In [None]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [None]:
# For SciBert
scibert_model = BertModel.from_pretrained("allenai/scibert_scivocab_uncased",
                                  output_hidden_states=True)
scibert_tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

print('scibert_tokenizer is type:', type(scibert_tokenizer))
print('    scibert_model is type:', type(scibert_model))

In [None]:
tokenized = batch_1['sentence'].apply((lambda x: scibert_tokenizer.encode(x, add_special_tokens=True)))

In [None]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [None]:
np.array(padded).shape

In [None]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

In [None]:
cuda = torch.device('cuda')     # Default CUDA devicecuda0
cuda0 = torch.device('cuda:0')

In [None]:
input_ids = torch.tensor(padded)
# input_ids = input_ids.to(cuda0)
attention_mask = torch.tensor(attention_mask)
# attention_mask = attention_mask.to(cuda0)

In [None]:
with torch.no_grad():
    features = scibert_model(input_ids, attention_mask=attention_mask)[0][:,0,:].numpy()

In [None]:
features = last_hidden_states

In [None]:
labels = batch_1['code']

In [None]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [None]:
train_features.shape

In [18]:
from sklearn.neighbors import KNeighborsClassifier
lr_clf = KNeighborsClassifier()
lr_clf.fit(df['scibert'][:1000].tolist(), df['code'][:1000].tolist())

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [20]:
lr_clf.score(df['scibert'][1000:2000].tolist(), df['code'][1000:2000].tolist())

0.738