In [1]:
# import relevant packages
import numpy as np
from collections import defaultdict
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine
from datasets import load_dataset
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC
import numpy as np
from tensorflow.keras.layers import Dense
import tensorflow as tf
from tensorflow.keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow_addons as tfa

  from .autonotebook import tqdm as notebook_tqdm

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



##### Phrase similarity with static embeddings

In [2]:
dataset = load_dataset("PiC/phrase_similarity")

In [3]:
train_data = dataset['train']
valid_data = dataset['validation']
test_data = dataset['test']

In [4]:
train_data[:5]

{'phrase1': ['newly formed camp',
  'one data',
  'particular structure',
  'involved people',
  'different cross'],
 'phrase2': ['recently made encampment',
  'a particular statistic',
  'specific edifice',
  'participating individuals',
  'opposing inquiries'],
 'sentence1': ['newly formed camp is released from the membrane and diffuses across the intracellular space where it serves to activate pka.',
  "According to one data, in 1910, on others – in 1915, the mansion became Natalya Dmitriyevna Shchuchkina's property.",
  'Note that Fact 1 does not assume any particular structure on the set formula_65.',
  'Assessment-Center are usually group-processes with high validity and acceptance of the involved people.',
  'At the end of the 1980s, a different cross had been placed on the roof.'],
 'sentence2': ['recently made encampment is released from the membrane and diffuses across the intracellular space where it serves to activate pka.',
  "According to a particular statistic, in 1910, 

In [5]:
len(train_data)

7004

In [6]:
def load_glove_embeddings(path):
    with open(path, 'r', encoding='utf-8') as f:
        embeddings = {}
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

In [7]:
path = 'glove.6B/glove.6B.300d.txt'
glove_embeddings = load_glove_embeddings(path=path)

In [8]:
def get_cosine_similarity(vec1, vec2):
    return 1 - cosine(vec1, vec2)

We are computing the phrase embeddings by calculating the mean of the embeddings of each word to start of with

In [9]:
def get_phrase_embedding(phrase, embeddings_dict):
    embeddings = [embeddings_dict[word.lower()] for word in phrase.split() if word.lower() in embeddings_dict]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(next(iter(embeddings_dict.values())).shape)

In [10]:
def get_embeddings_and_labels(data, glove_embeddings):
    embeddings = []
    labels = []
    for example in data:
        phrase1_embedding = get_phrase_embedding(example['phrase1'], glove_embeddings)
        phrase2_embedding = get_phrase_embedding(example['phrase2'], glove_embeddings)
        embedding = np.concatenate((phrase1_embedding, phrase2_embedding))
        embeddings.append(embedding)
        labels.append(example['label'])
    return np.array(embeddings), np.array(labels)

In [11]:
word_freq = defaultdict(int)
for sentences in (train_data['phrase1'] + train_data['phrase2'] + \
                  test_data['phrase1'] + test_data['phrase2'] + \
                    valid_data['phrase1'] + valid_data['phrase2']):
    for word in sentences.split():
        word_freq[word.lower()] += 1
total_words = sum(word_freq.values())
word_prob = {word: freq / total_words for word, freq in word_freq.items()}

In [12]:
def get_phrase_embedding_sif(phrase, embeddings_dict, word_prob, a=1e-3):
    embeddings = []
    weights = []
    for word in phrase.split():
        if word.lower() in embeddings_dict and word.lower() in word_prob:
            embeddings.append(embeddings_dict[word.lower()])
            weight = a / (a + word_prob[word.lower()])
            weights.append(weight)
    if embeddings:
        weighted_embeddings = np.average(embeddings, axis=0, weights=weights)
        return weighted_embeddings
    else:
        return np.zeros(next(iter(embeddings_dict.values())).shape)

In [13]:
def get_embeddings_and_labels_sif(data, glove_embeddings):
    embeddings = []
    labels = []
    for example in data:
        phrase1_embedding = get_phrase_embedding_sif(example['phrase1'], glove_embeddings, word_prob=word_prob)
        phrase2_embedding = get_phrase_embedding_sif(example['phrase2'], glove_embeddings, word_prob=word_prob)
        embedding = np.concatenate((phrase1_embedding, phrase2_embedding))
        embeddings.append(embedding)
        labels.append(example['label'])
    return np.array(embeddings), np.array(labels)

In [14]:
all_phrases = train_data['phrase1'] + train_data['phrase2'] + \
                valid_data['phrase1'] + valid_data['phrase2'] + \
                test_data['phrase1'] + test_data['phrase2']
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_phrases)
feature_names = vectorizer.get_feature_names_out()
tfidf_scores = dict(zip(feature_names, vectorizer.idf_))

In [15]:
def get_phrase_embedding_tfidf(phrase, embeddings_dict, tfidf_scores):
    embeddings = []
    weights = []
    for word in phrase.split():
        if word.lower() in embeddings_dict and word.lower() in tfidf_scores:
            embeddings.append(embeddings_dict[word.lower()])
            weight = tfidf_scores[word.lower()]
            weights.append(weight)
    if embeddings:
        if not weights:
            weights = [1] * len(embeddings)
        weighted_embeddings = np.average(embeddings, axis=0, weights=weights)
        return weighted_embeddings
    else:
        return np.zeros(next(iter(embeddings_dict.values())).shape)

In [16]:
def get_embeddings_and_labels_tfidf(data, embeddings_dict, a=1e-3):
    embeddings = []
    labels = []

    for phrase1, phrase2, label in zip(data['phrase1'], data['phrase2'], data['label']):
        phrase1_embedding = get_phrase_embedding_tfidf(phrase1, embeddings_dict, tfidf_scores)
        phrase2_embedding = get_phrase_embedding_tfidf(phrase2, embeddings_dict, tfidf_scores)

        embedding = np.concatenate((phrase1_embedding, phrase2_embedding))
        embeddings.append(embedding)
        labels.append(label)

    return np.array(embeddings), np.array(labels)

In [17]:
def get_eval_metrics(actual, predictions):
    accuracy = accuracy_score(actual, predictions)
    precision = precision_score(actual, predictions)
    recall = recall_score(actual, predictions)
    f1 = f1_score(actual, predictions)

    return accuracy, precision, recall, f1

Using a simple cosine similarity function and figuring out the threshold by optimising for the best threshold

In [18]:
best_threshold = None
best_metric_f1 = -1
best_metric_accuracy = -1
best_metric_recall = -1
best_metric_precision = -1

# Create thresholds from 0 to 2, inclusive, in increments of 0.05
thresholds = [i * 0.05 for i in range(1, 41)]
for threshold in thresholds:
    predictions = []
    for p1, p2 in zip(train_data['phrase1'], train_data['phrase2']):
        emb1 = get_phrase_embedding_tfidf(p1, glove_embeddings, word_prob)
        emb2 = get_phrase_embedding_tfidf(p2, glove_embeddings, word_prob)
        similarity = get_cosine_similarity(emb1, emb2)
        pred = 1 if similarity > threshold else 0
        predictions.append(pred)
    accuracy, precision, recall, f1 = get_eval_metrics(train_data['label'], predictions)
    if accuracy > best_metric_accuracy:
        best_metric_f1 = f1
        best_metric_accuracy = accuracy
        best_metric_precision = precision
        best_metric_recall = recall
        best_threshold = threshold
print(f"The best threshold is {best_threshold}")

print(f"Training Accuracy: {best_metric_accuracy:.4f}")
print(f"Training Precision: {best_metric_precision:.4f}")
print(f"Training Recall: {best_metric_recall:.4f}")
print(f"Training F1-score: {best_metric_f1:.4f}")

  dist = 1.0 - uv / math.sqrt(uu * vv)
  dist = 1.0 - uv / math.sqrt(uu * vv)
  dist = 1.0 - uv / math.sqrt(uu * vv)
  dist = 1.0 - uv / math.sqrt(uu * vv)
  dist = 1.0 - uv / math.sqrt(uu * vv)
  dist = 1.0 - uv / math.sqrt(uu * vv)
  dist = 1.0 - uv / math.sqrt(uu * vv)
  dist = 1.0 - uv / math.sqrt(uu * vv)
  dist = 1.0 - uv / math.sqrt(uu * vv)
  dist = 1.0 - uv / math.sqrt(uu * vv)
  dist = 1.0 - uv / math.sqrt(uu * vv)
  dist = 1.0 - uv / math.sqrt(uu * vv)
  dist = 1.0 - uv / math.sqrt(uu * vv)
  dist = 1.0 - uv / math.sqrt(uu * vv)
  dist = 1.0 - uv / math.sqrt(uu * vv)
  dist = 1.0 - uv / math.sqrt(uu * vv)
  dist = 1.0 - uv / math.sqrt(uu * vv)
  dist = 1.0 - uv / math.sqrt(uu * vv)
  dist = 1.0 - uv / math.sqrt(uu * vv)
  dist = 1.0 - uv / math.sqrt(uu * vv)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  dist = 1.0 - uv / math.sqrt(uu * vv)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  dist = 1.0 - uv / math.sqrt(uu * 

The best threshold is 0.55
Training Accuracy: 0.5107
Training Precision: 0.5200
Training Recall: 0.2784
Training F1-score: 0.3627


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


The best threshold with training set is coming out to be 0.05. Lets use this to see what sort of a score we would get with the validation set.

In [19]:
predictions = []
threshold = 0.65

for p1, p2 in zip(test_data['phrase1'], test_data['phrase2']):
    emb1 = get_phrase_embedding_tfidf(p1, glove_embeddings, word_prob)
    emb2 = get_phrase_embedding_tfidf(p2, glove_embeddings, word_prob)
    similarity = get_cosine_similarity(emb1, emb2)
    pred = 1 if similarity > threshold else 0
    predictions.append(pred)

actual = test_data['label']
accuracy, precision, recall, f1 = get_eval_metrics(actual, predictions)
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1-score: {f1:.4f}")

Test Accuracy: 0.5105
Test Precision: 0.5517
Test Recall: 0.1120
Test F1-score: 0.1862


  dist = 1.0 - uv / math.sqrt(uu * vv)


In [20]:
train_embeddings, train_labels = get_embeddings_and_labels(train_data, glove_embeddings)
model = LogisticRegression(
    C=10,
    penalty='l1',
    solver='liblinear',
    max_iter=500,
    tol=1e-5,
    class_weight='balanced',
    random_state=42
)
model.fit(train_embeddings, train_labels)

train_predictions = model.predict(train_embeddings)
accuracy, precision, recall, f1 = get_eval_metrics(train_labels, train_predictions)
print(f"Training Accuracy: {accuracy:.4f}")
print(f"Training Precision: {precision:.4f}")
print(f"Training Recall: {recall:.4f}")
print(f"Training F1-score: {f1:.4f}")

valid_embeddings, valid_labels = get_embeddings_and_labels(valid_data, glove_embeddings)
valid_predictions = model.predict(valid_embeddings)
accuracy, precision, recall, f1 = get_eval_metrics(valid_labels, valid_predictions)
print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation Precision: {precision:.4f}")
print(f"Validation Recall: {recall:.4f}")
print(f"Validation F1-score: {f1:.4f}")

test_embeddings, test_labels = get_embeddings_and_labels(test_data, glove_embeddings)
test_predictions = model.predict(test_embeddings)
accuracy, precision, recall, f1 = get_eval_metrics(test_labels, test_predictions)
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1-score: {f1:.4f}")

Training Accuracy: 0.5651
Training Precision: 0.5645
Training Recall: 0.5700
Training F1-score: 0.5672
Validation Accuracy: 0.3710
Validation Precision: 0.3733
Validation Recall: 0.3800
Validation F1-score: 0.3766
Test Accuracy: 0.3540
Test Precision: 0.3574
Test Recall: 0.3660
Test F1-score: 0.3617


In [21]:
train_embeddings, train_labels = get_embeddings_and_labels_sif(train_data, glove_embeddings)
model = LogisticRegression(
    C=10,
    penalty='l1',
    solver='liblinear',
    max_iter=500,
    tol=1e-5,
    class_weight='balanced',
    random_state=42
)
model.fit(train_embeddings, train_labels)

train_predictions = model.predict(train_embeddings)
accuracy, precision, recall, f1 = get_eval_metrics(train_labels, train_predictions)
print(f"Training Accuracy: {accuracy:.4f}")
print(f"Training Precision: {precision:.4f}")
print(f"Training Recall: {recall:.4f}")
print(f"Training F1-score: {f1:.4f}")

valid_embeddings, valid_labels = get_embeddings_and_labels_sif(valid_data, glove_embeddings)
valid_predictions = model.predict(valid_embeddings)
accuracy, precision, recall, f1 = get_eval_metrics(valid_labels, valid_predictions)
print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation Precision: {precision:.4f}")
print(f"Validation Recall: {recall:.4f}")
print(f"Validation F1-score: {f1:.4f}")

test_embeddings, test_labels = get_embeddings_and_labels_sif(test_data, glove_embeddings)
test_predictions = model.predict(test_embeddings)
accuracy, precision, recall, f1 = get_eval_metrics(test_labels, test_predictions)
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1-score: {f1:.4f}")

Training Accuracy: 0.5667
Training Precision: 0.5655
Training Recall: 0.5754
Training F1-score: 0.5704
Validation Accuracy: 0.3500
Validation Precision: 0.3541
Validation Recall: 0.3640
Validation F1-score: 0.3590
Test Accuracy: 0.3555
Test Precision: 0.3607
Test Recall: 0.3740
Test F1-score: 0.3672


In [22]:
train_embeddings, train_labels = get_embeddings_and_labels_tfidf(train_data, glove_embeddings)
model = LogisticRegression(
    C=10,
    penalty='l1',
    solver='liblinear',
    max_iter=500,
    tol=1e-5,
    class_weight='balanced',
    random_state=42
)
model.fit(train_embeddings, train_labels)

train_predictions = model.predict(train_embeddings)
accuracy, precision, recall, f1 = get_eval_metrics(train_labels, train_predictions)
print(f"Training Accuracy: {accuracy:.4f}")
print(f"Training Precision: {precision:.4f}")
print(f"Training Recall: {recall:.4f}")
print(f"Training F1-score: {f1:.4f}")

valid_embeddings, valid_labels = get_embeddings_and_labels_tfidf(valid_data, glove_embeddings)
valid_predictions = model.predict(valid_embeddings)
accuracy, precision, recall, f1 = get_eval_metrics(valid_labels, valid_predictions)
print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation Precision: {precision:.4f}")
print(f"Validation Recall: {recall:.4f}")
print(f"Validation F1-score: {f1:.4f}")

test_embeddings, test_labels = get_embeddings_and_labels_tfidf(test_data, glove_embeddings)
test_predictions = model.predict(test_embeddings)
accuracy, precision, recall, f1 = get_eval_metrics(test_labels, test_predictions)
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1-score: {f1:.4f}")

Training Accuracy: 0.5657
Training Precision: 0.5654
Training Recall: 0.5680
Training F1-score: 0.5667
Validation Accuracy: 0.3760
Validation Precision: 0.3789
Validation Recall: 0.3880
Validation F1-score: 0.3834
Test Accuracy: 0.3500
Test Precision: 0.3521
Test Recall: 0.3570
Test F1-score: 0.3545


It doesn't seem like the score is improving with logistic regression hyper parameters fine tuning. Lets use SVM instead of logistic regression and see if it helps.

In [23]:
train_embeddings = []
train_labels = []
for example in train_data:
    phrase1_embedding = get_phrase_embedding(example['phrase1'], glove_embeddings)
    phrase2_embedding = get_phrase_embedding(example['phrase2'], glove_embeddings)
    embedding = np.concatenate((phrase1_embedding, phrase2_embedding))
    train_embeddings.append(embedding)
    train_labels.append(example['label'])

model = SVC(
    C=1.0,
    kernel='sigmoid',
    degree=3,
    gamma='auto',
    coef0=0.0,
    shrinking=True,
    probability=False,
    tol=1e-3,
    cache_size=200,
    class_weight='balanced',
    verbose=False,
    max_iter=-1,
    decision_function_shape='ovr',
    break_ties=False,
    random_state=42
)
model.fit(train_embeddings, train_labels)

train_predictions = model.predict(train_embeddings)
accuracy, precision, recall, f1 = get_eval_metrics(train_labels, train_predictions)
print(f"Training Accuracy: {accuracy:.4f}")
print(f"Training Precision: {precision:.4f}")
print(f"Training Recall: {recall:.4f}")
print(f"Training F1-score: {f1:.4f}")

valid_embeddings, valid_labels = get_embeddings_and_labels(valid_data, glove_embeddings)
valid_predictions = model.predict(valid_embeddings)
accuracy, precision, recall, f1 = get_eval_metrics(valid_labels, valid_predictions)
print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation Precision: {precision:.4f}")
print(f"Validation Recall: {recall:.4f}")
print(f"Validation F1-score: {f1:.4f}")

test_embeddings, test_labels = get_embeddings_and_labels(test_data, glove_embeddings)
test_predictions = model.predict(test_embeddings)
accuracy, precision, recall, f1 = get_eval_metrics(test_labels, test_predictions)
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1-score: {f1:.4f}")

Training Accuracy: 0.5293
Training Precision: 0.5588
Training Recall: 0.2781
Training F1-score: 0.3714
Validation Accuracy: 0.4460
Validation Precision: 0.3929
Validation Recall: 0.1980
Validation F1-score: 0.2633
Test Accuracy: 0.4335
Test Precision: 0.3588
Test Recall: 0.1690
Test F1-score: 0.2298


In [24]:
train_embeddings, train_labels = get_embeddings_and_labels_sif(valid_data, glove_embeddings)

model = SVC(
    C=1.0,
    kernel='sigmoid',
    degree=3,
    gamma='auto',
    coef0=0.0,
    shrinking=True,
    probability=False,
    tol=1e-3,
    cache_size=200,
    class_weight='balanced',
    verbose=False,
    max_iter=-1,
    decision_function_shape='ovr',
    break_ties=False,
    random_state=42
)
model.fit(train_embeddings, train_labels)

train_predictions = model.predict(train_embeddings)
accuracy, precision, recall, f1 = get_eval_metrics(train_labels, train_predictions)
print(f"Training Accuracy: {accuracy:.4f}")
print(f"Training Precision: {precision:.4f}")
print(f"Training Recall: {recall:.4f}")
print(f"Training F1-score: {f1:.4f}")

valid_embeddings, valid_labels = get_embeddings_and_labels_sif(valid_data, glove_embeddings)
valid_predictions = model.predict(valid_embeddings)
accuracy, precision, recall, f1 = get_eval_metrics(valid_labels, valid_predictions)
print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation Precision: {precision:.4f}")
print(f"Validation Recall: {recall:.4f}")
print(f"Validation F1-score: {f1:.4f}")

test_embeddings, test_labels = get_embeddings_and_labels_sif(test_data, glove_embeddings)
test_predictions = model.predict(test_embeddings)
accuracy, precision, recall, f1 = get_eval_metrics(test_labels, test_predictions)
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1-score: {f1:.4f}")

Training Accuracy: 0.6550
Training Precision: 0.6264
Training Recall: 0.7680
Training F1-score: 0.6900
Validation Accuracy: 0.6550
Validation Precision: 0.6264
Validation Recall: 0.7680
Validation F1-score: 0.6900
Test Accuracy: 0.4925
Test Precision: 0.4937
Test Recall: 0.5910
Test F1-score: 0.5380


In [25]:
train_embeddings = []
train_labels = []
for example in train_data:
    phrase1_embedding = get_phrase_embedding_tfidf(example['phrase1'], glove_embeddings, tfidf_scores)
    phrase2_embedding = get_phrase_embedding_tfidf(example['phrase2'], glove_embeddings, tfidf_scores)
    embedding = np.concatenate((phrase1_embedding, phrase2_embedding))
    train_embeddings.append(embedding)
    train_labels.append(example['label'])

model = SVC(
    C=1.0,
    kernel='sigmoid',
    degree=3,
    gamma='auto',
    coef0=0.0,
    shrinking=True,
    probability=False,
    tol=1e-3,
    cache_size=200,
    class_weight='balanced',
    verbose=False,
    max_iter=-1,
    decision_function_shape='ovr',
    break_ties=False,
    random_state=42
)
model.fit(train_embeddings, train_labels)

train_predictions = model.predict(train_embeddings)
accuracy, precision, recall, f1 = get_eval_metrics(train_labels, train_predictions)
print(f"Training Accuracy: {accuracy:.4f}")
print(f"Training Precision: {precision:.4f}")
print(f"Training Recall: {recall:.4f}")
print(f"Training F1-score: {f1:.4f}")

valid_embeddings, valid_labels = get_embeddings_and_labels_tfidf(valid_data, glove_embeddings)
valid_predictions = model.predict(valid_embeddings)
accuracy, precision, recall, f1 = get_eval_metrics(valid_labels, valid_predictions)
print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation Precision: {precision:.4f}")
print(f"Validation Recall: {recall:.4f}")
print(f"Validation F1-score: {f1:.4f}")

test_embeddings, test_labels = get_embeddings_and_labels_tfidf(test_data, glove_embeddings)
test_predictions = model.predict(test_embeddings)
accuracy, precision, recall, f1 = get_eval_metrics(test_labels, test_predictions)
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1-score: {f1:.4f}")

Training Accuracy: 0.5290
Training Precision: 0.5642
Training Recall: 0.2547
Training F1-score: 0.3510
Validation Accuracy: 0.4460
Validation Precision: 0.3795
Validation Recall: 0.1700
Validation F1-score: 0.2348
Test Accuracy: 0.4305
Test Precision: 0.3395
Test Recall: 0.1470
Test F1-score: 0.2052


An SVM classifier did not help either. Lets go for a little more complex model using a simple neural network

In [26]:
train_embeddings, train_labels = get_embeddings_and_labels_sif(train_data, glove_embeddings)
valid_embeddings, valid_labels = get_embeddings_and_labels_sif(valid_data, glove_embeddings)



METRICS = [
    tf.keras.metrics.AUC(name='roc-auc'),
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name="recall"),
    tfa.metrics.F1Score(name='f1_score', threshold=0.5, num_classes=1)  # Add F1 score
]

model = Sequential([
    Dense(64, activation='relu', input_shape=(train_embeddings.shape[1],)),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=METRICS)

history = model.fit(train_embeddings, train_labels, epochs=10,
                    validation_data=(valid_embeddings, valid_labels))

model.evaluate(valid_embeddings, valid_labels)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.9253295063972473,
 0.11752200126647949,
 0.1940000057220459,
 0.197628453373909,
 0.20000000298023224,
 array([0.19880715], dtype=float32)]

In [27]:
train_embeddings, train_labels = get_embeddings_and_labels_tfidf(train_data, glove_embeddings)
valid_embeddings, valid_labels = get_embeddings_and_labels_tfidf(valid_data, glove_embeddings)


METRICS = [
    tf.keras.metrics.AUC(name='roc-auc'),
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name="recall"),
    tfa.metrics.F1Score(name='f1_score', threshold=0.5, num_classes=1)
]

model = Sequential([
    Dense(64, activation='relu', input_shape=(train_embeddings.shape[1],)),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=METRICS)

history = model.fit(train_embeddings, train_labels, epochs=10,
                    validation_data=(valid_embeddings, valid_labels))

model.evaluate(valid_embeddings, valid_labels)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.8765649795532227,
 0.1577719897031784,
 0.24199999868869781,
 0.16923077404499054,
 0.13199999928474426,
 array([0.1483146], dtype=float32)]

In [28]:
train_embeddings, train_labels = get_embeddings_and_labels(train_data, glove_embeddings)
valid_embeddings, valid_labels = get_embeddings_and_labels(valid_data, glove_embeddings)


METRICS = [
    tf.keras.metrics.AUC(name='roc-auc'),
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name="recall"),
    tfa.metrics.F1Score(name='f1_score', threshold=0.5, num_classes=1)
]

model = Sequential([
    Dense(64, activation='relu', input_shape=(train_embeddings.shape[1],)),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=METRICS)

history = model.fit(train_embeddings, train_labels, epochs=10,
                    validation_data=(valid_embeddings, valid_labels))

model.evaluate(valid_embeddings, valid_labels)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.8922796845436096,
 0.15143799781799316,
 0.23600000143051147,
 0.26428571343421936,
 0.29600000381469727,
 array([0.2792453], dtype=float32)]

In [30]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification

In [31]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForSequenceClassification.from_pretrained(model_name)

def prepare_input(phrase1, phrase2):
    inputs = tokenizer(phrase1, phrase2, return_tensors='tf', padding=True, truncation=True, max_length=128)
    return inputs

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
example_inputs = prepare_input("this is awesome", "this is terrible")

In [None]:
type(example_inputs)

transformers.tokenization_utils_base.BatchEncoding

In [91]:
def predict_label(phrase1, phrase2):
    inputs = prepare_input(phrase1, phrase2)
    outputs = model(inputs)
    logits = outputs.logits
    predicted_label = tf.argmax(logits, axis=1).numpy()[0]
    return predicted_label
def compute_accuracy(data):
    correct_predictions = 0
    total_samples = len(data['label'])

    for i in range(total_samples):
        phrase1 = data['phrase1'][i]
        phrase2 = data['phrase2'][i]
        true_label = data['label'][i]
        predicted_label = predict_label(phrase1, phrase2)
        if predicted_label == true_label:
            correct_predictions += 1

    accuracy = correct_predictions / total_samples
    return accuracy
train_accuracy = compute_accuracy(train_data)
valid_accuracy = compute_accuracy(valid_data)
test_accuracy = compute_accuracy(test_data)

print(f"Training accuracy: {train_accuracy:.2f}")
print(f"Validation accuracy: {valid_accuracy:.2f}")
print(f"Test accuracy: {test_accuracy:.2f}")

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training accuracy: 0.50
Validation accuracy: 0.49
Test accuracy: 0.48


In [97]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [98]:
def calculate_similarity(phrase1, phrase2):
    doc1 = nlp(phrase1)
    doc2 = nlp(phrase2)
    similarity = doc1.similarity(doc2)
    return similarity

def calculate_accuracy(data):
    correct_predictions = 0
    total_examples = len(data)

    for example in data:
        phrase1 = example['phrase1']
        phrase2 = example['phrase2']
        label = example['label']

        similarity = calculate_similarity(phrase1, phrase2)
        predicted_label = 1 if similarity >= 0.5 else 0

        if predicted_label == label:
            correct_predictions += 1

    accuracy = correct_predictions / total_examples
    return accuracy

train_accuracy = calculate_accuracy(train_data)
valid_accuracy = calculate_accuracy(valid_data)
test_accuracy = calculate_accuracy(test_data)

print(f"Train Accuracy: {train_accuracy:.2f}")
print(f"Validation Accuracy: {valid_accuracy:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")

  similarity = doc1.similarity(doc2)


Train Accuracy: 0.50
Validation Accuracy: 0.50
Test Accuracy: 0.52
