In [2]:
#!pip install sentence_transformers
#!pip install gdown

In [6]:
# Download sentiment datasets (Stanford Sentiment Treebank v2, train and dev).
#!gdown "https://drive.google.com/uc?id=1j6GNle4N6dIZ06r8yHFqW2FEMmKrxVbW"
#!gdown "https://drive.google.com/uc?id=1WDFTmif6S0boBQf4qR0jy2M_QaywO9QO"

In [4]:
# Setup.
import codecs
import numpy as np
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding model used by embedding_fn and
# read_dataset below. The first run downloads the model weights (the progress
# bar in this cell's output shows a 405M download).
# See: https://www.aclweb.org/anthology/D19-1410.pdf
embed_model = SentenceTransformer('bert-base-nli-mean-tokens')

# Embed a single sentence and return its vector.
def embedding_fn(sentence):
    """Return the embedding that ``embed_model`` produces for one sentence.

    The sentence is wrapped in a one-element list, handed to
    ``embed_model.encode``, and the first (and only) row of the result is
    returned.
    """
    batch = [sentence]
    vectors = embed_model.encode(batch)
    return vectors[0]

# Read a dataset file and embed its first n sentences.
def read_dataset(filepath, n=100):
    """Read a tab-separated sentiment file and embed its first ``n`` sentences.

    The first line of the file is treated as a header and skipped. Every
    following non-blank line is split on tabs: field 0 is the sentence and
    field 1 is parsed as an integer label (1 is positive, 0 is negative).

    Args:
        filepath: Path of the UTF-8 encoded TSV file to read.
        n: Number of leading rows to embed and return.

    Returns:
        A tuple ``(embeddings, labels)``. ``embeddings`` is whatever
        ``embed_model.encode`` returns for the first ``n`` sentences, and
        ``labels`` is a NumPy integer array with the matching labels.

    Raises:
        ValueError: If a label field cannot be parsed as an integer.
        IndexError: If a non-blank row has no tab-separated label field.
    """
    print("Reading sentences.")
    sentences = []
    labels = []
    # The context manager guarantees the handle is closed, even when a row
    # fails to parse; previously the file was never closed.
    with codecs.open(filepath, 'rb', encoding='utf-8') as infile:
        for line_number, line in enumerate(infile):
            if line_number == 0:
                continue  # Skip header
            if not line.strip():
                continue  # Ignore blank lines (e.g. a trailing newline at EOF)
            split = line.split("\t")
            sentences.append(split[0])
            labels.append(int(split[1]))  # Note: 1 is positive, 0 is negative.
    print("Embedding sentences.")
    embeddings = embed_model.encode(sentences[0:n])
    # np.array (unlike np.stack) also accepts an empty selection.
    labels = np.array(labels[0:n], dtype=int)
    return embeddings, labels
  
# Embed the first 500 rows of the training file and the first 50 rows of the
# dev file; X_* hold the embeddings and Y_* the matching 0/1 labels.
X_train, Y_train = read_dataset("train.tsv", n=500)
X_dev, Y_dev = read_dataset("dev.tsv", n=50)


100%|████████████████████████████████████████████████████████████████████████████████| 405M/405M [07:59<00:00, 845kB/s]


Reading sentences.
Embedding sentences.
Reading sentences.
Embedding sentences.


In [7]:
# Cosine similarity is used for the nearest-neighbour search because the
# embeddings are high-dimensional; Euclidean distance does not work as well.
print(X_train.shape)

(500, 768)


In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def knn(X_train, Y_train, x_dev, k=10):
    """Classify one embedding by majority vote of its nearest neighbours.

    Neighbours are the training rows with the highest cosine similarity to
    ``x_dev``.

    Args:
        X_train: 2-D array of training embeddings, one row per example.
        Y_train: Sequence of 0/1 labels aligned with the rows of ``X_train``.
        x_dev: 1-D embedding of the example to classify.
        k: Number of neighbours that vote. If the training set has fewer
            than ``k`` rows, all of them vote.

    Returns:
        1 if strictly more than half of the voting neighbours are labelled 1,
        otherwise 0 (so an exact tie yields 0).

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {}".format(k))
    # Accept plain lists as well as arrays for the fancy indexing below.
    Y_train = np.asarray(Y_train)
    # cosine_similarity expects 2-D input; take the single resulting row.
    similarities = cosine_similarity([x_dev], X_train)[0]
    # Divide by the number of neighbours actually available, not by k, so a
    # small training set does not bias the vote towards the negative class.
    n_neighbors = min(k, similarities.shape[0])
    # argsort is ascending, so the most similar rows are at the end.
    nearest = np.argsort(similarities)[-n_neighbors:]
    proportion_positive = np.mean(Y_train[nearest])
    return 1 if proportion_positive > 0.5 else 0

def knn_batch(X_train, Y_train, X_dev, k=10):
    """Classify many embeddings at once by nearest-neighbour majority vote.

    Vectorised counterpart of ``knn``: every row of ``X_dev`` is compared to
    every row of ``X_train`` using cosine similarity.

    Args:
        X_train: 2-D array of training embeddings, shape (n_train, dim).
        Y_train: Sequence of 0/1 labels aligned with the rows of ``X_train``.
        X_dev: 2-D array of embeddings to classify, shape (n_dev, dim).
        k: Number of neighbours that vote. If the training set has fewer
            than ``k`` rows, all of them vote.

    Returns:
        Boolean array of length n_dev; True where strictly more than half of
        the voting neighbours are labelled 1 (an exact tie yields False).

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {}".format(k))
    # Accept plain lists as well as arrays for the fancy indexing below.
    Y_train = np.asarray(Y_train)
    # Shape: n_dev x n_train
    similarities = cosine_similarity(X_dev, X_train)
    # Vote with the neighbours actually available so that k > n_train does
    # not deflate the positive proportion.
    n_neighbors = min(k, similarities.shape[1])
    # Ascending sort per row; the last columns are the most similar rows.
    # Shape after slicing: n_dev x n_neighbors
    nearest = np.argsort(similarities, axis=-1)[:, -n_neighbors:]
    near_labels = Y_train[nearest]
    proportion_positive = np.mean(near_labels, axis=-1)
    predictions = proportion_positive > 0.5
    return predictions
  

# Evaluate on the dev set: fraction of predictions that match the labels.
dev_predictions = knn_batch(X_train, Y_train, X_dev, k=10)
dev_matches = dev_predictions == Y_dev
dev_accuracy = np.average(dev_matches)
print("Dev accuracy: {}".format(dev_accuracy))

# Classify a sentence of your own choosing.
sentence = "I like the food but I dislike the atmosphere"
sentence_embedding = embedding_fn(sentence)
prediction = knn(X_train, Y_train, sentence_embedding, k=10)
print("Sentence: {}".format(sentence))
if prediction:
    label_text = "positive"
else:
    label_text = "negative"
print("Label: {}".format(label_text))


Dev accuracy: 0.92
1
Sentence: I like the food but I dislike the atmosphere
Label: negative
