In [None]:
!pip install flair
!pip install pandas
!pip install allennlp

In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence, ELMoEmbeddings


pd.set_option('display.width', 1000)

In [7]:
eval_data = pd.read_csv('CovidBERT Evaluation Dataset - Combined.csv')
eval_data.drop(columns=['Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5'], inplace=True)
eval_data.head(5)

Unnamed: 0,question_1,question_2,similar
0,What is a novel coronavirus?,What is a new coronavirus?,1
1,Why is the disease being called coronavirus di...,Why is the name of the disease coronavirus dis...,1
2,Why might someone blame or avoid individuals a...,What would be the reason to blame or avoid ind...,1
3,How can people help stop stigma related to COV...,What can be done to stop stigma related to COV...,1
4,What is the source of the virus?,Where does the virus come from?,1


In [None]:
def init_embeddings(embedding, embedding_type, pooling_type):

    # initialize the word embeddings
    if embedding_type == 'classic': 
        embeddings = [WordEmbeddings(embedding)]
    elif embedding_type == 'elmo':
        embeddings = [ELMoEmbeddings(embedding)]
    elif embedding_type == 'flair':
        glove_embedding = WordEmbeddings('glove')
        flair_embedding_forward = FlairEmbeddings('news-forward')
        flair_embedding_backward = FlairEmbeddings('news-backward')
        embeddings = [glove_embedding, flair_embedding_forward, flair_embedding_backward]

    # initialize the document embeddings, mode = mean
    document_embeddings = DocumentPoolEmbeddings(embeddings, fine_tune_mode='nonlinear', pooling=pooling_type)
    return document_embeddings

def compute_similarity(df, document_embeddings):

    # create an example sentence
    sentence1 = Sentence(df['question_1'])
    sentence2 = Sentence(df['question_2'])

    # embed the sentence with our document embedding
    document_embeddings.embed(sentence1)
    document_embeddings.embed(sentence2)

    # now check out the embedded sentence.
    sentence_embedding1 = sentence1.get_embedding().detach().numpy().tolist()
    sentence_embedding2 = sentence2.get_embedding().detach().numpy().tolist()

    score = cosine_similarity([sentence_embedding1], [sentence_embedding2])
    return score[0][0]

def word_embeddings(df, embedding, embedding_type):
    pooling = ['mean','min', 'max']
    for pooling_op in pooling:
        col_name = embedding + '_' + pooling_op
        document_embeddings = init_embeddings(embedding, embedding_type, pooling_op)
        df[col_name] = df.apply(lambda x: compute_similarity(x, document_embeddings), axis=1)
        print('Done with ' + pooling_op)
    return df

In [8]:
eval_data = word_embeddings(eval_data, 'glove', 'classic')
eval_data.head()

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Unnamed: 0,question_1,question_2,similar,glove_mean,glove_min,glove_max
0,What is a novel coronavirus?,What is a new coronavirus?,1,0.969663,0.96232,0.953024
1,Why is the disease being called coronavirus di...,Why is the name of the disease coronavirus dis...,1,0.987822,0.97352,0.980981
2,Why might someone blame or avoid individuals a...,What would be the reason to blame or avoid ind...,1,0.985725,0.987596,0.979712
3,How can people help stop stigma related to COV...,What can be done to stop stigma related to COV...,1,0.985724,0.987656,0.98624
4,What is the source of the virus?,Where does the virus come from?,1,0.9369,0.901894,0.861867


In [9]:
eval_data = word_embeddings(eval_data, 'en-news', 'classic')
eval_data.head()

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Unnamed: 0,question_1,question_2,similar,glove_mean,glove_min,glove_max,en-news_mean,en-news_min,en-news_max
0,What is a novel coronavirus?,What is a new coronavirus?,1,0.969663,0.96232,0.953024,0.986088,0.964997,0.961819
1,Why is the disease being called coronavirus di...,Why is the name of the disease coronavirus dis...,1,0.987822,0.97352,0.980981,0.994423,0.993324,0.988986
2,Why might someone blame or avoid individuals a...,What would be the reason to blame or avoid ind...,1,0.985725,0.987596,0.979712,0.992363,0.980096,0.981828
3,How can people help stop stigma related to COV...,What can be done to stop stigma related to COV...,1,0.985724,0.987656,0.98624,0.988362,0.981589,0.978043
4,What is the source of the virus?,Where does the virus come from?,1,0.9369,0.901894,0.861867,0.961313,0.926174,0.891326


In [19]:
eval_data = word_embeddings(eval_data, 'small', 'elmo')
eval_data.head()

Done with min
Done with max


Unnamed: 0,question_1,question_2,similar,glove_mean,glove_min,glove_max,en-news_mean,en-news_min,en-news_max,small_mean,small_min,small_max
0,What is a novel coronavirus?,What is a new coronavirus?,1,0.969663,0.96232,0.953024,0.986088,0.964997,0.961819,0.916189,0.965656,0.961327
1,Why is the disease being called coronavirus di...,Why is the name of the disease coronavirus dis...,1,0.987822,0.97352,0.980981,0.994423,0.993324,0.988986,0.939447,0.94704,0.950697
2,Why might someone blame or avoid individuals a...,What would be the reason to blame or avoid ind...,1,0.985725,0.987596,0.979712,0.992363,0.980096,0.981828,0.945514,0.94879,0.960128
3,How can people help stop stigma related to COV...,What can be done to stop stigma related to COV...,1,0.985724,0.987656,0.98624,0.988362,0.981589,0.978043,0.930974,0.946333,0.939386
4,What is the source of the virus?,Where does the virus come from?,1,0.9369,0.901894,0.861867,0.961313,0.926174,0.891326,0.811464,0.877028,0.864786


In [None]:
eval_data = word_embeddings(eval_data, 'flair', 'flair')
eval_data.head()

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Done with mean


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
eval_data.rename(columns={'en-news_mean':'fasstext_mean',
                          'en-news_max':'fasstext_max',
                          'en-news_min':'fasstext_min',
                          'small_mean':'elmo_mean',
                          'small_max':'elmo_max',
                          'small_min':'elmo_min'}, inplace=True)

In [None]:
eval_data.to_csv('embedding_similarity.csv', index=False)