SBERT

In [None]:
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS as english_stop_words
import numpy as np
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner', "attribute_ruler"])
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
from sentence_transformers import SentenceTransformer

In [None]:
pip install -U sentence-transformers

In [6]:
# Read the train and dev splits from CSV files in the working directory.
train, dev = (
    pd.read_csv(csv_path)
    for csv_path in ('train_responses.csv', 'dev_responses.csv')
)

# Smoothing helper for NLTK sentence-level BLEU.
smoothingfunction = SmoothingFunction()

In [None]:
class sbert():
  """Retrieval baseline built on Sentence-BERT prompt embeddings.

  Every dev prompt is embedded and matched to the train prompt with the
  highest cosine similarity; that train row's ``model_response`` becomes the
  retrieved response, which is scored against the dev reference with BLEU.
  All data is taken from ``self.train`` / ``self.dev`` so the class does not
  depend on notebook-level variables.
  """

  # Pretrained sentence-transformers checkpoint used to embed the prompts.
  MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
  # Number of dev prompts compared against the whole train set per similarity call.
  BATCH_SIZE = 256

  def __init__(self):
    """Load the train and dev splits from CSV files in the working directory."""
    self.train = pd.read_csv('train_responses.csv')
    self.dev = pd.read_csv('dev_responses.csv')

  def clean_and_join(self, doc):
    """Return the lower-cased, space-joined text of all non-punctuation tokens in ``doc``."""
    clean_tokens = [token.text.lower() for token in doc if not token.is_punct]
    return ' '.join(clean_tokens)

  def lemma(self, data):
    """Tokenize and clean the ``user_prompt`` column of ``data``.

    Despite the name, tokens are not lemmatized here: ``clean_and_join``
    keeps ``token.text``.

    Returns:
        list[str]: one cleaned sentence per row of ``data``.
    """
    prompts = list(data['user_prompt'])
    return [self.clean_and_join(doc) for doc in nlp.pipe(prompts)]

  def _encode_splits(self):
    """Embed the cleaned train and dev prompts; returns ``(train_vectors, dev_vectors)``."""
    model = SentenceTransformer(self.MODEL_NAME)
    train_vectors = model.encode(self.lemma(self.train))
    dev_vectors = model.encode(self.lemma(self.dev))
    return train_vectors, dev_vectors

  def for_embed_delete_it_later(self):
    """Return ``(train_vectors, dev_vectors)``; kept so existing callers keep working."""
    return self._encode_splits()

  def _nearest_train_rows(self, dev_matrix, train_matrix):
    """Return, per dev row, the position of the most cosine-similar train row."""
    best = []
    # Batching bounds the size of the dense similarity matrix held in memory.
    for start in range(0, dev_matrix.shape[0], self.BATCH_SIZE):
      scores = cosine_similarity(dev_matrix[start:start + self.BATCH_SIZE], train_matrix)
      best.extend(scores.argmax(axis=1))
    return np.asarray(best, dtype=int)

  def best_answer(self):
    """Attach the retrieved train response to every dev row.

    Returns:
        pandas.DataFrame: a copy of ``self.dev`` with an added
        ``retrieved_response`` column; ``self.dev`` itself is not modified.
    """
    train_vectors, dev_vectors = self._encode_splits()
    train_matrix = np.asarray(train_vectors)
    train_matrix = train_matrix.reshape(len(train_matrix), -1)
    dev_matrix = np.asarray(dev_vectors)
    dev_matrix = dev_matrix.reshape(len(dev_matrix), -1)

    nearest = self._nearest_train_rows(dev_matrix, train_matrix)

    modified_dev = self.dev.copy()
    # Positional lookup/assignment: does not rely on either frame having a 0..n-1 index.
    modified_dev['retrieved_response'] = self.train['model_response'].to_numpy()[nearest]
    return modified_dev

  def score(self):
    """Return the mean sentence-level BLEU (unigram + bigram, NIST smoothing) on the dev split."""
    modified_dev = self.best_answer()
    references = modified_dev['model_response'].astype(str)
    hypotheses = modified_dev['retrieved_response'].astype(str)
    smoothing = SmoothingFunction().method3
    bleu_scores = [
        sentence_bleu([reference.split()], hypothesis.split(),
                      weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing)
        for reference, hypothesis in zip(references, hypotheses)
    ]
    return np.mean(pd.Series(bleu_scores, dtype=float))

In [None]:
# Build the SBERT retrieval baseline; its constructor reads the train/dev CSVs from the working directory.
MODEL = sbert()

In [None]:
# Run retrieval on the dev split and display the mean BLEU score.
MODEL.score()



0.09913395882522728

Doc2Vec: Static Text Representation

In [None]:
class Doc2VecEstimator():
    """Retrieval baseline built on Doc2Vec prompt vectors.

    A Doc2Vec model is trained on the cleaned train prompts; each dev prompt
    gets an inferred vector and is matched to the train prompt with the
    highest cosine similarity. The matched row's ``model_response`` is scored
    against the dev reference with BLEU.

    NOTE(review): ``Doc2Vec`` and ``TaggedDocument`` are used here but are not
    imported in the visible notebook cells; presumably
    ``from gensim.models.doc2vec import Doc2Vec, TaggedDocument`` is needed - confirm.
    """

    # Number of dev prompts compared against the whole train set per similarity call.
    BATCH_SIZE = 256

    def __init__(self,vector_size=300, window=5, min_count=5, epochs=500, dbow_words = 0, dm = 1 , hs=0, negative = 10):
        """Store the Doc2Vec hyper-parameters and load the train/dev CSV splits.

        The keyword arguments are forwarded unchanged to ``Doc2Vec`` in ``fit``.
        """
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.epochs = epochs
        self.dbow_words = dbow_words
        self.dm = dm
        self.hs = hs
        self.negative = negative
        self.model = None  # set by fit()
        self.train = pd.read_csv('train_responses.csv')
        self.dev = pd.read_csv('dev_responses.csv')

    def clean_and_join(self, doc):
        """Return the lower-cased, space-joined lemmas of all non-punctuation tokens in ``doc``."""
        # NOTE(review): the notebook loads `nlp` with 'tagger' and 'attribute_ruler'
        # disabled; confirm that `token.lemma_` is still meaningful in that setup.
        clean_tokens = [token.lemma_.lower() for token in doc if not token.is_punct]
        return ' '.join(clean_tokens)

    def tagged(self, data):
        """Turn the ``user_prompt`` column of ``data`` into ``TaggedDocument`` objects.

        Each document is tagged with its row position rendered as a string.
        """
        prompts = list(data['user_prompt'])
        cleaned = [self.clean_and_join(doc) for doc in nlp.pipe(prompts)]
        return [TaggedDocument(words=text.split(), tags=[str(idx)])
                for idx, text in enumerate(cleaned)]

    def fit(self, train, dev):
        """Train Doc2Vec on ``train`` prompts and vectorize both splits.

        Prints the parameters of the trained model.

        Returns:
            tuple[list, list]: ``(train_vectors, dev_vectors)``; train vectors
            are the learned document vectors, dev vectors are inferred.
        """
        self.model = Doc2Vec(
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            epochs=self.epochs,
            dbow_words=self.dbow_words,
            dm=self.dm,
            hs=self.hs,
            negative=self.negative)
        # Tag the train prompts once and reuse the result (the spaCy pass is expensive).
        train_documents = self.tagged(train)
        self.model.build_vocab(train_documents)
        self.model.train(train_documents, total_examples=len(train_documents), epochs=self.epochs)
        # Look vectors up by the tag each document was given.
        train_vectors = [self.model.dv[doc.tags[0]] for doc in train_documents]
        dev_vectors = [self.model.infer_vector(doc.words) for doc in self.tagged(dev)]

        print("Model Parameters:")
        print(f"Vector Size: {self.model.vector_size}")
        print(f"Window: {self.model.window}")
        print(f"HS: {self.model.hs}")
        print(f"Sample: {self.model.sample}")
        print(f"Negative: {self.model.negative}")
        print(f"Min Count: {self.model.min_count}")
        print(f"Workers: {self.model.workers}")
        print(f"Epochs: {self.model.epochs}")
        print(f"DM: {self.model.dm}")
        print(f"DBOW Words: {self.model.dbow_words}")

        return train_vectors, dev_vectors

    def _nearest_train_rows(self, dev_matrix, train_matrix):
        """Return, per dev row, the position of the most cosine-similar train row."""
        best = []
        # Batching bounds the size of the dense similarity matrix held in memory.
        for start in range(0, dev_matrix.shape[0], self.BATCH_SIZE):
            scores = cosine_similarity(dev_matrix[start:start + self.BATCH_SIZE], train_matrix)
            best.extend(scores.argmax(axis=1))
        return np.asarray(best, dtype=int)

    def best_answer(self):
        """Fit the model on the instance's data and attach retrieved responses.

        Returns:
            pandas.DataFrame: a copy of ``self.dev`` with an added
            ``retrieved_response`` column; ``self.dev`` itself is not modified.
        """
        train_vectors, dev_vectors = self.fit(self.train, self.dev)
        train_matrix = np.asarray(train_vectors)
        train_matrix = train_matrix.reshape(len(train_matrix), -1)
        dev_matrix = np.asarray(dev_vectors)
        dev_matrix = dev_matrix.reshape(len(dev_matrix), -1)

        nearest = self._nearest_train_rows(dev_matrix, train_matrix)

        modified_dev = self.dev.copy()
        # Positional lookup/assignment: does not rely on either frame having a 0..n-1 index.
        modified_dev['retrieved_response'] = self.train['model_response'].to_numpy()[nearest]
        return modified_dev

    def score(self):
        """Print and return the mean sentence-level BLEU (unigram + bigram) on the dev split."""
        modified_dev = self.best_answer()
        references = modified_dev['model_response'].astype(str)
        hypotheses = modified_dev['retrieved_response'].astype(str)
        smoothing = SmoothingFunction().method3
        bleu_scores = [
            sentence_bleu([reference.split()], hypothesis.split(),
                          weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing)
            for reference, hypothesis in zip(references, hypotheses)
        ]
        mean_bleu = np.mean(pd.Series(bleu_scores, dtype=float))
        print(mean_bleu)
        return mean_bleu

In [None]:
# Build a Doc2Vec estimator with dm=0 and the settings below, then report mean dev BLEU.
estimator = Doc2VecEstimator(
    vector_size=300,
    window=2,
    min_count=3,
    epochs=500,
    dbow_words=0,
    dm=0,
    hs=0,
    negative=10,
)
estimator.score()

TF-IDF: Discrete Text Representation

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
class tfidf_vect():
  """Retrieval baseline built on TF-IDF prompt vectors.

  Dev prompts are matched to train prompts twice - by highest cosine
  similarity and by smallest Euclidean distance - and the retrieved
  ``model_response`` values are scored against the dev references with BLEU.
  """

  # Number of dev prompts compared against the whole train set per pairwise call.
  BATCH_SIZE = 256

  def __init__(self, train_responses, dev_responses):
    """Keep references to the train and dev frames."""
    self.train = train_responses
    self.dev = dev_responses

  def clean_and_join(self, doc):
    """Return the lower-cased, space-joined text of all non-punctuation tokens in ``doc``."""
    clean_tokens = [token.text.lower() for token in doc if not token.is_punct]
    return ' '.join(clean_tokens)

  def lemmatized_prompts(self, data):
    """Tokenize and clean the ``user_prompt`` column of ``data``.

    Despite the name, tokens are not lemmatized: ``clean_and_join`` keeps
    ``token.text``.

    Returns:
        list[str]: one cleaned sentence per row of ``data``.
    """
    prompts = list(data['user_prompt'])
    # Call through `self`: a bare `clean_and_join` is not defined at module level.
    return [self.clean_and_join(doc) for doc in nlp.pipe(prompts)]

  def tfidf(self, train, dev):
    """Fit a word uni/bi-gram TF-IDF vectorizer on ``train`` and transform both splits.

    Returns:
        tuple: ``(train_tfidf, dev_tfidf)`` matrices sharing one vocabulary.
    """
    tfidf_vectorizer = TfidfVectorizer(analyzer='word',
                                   sublinear_tf=True,
                                   ngram_range=(1, 2), norm = 'l1', smooth_idf = True)

    train_tfidf = tfidf_vectorizer.fit_transform(self.lemmatized_prompts(train))
    dev_tfidf = tfidf_vectorizer.transform(self.lemmatized_prompts(dev))
    return train_tfidf, dev_tfidf

  def _best_rows(self, dev_tfidf, train_tfidf, pairwise, pick):
    """Return, per dev row, the train row position chosen by ``pick`` over ``pairwise`` scores.

    ``pairwise`` is a function such as ``cosine_similarity`` or
    ``euclidean_distances``; ``pick`` is ``np.argmax`` or ``np.argmin``.
    """
    best = []
    # Batching bounds the size of the dense score matrix held in memory.
    for start in range(0, dev_tfidf.shape[0], self.BATCH_SIZE):
      scores = pairwise(dev_tfidf[start:start + self.BATCH_SIZE], train_tfidf)
      best.extend(pick(scores, axis=1))
    return np.asarray(best, dtype=int)

  def _mean_bleu(self, references, hypotheses):
    """Return the mean sentence-level BLEU (unigram + bigram) over aligned string pairs."""
    smoothing = SmoothingFunction().method3
    scores = [
        sentence_bleu([reference.split()], hypothesis.split(),
                      weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing)
        for reference, hypothesis in zip(references, hypotheses)
    ]
    return pd.Series(scores, dtype=float).mean()

  def blue_score(self, train, dev):
    """Print the mean dev BLEU for cosine-similarity and Euclidean-distance retrieval.

    Neither ``train`` nor ``dev`` is modified.
    """
    train_tfidf, dev_tfidf = self.tfidf(train, dev)
    references = dev['model_response'].astype(str)
    # Positional array of candidate responses, indexed by the row positions found below.
    responses = train['model_response'].astype(str).to_numpy()

    nearest = self._best_rows(dev_tfidf, train_tfidf, cosine_similarity, np.argmax)
    cosine_bleu = self._mean_bleu(references, responses[nearest])
    print(f'Using cosine similarities blue score is {cosine_bleu}')

    nearest = self._best_rows(dev_tfidf, train_tfidf, euclidean_distances, np.argmin)
    euclidean_bleu = self._mean_bleu(references, responses[nearest])
    print(f'Using eucledian distance blue score is {euclidean_bleu}')

In [None]:
# TF-IDF retrieval baseline over the train/dev frames loaded earlier in the notebook.
# NOTE: this rebinds MODEL, which earlier held the sbert instance.
MODEL = tfidf_vect(train, dev)
# Prints the mean BLEU for cosine-similarity retrieval and for Euclidean-distance retrieval.
MODEL.blue_score(train,dev)