# Class for text vectorization
In this notebook, we create a class for creating embeddings using different techniques. 

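To test our functions, we'll use the MeDAL dataset, which is available on [Hugging Face](https://huggingface.co/datasets/medal) and was introduced in this [paper](https://aclanthology.org/2020.clinicalnlp-1.15/). Note: the data-loading cell below currently loads the `wikitext-103-v1` configuration of WikiText rather than MeDAL, so the sample text shown in the tests comes from WikiText.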

In [None]:
# install libraries
!pip install datasets
!pip install wget
!pip install transformers
!pip install sentence-transformers

In [None]:
# load libraries
import wget
import pandas as pd
import numpy as np
import transformers
import gensim.downloader as api
from datasets import load_dataset
from gensim.models import Word2Vec,KeyedVectors
from gensim.utils import simple_preprocess
from sentence_transformers import SentenceTransformer


In [None]:
# fetch the 'wikitext-103-v1' configuration of the "wikitext" dataset;
# its text rows are the raw material for the embedding tests further down
dataset = load_dataset(path="wikitext", name='wikitext-103-v1')

In [None]:
# pretrained word2vec vectors, fetched through gensim's downloader API
wv = api.load(name='word2vec-google-news-300')

In [None]:
# sentence-transformers wrapper around the 'bert-base-uncased' checkpoint
model = SentenceTransformer(model_name_or_path='bert-base-uncased')

# Main Class

In [None]:
from IPython.terminal import embed
class Create_embedding:
    """
    A class used to convert text into a single embedding vector
    using various techniques.

    The text is tokenized once, at construction time, with gensim's
    ``simple_preprocess``. Both embedding methods work on those tokens and
    return the mean of the per-token vectors.

    Attributes
    ----------
    text : str
        text for which we want the embedding.
    path_to_model : object or str, optional
        A loaded model object (word2vec keyed vectors for ``word_2_vec``,
        a sentence-transformers model for ``transformers``). For
        ``transformers`` it may also be the path to the dir where a custom
        model is saved, or the name of a model.
        All possible hugging face models can be found here : https://huggingface.co/models?library=sentence-transformers&sort=downloads
    clean_text : list of str
        tokens produced by ``preprocess``.
    vector : numpy.ndarray
        last embedding computed by ``word_2_vec`` (only set after that call).

    Methods
    -------
    preprocess
        Tokenize ``text`` and store the tokens in ``clean_text``.
    word_2_vec
        Return the word2vec embedding for the text.
    transformers
        Return the embedding produced by a sentence-transformers model.

    """

    def __init__(self, text, path_to_model=None):
        self.text = text
        self.path_to_model = path_to_model
        self.clean_text = self.preprocess()

    def __str__(self):
        return "This class will create word embedding for:\n" + str(self.text[0:100])

    def preprocess(self):
        """Tokenize ``self.text`` with ``simple_preprocess``; store and return the tokens."""
        self.clean_text = simple_preprocess(self.text)
        return self.clean_text

    def word_2_vec(self, model=None):
        """
        Return the mean word2vec vector of the tokens found in the model.

        Parameters
        ----------
        model : object, optional
            Loaded word-vector model, used when no loaded model was given to
            the constructor. It must support ``token in model`` and
            ``model[token]`` (gensim ``KeyedVectors`` does).

        Returns
        -------
        numpy.ndarray
            Mean of the vectors of all in-vocabulary tokens. Also stored in
            ``self.vector``.

        Raises
        ------
        Exception
            If no loaded model is available from the constructor or ``model``.
        ValueError
            If none of the tokens is in the model's vocabulary.
        """
        # The constructor's model keeps precedence; `model` is the fallback.
        # Strings are skipped: a name/path is not a loaded word-vector model.
        candidates = (self.path_to_model, model)
        keyed_vectors = next(
            (m for m in candidates if m is not None and not isinstance(m, str)),
            None,
        )
        if keyed_vectors is None:
            raise Exception("Model not found. Please load a word2vec model.")

        # `in` / `[]` work on every gensim version; the `.vocab` attribute
        # was removed from KeyedVectors in gensim 4.
        known_vectors = [
            keyed_vectors[word] for word in self.clean_text if word in keyed_vectors
        ]
        if not known_vectors:
            # np.mean of an empty list would silently yield NaN.
            raise ValueError(
                "Cannot build a word2vec embedding: none of the tokens in the "
                "text are in the model vocabulary."
            )

        self.vector = np.mean(known_vectors, axis=0)
        return self.vector

    def transformers(self, model=None):
        '''
        Return the mean sentence-transformers embedding of the text tokens.

        inputs : model, name of the model we want to use (examples are :
        'bert-base-uncased', 'paraphrase-MiniLM-L6-v2'..) or an already loaded
        model object exposing ``encode``. It is only consulted when no
        ``path_to_model`` was given to the constructor; the default is
        'bert-base-uncased'.

        returns : numpy array, the mean over the token embeddings.

        raises : ValueError if preprocessing left no tokens to encode.
        '''
        if not self.clean_text:
            # Fail before loading a model; averaging zero rows has no meaning.
            raise ValueError(
                "Cannot build a transformer embedding: the text has no tokens "
                "after preprocessing."
            )

        # The constructor's model keeps precedence over the `model` argument.
        source = self.path_to_model if self.path_to_model is not None else model
        if source is None:
            source = 'bert-base-uncased'

        # A string is a model name or a directory path and has to be loaded;
        # anything else is assumed to be a ready-to-use model object.
        encoder = SentenceTransformer(source) if isinstance(source, str) else source

        # NOTE(review): each token is encoded on its own and the results are
        # averaged, so the model never sees the words in context. Encoding
        # `self.text` directly may be what is wanted - confirm before changing,
        # since it would alter every embedding produced so far.
        embedding = encoder.encode(self.clean_text)
        return np.mean(embedding, axis=0)



# Testing embedding class

## Word2Vec

In [None]:
# build a dataframe of the long passages (more than 200 whitespace-separated
# tokens) found among the first 500 training rows
# slicing the rows first avoids materialising the whole text column
first_rows = dataset['train'][:500]['text']
sentences = [passage for passage in first_rows if len(passage.split()) > 200]
# convert to pandas dataframe; "Sno" is sized from the data rather than
# hard-coded, so the cell keeps working if the number of matches changes
test_df = pd.DataFrame({"Sno": list(range(len(sentences))), "Text": sentences})

In [None]:
# take the passage stored at row label 5 as the running example and show it
example_text = test_df.loc[5, 'Text']
print(example_text)

 The majority of material created for previous games , such as the <unk> system and the design of maps , was carried over . Alongside this , improvements were made to the game 's graphics and some elements were expanded , such as map layouts , mission structure , and the number of playable units per mission . A part of this upgrade involved creating unique polygon models for each character 's body . In order to achieve this , the cooperative elements incorporated into the second game were removed , as they took up a large portion of memory space needed for the improvements . They also adjusted the difficulty settings and ease of play so they could appeal to new players while retaining the essential components of the series ' gameplay . The newer systems were decided upon early in development . The character designs were done by <unk> Honjou , who had worked on the previous Valkyria Chronicles games . When creating the Nameless Squad , Honjou was faced with the same problem he had had d

In [None]:
# build an embedder for the example passage, backed by the word2vec vectors
c = Create_embedding(text=example_text, path_to_model=wv)

In [None]:
# call the word2vec method: it uses the model given to the constructor (wv)
# and returns one vector for the whole passage
sent_emb = c.word_2_vec()

In [None]:
# preview the first 50 components of the passage embedding
sent_emb[:50]

array([ 0.04658379,  0.04597364,  0.03893401,  0.05313334, -0.0451308 ,
       -0.03555235,  0.02645404, -0.07672491,  0.04935056,  0.04654928,
       -0.01989292, -0.08623394, -0.00842675,  0.03092166, -0.06884254,
       -0.00228381,  0.01326396,  0.06409685, -0.01029554, -0.04073678,
       -0.0405631 ,  0.03694576, -0.05718748, -0.00582404,  0.03121588,
       -0.02216141, -0.07861641,  0.08075499,  0.00866464, -0.01616759,
       -0.03417229, -0.02988266, -0.00700652,  0.02845157,  0.03449544,
       -0.00767024,  0.02535533, -0.02963617,  0.04576784,  0.03250948,
        0.0963238 ,  0.00183121,  0.06422134,  0.03469758,  0.00072397,
       -0.06504219, -0.03479271,  0.01023298,  0.03527363,  0.02942501],
      dtype=float32)

## Transformer model

In [None]:
# sentence-transformers model used by the pre-loaded-model example below
model = SentenceTransformer(model_name_or_path='bert-base-uncased')

In [None]:
## with pre loaded model: the SentenceTransformer instance is handed to the constructor
c = Create_embedding(text=example_text, path_to_model=model)
c.transformers()[:50]

array([ 0.11990342, -0.07570083, -0.1506623 ,  0.01293704, -0.02102345,
       -0.1435671 ,  0.30146962, -0.09403347,  0.05861866, -0.26148862,
       -0.11637579, -0.00111108,  0.01206072,  0.12087645, -0.23695384,
       -0.16093141, -0.01133005,  0.10137323,  0.05568448,  0.12851872,
        0.15672764,  0.01832988,  0.10598295,  0.07400321,  0.10141755,
        0.22951035, -0.20914586, -0.10933466, -0.26989615,  0.07977872,
       -0.04152232, -0.1268066 , -0.04269395,  0.32250774, -0.1554826 ,
       -0.22229369,  0.04874475, -0.04411151, -0.3464809 , -0.0945673 ,
       -0.08143158, -0.0672645 ,  0.08026835, -0.07770447,  0.02715283,
       -0.09066958, -0.24938029,  0.04231898, -0.24726741,  0.22496547],
      dtype=float32)

In [None]:
## with model name: no model in the constructor, the method loads it by name
c = Create_embedding(text=example_text)
c.transformers(model='paraphrase-MiniLM-L6-v2')[:50]

array([[-0.21894754,  0.4953562 ,  0.44741312, ..., -0.1363442 ,
        -0.09861054,  0.19872642],
       [ 0.09742697,  0.45453203,  0.5956033 , ..., -0.2156922 ,
        -0.30407894,  0.38588253],
       [ 0.7213939 ,  0.27405986,  0.59295493, ...,  0.6663704 ,
         0.08657774,  0.19664077],
       [ 0.43469286,  0.02567542,  0.18255733, ..., -0.41889527,
         0.98770744, -0.12975271],
       [ 0.22393198,  0.16315515,  0.08766361, ..., -0.06896546,
         0.48545623,  0.0731098 ]], dtype=float32)