<img src="https://github.com/FIUBA-Posgrado-Inteligencia-Artificial/procesamiento_lenguaje_natural/raw/main/logoFIUBA.jpg" width="500" align="center">


# Procesamiento de lenguaje natural
## Vectorización


In [88]:
import numpy as np
import pandas as pd


In [89]:
def cosine_similarity(a, b):
    """Return the cosine similarity between the vectors ``a`` and ``b``.

    The result is ``dot(a, b) / (||a|| * ||b||)``.

    Args:
        a: 1-D array-like of numbers.
        b: 1-D array-like of numbers with the same length as ``a``.

    Returns:
        The cosine of the angle between both vectors, or ``0.0`` when either
        vector has zero norm (the similarity is undefined in that case).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    # A zero vector would produce 0/0 -> nan plus a RuntimeWarning; report
    # "no similarity" instead so callers can sort the results safely.
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

### Datos

In [90]:
# Toy corpora used throughout the notebook: one whitespace-tokenisable
# sentence per document.
corpus = np.array([
    'que dia es hoy',
    'martes el dia de hoy es martes',
    'martes muchas gracias',
])
corpus_2 = np.array([
    'el dia de hoy fue agotador ',
    'hoy es un dia cansador y un dia agotador',
    'tengo ganas de comer una pizza',
    'el dia de hoy voy a descansar',
    'hoy es un dia de lluvia',
])

Documento 1 --> que dia es hoy \
Documento 2 --> martes el dia de hoy es martes \
Documento 3 --> martes muchas gracias

### 1 - Obtener el vocabulario del corpus (los términos utilizados)
- Transformar cada documento en una lista de términos
- Armar un vector de términos no repetidos de todos los documentos

In [91]:
def vocabulario(text_list):
    """Print and return the distinct terms of every document.

    Each document is split on whitespace and its tokens are de-duplicated.

    Args:
        text_list: sequence of strings, one per document.

    Returns:
        A list with one NumPy array per document, each holding that
        document's unique terms in sorted order.
    """
    terminos = []
    for i, documento in enumerate(text_list):
        # np.unique both removes repeated tokens and sorts them.
        terminos_i = np.unique(documento.split())
        terminos.append(terminos_i)
        print(f"Los términos del documento {i} son :{terminos_i}")
    return terminos

In [92]:
# Print and collect the per-document unique terms of corpus_2.
terminos = vocabulario(corpus_2)

Los términos del documento 0 son :['agotador' 'de' 'dia' 'el' 'fue' 'hoy']
Los términos del documento 1 son :['agotador' 'cansador' 'dia' 'es' 'hoy' 'un' 'y']
Los términos del documento 2 son :['comer' 'de' 'ganas' 'pizza' 'tengo' 'una']
Los términos del documento 3 son :['a' 'de' 'descansar' 'dia' 'el' 'hoy' 'voy']
Los términos del documento 4 son :['de' 'dia' 'es' 'hoy' 'lluvia' 'un']


In [93]:
def obtener_terminos_unicos(text_list):
    """Return the sorted vocabulary (unique terms) of a whole corpus.

    Args:
        text_list: sequence of strings, one per document. Documents are
            tokenised by splitting on whitespace.

    Returns:
        A 1-D NumPy string array with every distinct term of the corpus in
        sorted order. It is empty when the corpus has no documents or no
        tokens.
    """
    tokens = [token for documento in text_list for token in documento.split()]
    if not tokens:
        # np.unique([]) would yield a float64 array; keep a string dtype so
        # the result is always usable as a vocabulary.
        return np.array([], dtype=str)
    return np.unique(tokens)

In [94]:
# Build the vocabulary of corpus_2; the bare expression on the last line lets
# the notebook display the resulting array.
terminos_unicos = obtener_terminos_unicos(corpus_2)
terminos_unicos

array(['a', 'agotador', 'cansador', 'comer', 'de', 'descansar', 'dia',
       'el', 'es', 'fue', 'ganas', 'hoy', 'lluvia', 'pizza', 'tengo',
       'un', 'una', 'voy', 'y'], dtype='<U9')

### 2- OneHot encoding
Dada una lista de textos, devolver una matriz con la representación oneHotEncoding de estos

In [95]:
def one_hot_representation(text_list):
    """Build the one-hot (term presence) matrix of a corpus.

    Args:
        text_list: sequence of strings, one per document.

    Returns:
        A pandas DataFrame with one row per document and one column per
        vocabulary term; a cell is 1 when the term occurs in the document
        and 0 otherwise.
    """
    vocab = obtener_terminos_unicos(text_list)
    # Map every vocabulary term to its column position.
    column_of = dict(zip(vocab, range(len(vocab))))

    matrix = np.zeros((len(text_list), len(vocab))).astype(int)
    for row, document in enumerate(text_list):
        for token in document.split():
            column = column_of.get(token)
            if column is not None:
                matrix[row, column] = 1

    return pd.DataFrame(matrix, columns=vocab)

In [96]:
# One-hot matrix of corpus_2; the bare expression lets the notebook display it.
one_hot = one_hot_representation(corpus_2)
one_hot

Unnamed: 0,a,agotador,cansador,comer,de,descansar,dia,el,es,fue,ganas,hoy,lluvia,pizza,tengo,un,una,voy,y
0,0,1,0,0,1,0,1,1,0,1,0,1,0,0,0,0,0,0,0
1,0,1,1,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,1
2,0,0,0,1,1,0,0,0,0,0,1,0,0,1,1,0,1,0,0
3,1,0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,1,0
4,0,0,0,0,1,0,1,0,1,0,0,1,1,0,0,1,0,0,0


### 3- Vectores de frecuencia
Dada una lista de textos, devolver una matriz con la representación de frecuencia de estos

In [97]:
def frecuency_representation(text_list):
    """Build the term-frequency (raw count) matrix of a corpus.

    Args:
        text_list: sequence of strings, one per document.

    Returns:
        A pandas DataFrame with one row per document and one column per
        vocabulary term; each cell holds how many times the term occurs in
        the document.
    """
    vocab = obtener_terminos_unicos(text_list)
    # Column position of every vocabulary term.
    position = {term: idx for idx, term in enumerate(vocab)}

    counts = np.zeros((len(text_list), len(vocab))).astype(int)
    for row, document in enumerate(text_list):
        columns = [position[token] for token in document.split() if token in position]
        for col in columns:
            counts[row, col] += 1

    return pd.DataFrame(counts, columns=vocab)

In [98]:
# Term-frequency matrix of corpus_2; the bare expression lets the notebook
# display it.
frecuency_matriz = frecuency_representation(corpus_2)
frecuency_matriz

Unnamed: 0,a,agotador,cansador,comer,de,descansar,dia,el,es,fue,ganas,hoy,lluvia,pizza,tengo,un,una,voy,y
0,0,1,0,0,1,0,1,1,0,1,0,1,0,0,0,0,0,0,0
1,0,1,1,0,0,0,2,0,1,0,0,1,0,0,0,2,0,0,1
2,0,0,0,1,1,0,0,0,0,0,1,0,0,1,1,0,1,0,0
3,1,0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,1,0
4,0,0,0,0,1,0,1,0,1,0,0,1,1,0,0,1,0,0,0


### 4- TF-IDF
Dada una lista de textos, devolver una matriz con la representación TF-IDF de estos

In [99]:
def TF_IDF_representation(text_list):
    """Build the TF-IDF matrix of a corpus.

    TF is the raw term count per document and IDF is ``log10(N / DF)``,
    where ``N`` is the number of documents and ``DF`` the number of
    documents that contain the term.

    Args:
        text_list: sequence of strings, one per document.

    Returns:
        A pandas DataFrame with one row per document and one column per
        vocabulary term, holding ``TF * IDF``.
    """
    # The frequency matrix already carries the vocabulary as its columns, so
    # the corpus is tokenised only once.
    frecuency_matriz = frecuency_representation(text_list)

    n_docs = len(text_list)

    # Document frequency: in how many documents each term appears at least
    # once. Shaped (1, n_terms) so it broadcasts across the document rows.
    doc_freq = (frecuency_matriz > 0).sum(axis=0).to_numpy().reshape(1, -1)

    idf = np.log10(n_docs / doc_freq)

    return frecuency_matriz * idf

In [100]:
# TF-IDF matrix of corpus_2; the bare expression lets the notebook display it.
tf_idf =  TF_IDF_representation(corpus_2)
tf_idf

Unnamed: 0,a,agotador,cansador,comer,de,descansar,dia,el,es,fue,ganas,hoy,lluvia,pizza,tengo,un,una,voy,y
0,0.0,0.39794,0.0,0.0,0.09691,0.0,0.09691,0.39794,0.0,0.69897,0.0,0.09691,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.39794,0.69897,0.0,0.0,0.0,0.19382,0.0,0.39794,0.0,0.0,0.09691,0.0,0.0,0.0,0.79588,0.0,0.0,0.69897
2,0.0,0.0,0.0,0.69897,0.09691,0.0,0.0,0.0,0.0,0.0,0.69897,0.0,0.0,0.69897,0.69897,0.0,0.69897,0.0,0.0
3,0.69897,0.0,0.0,0.0,0.09691,0.69897,0.09691,0.39794,0.0,0.0,0.0,0.09691,0.0,0.0,0.0,0.0,0.0,0.69897,0.0
4,0.0,0.0,0.0,0.0,0.09691,0.0,0.09691,0.0,0.39794,0.0,0.0,0.09691,0.69897,0.0,0.0,0.39794,0.0,0.0,0.0


### 5 - Comparación de documentos
Realizar una función que reciba el corpus y el índice de un documento y devuelva los demás documentos ordenados de mayor a menor similitud coseno

In [101]:
def similarity_function(indice, text_list):
    """Rank the other documents of a corpus by similarity to one document.

    Every document is represented by its TF-IDF vector and compared with the
    selected document using cosine similarity.

    Args:
        indice: position of the reference document in ``text_list``.
            Negative positions count from the end.
        text_list: sequence of strings, one per document.

    Returns:
        A pandas DataFrame with columns ``cosine_similarity`` and
        ``oración``, one row per document other than the reference one,
        sorted from most to least similar. The row labels are the original
        document positions.
    """
    # One TF-IDF row per document, one column per vocabulary term.
    tf_idf = TF_IDF_representation(text_list).to_numpy()
    reference = tf_idf[indice]

    df = pd.DataFrame({
        'cosine_similarity': [cosine_similarity(reference, row) for row in tf_idf],
        'oración': list(text_list),
    })

    # Resolve the label through the index so that negative positions drop
    # the same row that tf_idf[indice] selected above.
    df = df.drop(df.index[indice])
    return df.sort_values(by='cosine_similarity', ascending=False)

In [102]:
# Rank the documents of corpus_2 against document 1; the bare expression lets
# the notebook display the result.
similarity = similarity_function(1,corpus_2)
similarity

Unnamed: 0,cosine_similarity,oración
4,0.392322,hoy es un dia de lluvia
0,0.145417,el dia de hoy fue agotador
3,0.0156,el dia de hoy voy a descansar
2,0.0,tengo ganas de comer una pizza
