<img src="https://github.com/FIUBA-Posgrado-Inteligencia-Artificial/procesamiento_lenguaje_natural/raw/main/logoFIUBA.jpg" width="500" align="center">


# Procesamiento de lenguaje natural
## Vectorización


In [120]:
import math
import numpy as np
import pandas as pd

In [121]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Args:
        a: First 1-D numeric vector.
        b: Second 1-D numeric vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (‖a‖ · ‖b‖)``. When either vector has zero norm the
        ratio is undefined, so ``0.0`` is returned.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    # An all-zero vector would otherwise yield 0/0 -> NaN plus a RuntimeWarning,
    # and NaN scores cannot be ordered meaningfully afterwards.
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

### Datos

In [122]:
# Toy corpus: three short Spanish sentences, one document per array element.
corpus = np.array(['que dia es hoy', 'martes el dia de hoy es martes', 'martes muchas gracias'])

Documento 1 --> que dia es hoy \
Documento 2 --> martes el dia de hoy es martes \
Documento 3 --> martes muchas gracias

### 1 - Obtener el vocabulario del corpus (los términos utilizados)
- Cada documento transformarlo en una lista de términos
- Armar un vector de términos no repetidos de todos los documentos

In [123]:
def get_vocabulary(corpus: np.ndarray) -> list[str]:
    """Return the distinct terms of ``corpus`` in order of first appearance.

    Args:
        corpus: Documents to scan, one string per element. Each document is
            tokenized by splitting on single spaces.

    Returns:
        Every term that occurs in any document, without repetitions.
    """
    # dict keys keep insertion order, so this deduplicates in linear time while
    # preserving first-seen order (list membership tests would be quadratic).
    return list(dict.fromkeys(w for phrase in corpus for w in phrase.split(' ')))

In [124]:
# Build the vocabulary once from the sample corpus.
vocabulary = get_vocabulary(corpus)

In [125]:
# Show the extracted terms.
print(vocabulary)

['que', 'dia', 'es', 'hoy', 'martes', 'el', 'de', 'muchas', 'gracias']


### 2- OneHot encoding
Dada una lista de textos, devolver una matriz con la representación oneHotEncoding de estos

In [126]:
def one_hot_encoding(vocabulary: list[str], corpus: np.ndarray) -> list[np.ndarray]:
    """Encode each document as a binary presence vector over ``vocabulary``.

    Args:
        vocabulary: Terms that define the vector columns, in column order.
        corpus: Documents to encode, one string per element. Each document is
            tokenized by splitting on single spaces.

    Returns:
        One float array per document, of length ``len(vocabulary)``, holding
        1 in the column of every term present in the document and 0 elsewhere.

    Raises:
        ValueError: If a document contains a term missing from ``vocabulary``.
    """
    # Map each term to its column once instead of scanning the list per token.
    # setdefault keeps the first position of a repeated term, like list.index.
    column_of = {}
    for position, term in enumerate(vocabulary):
        column_of.setdefault(term, position)

    result = []
    for phrase in corpus:
        phrase_encode = np.zeros(len(vocabulary))
        for w in phrase.split(' '):
            column = column_of.get(w)
            if column is None:
                raise ValueError(f"{w!r} is not in the vocabulary")
            phrase_encode[column] = 1
        result.append(phrase_encode)
    return result

In [127]:
# Display the one-hot vectors of the sample corpus.
one_hot_encoding(vocabulary, corpus)

[array([1., 1., 1., 1., 0., 0., 0., 0., 0.]),
 array([0., 1., 1., 1., 1., 1., 1., 0., 0.]),
 array([0., 0., 0., 0., 1., 0., 0., 1., 1.])]

### 3- Vectores de frecuencia
Dada una lista de textos, devolver una matriz con la representación de frecuencia de estos

In [128]:
def count_vectorizer(vocabulary: list[str], corpus: np.ndarray) -> list[np.ndarray]:
    """Encode each document as a vector of term counts over ``vocabulary``.

    Args:
        vocabulary: Terms that define the vector columns, in column order.
        corpus: Documents to encode, one string per element. Each document is
            tokenized by splitting on single spaces.

    Returns:
        One float array per document, of length ``len(vocabulary)``, where each
        column holds how many times that term occurs in the document.

    Raises:
        ValueError: If a document contains a term missing from ``vocabulary``.
    """
    # Map each term to its column once instead of scanning the list per token.
    # setdefault keeps the first position of a repeated term, like list.index.
    column_of = {}
    for position, term in enumerate(vocabulary):
        column_of.setdefault(term, position)

    result = []
    for phrase in corpus:
        phrase_encode = np.zeros(len(vocabulary))
        for w in phrase.split(' '):
            column = column_of.get(w)
            if column is None:
                raise ValueError(f"{w!r} is not in the vocabulary")
            phrase_encode[column] += 1
        result.append(phrase_encode)
    return result

In [129]:
# Display the term-count vectors of the sample corpus.
count_vectorizer(vocabulary, corpus)

[array([1., 1., 1., 1., 0., 0., 0., 0., 0.]),
 array([0., 1., 1., 1., 2., 1., 1., 0., 0.]),
 array([0., 0., 0., 0., 1., 0., 0., 1., 1.])]

### 4- TF-IDF
Dada una lista de textos, devolver una matriz con la representación TFIDF

In [130]:
def idf(vocabulary: list[str], corpus: np.ndarray) -> list[np.ndarray]:
    """Return the TF-IDF vector of every document in ``corpus``.

    Despite its name, the result is the full TF-IDF weighting: each raw term
    count is multiplied by ``log10(N / df)``, where ``N`` is the number of
    documents and ``df`` the number of documents containing the term.

    Args:
        vocabulary: Terms that define the vector columns, in column order.
        corpus: Documents to encode, one string per element.

    Returns:
        One float array per document, of length ``len(vocabulary)``.
    """
    term_counts = count_vectorizer(vocabulary, corpus)
    n_documents = len(corpus)

    # Document frequency: in how many documents each term appears at least once.
    document_frequency = np.zeros(len(vocabulary))
    for counts in term_counts:
        document_frequency += counts > 0

    # The IDF weight depends only on the term, so compute it once per column.
    # Terms absent from every document keep weight 0, avoiding a division by zero.
    weights = np.zeros(len(vocabulary))
    for idx, frequency in enumerate(document_frequency):
        if frequency > 0:
            weights[idx] = math.log10(n_documents / frequency)

    # Build fresh arrays rather than overwriting the count vectors in place.
    return [counts * weights for counts in term_counts]

In [131]:
# Display the TF-IDF vectors of the sample corpus.
idf(vocabulary, corpus)

[array([0.47712125, 0.17609126, 0.17609126, 0.17609126, 0.        ,
        0.        , 0.        , 0.        , 0.        ]),
 array([0.        , 0.17609126, 0.17609126, 0.17609126, 0.35218252,
        0.47712125, 0.47712125, 0.        , 0.        ]),
 array([0.        , 0.        , 0.        , 0.        , 0.17609126,
        0.        , 0.        , 0.47712125, 0.47712125])]

### 5 - Comparación de documentos
Realizar una función que reciba el corpus y el índice de un documento y devuelva los documentos ordenados por la similitud coseno

(Le agregué la variable opcional `method` para elegir qué método usar al compararlos)

In [132]:
def _compare_phrases(corpus: np.array, vocabulary: list[str], idx: int, method='idf') -> list[str, float]:
    comparisons = []
    match method:
        case 'idf':
            results = idf(vocabulary, corpus)
        case 'one_hot':
            results = one_hot_encoding(vocabulary, corpus)
        case 'count':
            results = count_vectorizer(vocabulary, corpus)
    phrase_to_compare = results[idx]
    for r_idx, r in enumerate(results):
        comparisons.append((corpus[r_idx], cosine_similarity(phrase_to_compare, r)))
    return comparisons

In [133]:
def compare_phrases(corpus: np.ndarray, idx: int, method='idf') -> pd.DataFrame:
    """Rank the documents of ``corpus`` by similarity to the document at ``idx``.

    Args:
        corpus: Documents to compare, one string per element.
        idx: Position in ``corpus`` of the reference document.
        method: Vectorization forwarded to ``_compare_phrases``
            (``'idf'``, ``'one_hot'`` or ``'count'``).

    Returns:
        A DataFrame with columns ``document`` and ``cosine_similarity``, sorted
        by descending similarity. The index keeps each document's original
        position in ``corpus``.
    """
    vocabulary = get_vocabulary(corpus)
    comparisons = _compare_phrases(corpus, vocabulary, idx, method)
    df = pd.DataFrame(comparisons, columns=['document', 'cosine_similarity'])
    return df.sort_values('cosine_similarity', ascending=False)

In [134]:
# Rank all documents against document 0 using TF-IDF vectors.
compare_phrases(corpus, 0, 'idf')

Unnamed: 0,document,cosine_similarity
0,que dia es hoy,1.0
1,martes el dia de hoy es martes,0.200342
2,martes muchas gracias,0.0


In [135]:
# Rank all documents against document 1 using one-hot vectors.
compare_phrases(corpus, 1, 'one_hot')

Unnamed: 0,document,cosine_similarity
1,martes el dia de hoy es martes,1.0
0,que dia es hoy,0.612372
2,martes muchas gracias,0.235702


In [136]:
# Rank all documents against document 2 using term-count vectors.
compare_phrases(corpus, 2, 'count')

Unnamed: 0,document,cosine_similarity
2,martes muchas gracias,1.0
1,martes el dia de hoy es martes,0.3849
0,que dia es hoy,0.0
