<img src="https://github.com/FIUBA-Posgrado-Inteligencia-Artificial/procesamiento_lenguaje_natural/raw/main/logoFIUBA.jpg" width="500" align="center">


# Procesamiento de lenguaje natural
## TP 1 - Vectorización


#### Federico Otero <br>fede.e.otero@gmail.com<br>Cohorte 11

In [1]:
import numpy as np

### Datos

In [2]:
# Toy corpus: three short Spanish sentences, one document per array element.
corpus = np.array(['que dia es hoy', 'martes el dia de hoy es martes', 'martes muchas gracias'])

Documento 1 --> que dia es hoy \
Documento 2 --> martes el dia de hoy es martes \
Documento 3 --> martes muchas gracias

### 1 - Obtener el vocabulario del corpus (los términos utilizados)
- Cada documento transformarlo en una lista de términos
- Armar un vector de términos no repetidos de todos los documentos

In [3]:
def get_vocabulary(corpus):
    """Tokenize every document and collect the corpus vocabulary.

    Args:
        corpus: Iterable of strings (for example a 1-D NumPy array of str),
            one document per element.

    Returns:
        A ``(vocabulary, docs)`` tuple. ``vocabulary`` is the alphabetically
        sorted list of distinct terms found in the corpus. ``docs`` maps each
        document's position in ``corpus`` to its list of terms, in their
        original order and with repetitions kept.
    """
    vocabulary = set()
    docs = {}
    for i, document in enumerate(corpus):
        # Split on single spaces only: consecutive spaces produce '' terms.
        terms = document.split(' ')
        docs[i] = terms
        vocabulary.update(terms)
    # Sort once, after the loop. Doing it here (instead of per document) also
    # makes an empty corpus return ([], {}) rather than fail on an unset name.
    return sorted(vocabulary), docs

In [4]:
# Build the module-level vocabulary and tokenized documents; the bare `voc`
# expression displays the vocabulary as the cell output.
voc, docs = get_vocabulary(corpus)
voc

['de', 'dia', 'el', 'es', 'gracias', 'hoy', 'martes', 'muchas', 'que']

In [5]:
# Display the index -> term-list mapping returned by get_vocabulary.
docs

{0: ['que', 'dia', 'es', 'hoy'],
 1: ['martes', 'el', 'dia', 'de', 'hoy', 'es', 'martes'],
 2: ['martes', 'muchas', 'gracias']}

### 2- OneHot encoding
Dada una lista de textos, devolver una matriz con la representación oneHotEncoding de estos

In [6]:
def one_hot_encoding_matrix(corpus):
    """Build the binary document-term (one-hot) matrix of a corpus.

    Args:
        corpus: Sequence of strings, one document per element; it is passed
            unchanged to ``get_vocabulary``.

    Returns:
        Float NumPy array of shape ``(n_docs, n_terms)``. Entry ``[i, j]`` is
        1 when the j-th term of the vocabulary returned by
        ``get_vocabulary`` occurs at least once in document ``i``, else 0.
    """
    voc, docs = get_vocabulary(corpus)
    # Map each term to its column once, so every document only touches the
    # columns of the terms it actually contains.
    column_of = {term: j for j, term in enumerate(voc)}
    # len(docs) equals the number of documents, so plain lists work as well
    # as NumPy arrays.
    matrix = np.zeros((len(docs), len(voc)))
    for i, terms in docs.items():
        # Repetitions are irrelevant for presence/absence encoding.
        for term in set(terms):
            matrix[i, column_of[term]] = 1
    return matrix

In [7]:
# Rebind `corpus` to the same three documents used in the data section.
corpus = np.array(['que dia es hoy', 'martes el dia de hoy es martes', 'martes muchas gracias'])

In [8]:
# One-hot (presence/absence) matrix of the corpus.
ohe = one_hot_encoding_matrix(corpus)

In [9]:
# Report the one-hot matrix next to the vocabulary. `voc` is the
# module-level vocabulary computed earlier, not a value returned by
# one_hot_encoding_matrix. The 84-dash lines are visual separators.
print(f'Vocabulary {voc}')
print('-'*84)
print(f'One Hot Encoding Matrix: \n{ohe}')
print('-'*84)

Vocabulary ['de', 'dia', 'el', 'es', 'gracias', 'hoy', 'martes', 'muchas', 'que']
------------------------------------------------------------------------------------
One Hot Encoding Matrix: 
[[0. 1. 0. 1. 0. 1. 0. 0. 1.]
 [1. 1. 1. 1. 0. 1. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 1. 1. 0.]]
------------------------------------------------------------------------------------


### 3- Vectores de frecuencia
Dada una lista de textos, devolver una matriz con la representación de frecuencia de estos

In [10]:
def frequencies_matrix(corpus):
    """Count how many times each vocabulary term occurs in each document.

    Args:
        corpus: 1-D NumPy array of strings, one document per element.

    Returns:
        Float NumPy array of shape ``(n_docs, n_terms)`` whose entry
        ``[i, j]`` is the number of occurrences of the j-th vocabulary term
        (in the order returned by ``get_vocabulary``) in document ``i``.
    """
    vocabulary, tokenized = get_vocabulary(corpus)
    shape = (corpus.shape[0], len(vocabulary))
    counts = np.zeros(shape)
    for doc_idx in range(shape[0]):
        terms = tokenized[doc_idx]
        for term_idx, term in enumerate(vocabulary):
            # list.count gives the raw term frequency inside this document.
            counts[doc_idx, term_idx] += terms.count(term)
    return counts

In [11]:
# Raw term-count matrix of the corpus.
freq_matrix = frequencies_matrix(corpus)

In [12]:
# Report the frequency matrix next to the module-level vocabulary `voc`;
# the 84-dash lines are visual separators.
print(f'Vocabulary {voc}')
print('-'*84)
print(f'Frequency Matrix: \n{freq_matrix}')
print('-'*84)

Vocabulary ['de', 'dia', 'el', 'es', 'gracias', 'hoy', 'martes', 'muchas', 'que']
------------------------------------------------------------------------------------
Frequency Matrix: 
[[0. 1. 0. 1. 0. 1. 0. 0. 1.]
 [1. 1. 1. 1. 0. 1. 2. 0. 0.]
 [0. 0. 0. 0. 1. 0. 1. 1. 0.]]
------------------------------------------------------------------------------------


### 4- TF-IDF
Dada una lista de textos, devolver una matriz con la representación TF-IDF

In [13]:
def get_idf_vector(corpus):
    """Compute the inverse document frequency (IDF) of every vocabulary term.

    The IDF of a term is ``log10(n_docs / df)``, where ``df`` is the number
    of documents that contain the term at least once.

    Args:
        corpus: Sequence of strings, one document per element; it is passed
            unchanged to ``get_vocabulary``.

    Returns:
        1-D float NumPy array with one IDF value per term, aligned with the
        vocabulary order returned by ``get_vocabulary``.
    """
    voc, docs = get_vocabulary(corpus)
    n_docs = len(docs)
    # One set per document makes each membership test constant-time.
    doc_term_sets = [set(terms) for terms in docs.values()]

    idf_vector = np.zeros(len(voc))
    for i, term in enumerate(voc):
        df = sum(term in terms for terms in doc_term_sets)
        if df > 0:
            # Pass a plain float so np.log10 returns a scalar; storing a
            # one-element array into a single slot is deprecated in NumPy.
            idf_vector[i] = np.log10(n_docs / df)
    return idf_vector

In [14]:
# Per-term IDF values of the corpus.
idf_vector = get_idf_vector(corpus)

In [15]:
# Report the IDF vector next to the module-level vocabulary `voc`;
# the 84-dash lines are visual separators.
print(f'Vocabulary {voc}')
print('-'*84)
print(f'IDF Vector: \n\n{idf_vector}')
print('-'*84)

Vocabulary ['de', 'dia', 'el', 'es', 'gracias', 'hoy', 'martes', 'muchas', 'que']
------------------------------------------------------------------------------------
IDF Vector: 

[0.47712125 0.17609126 0.47712125 0.17609126 0.47712125 0.17609126
 0.17609126 0.47712125 0.47712125]
------------------------------------------------------------------------------------


In [16]:
def get_tf_idf_matrix(corpus):
    """Return the TF-IDF matrix of a corpus.

    Args:
        corpus: 1-D NumPy array of strings, one document per element.

    Returns:
        NumPy array with the shape of ``frequencies_matrix(corpus)``: every
        term-count column is scaled by that term's value from
        ``get_idf_vector(corpus)``.
    """
    term_counts = frequencies_matrix(corpus)
    # Lift the IDF vector to a single row so it broadcasts over all documents.
    idf_row = get_idf_vector(corpus)[np.newaxis, :]
    return term_counts * idf_row

In [17]:
# TF-IDF matrix of the corpus (term counts scaled by the IDF vector).
tf_idf_matrix = get_tf_idf_matrix(corpus)

In [18]:
# Full report: vocabulary, tokenized documents, term counts, IDF vector and
# the resulting TF-IDF matrix. Every name printed here is a module-level
# variable assigned in earlier cells; 84-dash lines are visual separators.
print(f'Vocabulary {voc}')
print('-'*84)
print(f'Documents {docs}')
print('-'*84)
print(f'Freq Matrix: \n\n{freq_matrix}')
print('-'*84)
print(f'IDF Vector: \n\n{idf_vector}')
print('-'*84)
print(f'TF-IDF Matrix: \n\n{tf_idf_matrix}')
print('-'*84)

Vocabulary ['de', 'dia', 'el', 'es', 'gracias', 'hoy', 'martes', 'muchas', 'que']
------------------------------------------------------------------------------------
Documents {0: ['que', 'dia', 'es', 'hoy'], 1: ['martes', 'el', 'dia', 'de', 'hoy', 'es', 'martes'], 2: ['martes', 'muchas', 'gracias']}
------------------------------------------------------------------------------------
Freq Matrix: 

[[0. 1. 0. 1. 0. 1. 0. 0. 1.]
 [1. 1. 1. 1. 0. 1. 2. 0. 0.]
 [0. 0. 0. 0. 1. 0. 1. 1. 0.]]
------------------------------------------------------------------------------------
IDF Vector: 

[0.47712125 0.17609126 0.47712125 0.17609126 0.47712125 0.17609126
 0.17609126 0.47712125 0.47712125]
------------------------------------------------------------------------------------
TF-IDF Matrix: 

[[0.         0.17609126 0.         0.17609126 0.         0.17609126
  0.         0.         0.47712125]
 [0.47712125 0.17609126 0.47712125 0.17609126 0.         0.17609126
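  0.35218252 0.         0.        ]
 [0.         0.         0.         0.         0.47712125 0.
  0.17609126 0.47712125 0.        ]]
------------------------------------------------------------------------------------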

### 5 - Comparación de documentos
Realizar una función que reciba el corpus y el índice de un documento y devuelva los documentos ordenados por la similitud coseno

In [22]:
def cosine_similarity(corpus, _id):
    """Rank the other documents of a corpus by cosine similarity to one.

    Similarity is measured between rows of the TF-IDF matrix returned by
    ``get_tf_idf_matrix``.

    Args:
        corpus: 1-D NumPy array of strings, one document per element.
        _id: Index of the reference document inside ``corpus``.

    Returns:
        List with every document except the reference one, each as its list
        of terms, ordered from most to least similar. Documents with equal
        similarity keep their corpus order.
    """
    _, docs = get_vocabulary(corpus)
    tf_idf = get_tf_idf_matrix(corpus)
    reference = tf_idf[_id, :]
    reference_norm = np.linalg.norm(reference)

    cosines = {}
    for i in range(tf_idf.shape[0]):
        if i == _id:
            continue
        candidate = tf_idf[i, :]
        denominator = reference_norm * np.linalg.norm(candidate)
        # An all-zero TF-IDF row has no direction: score it 0 instead of
        # producing nan from 0/0, which would make the ordering meaningless.
        if denominator > 0:
            cosines[i] = np.dot(reference, candidate) / denominator
        else:
            cosines[i] = 0.0

    # Order the document indexes by their similarity value, not by index.
    ranked = sorted(cosines, key=cosines.get, reverse=True)
    return [docs[i] for i in ranked]

In [23]:
# Documents compared against the document at index 1 of the corpus.
similar_docs = cosine_similarity(corpus,1)

In [20]:
# Recompute the module-level `voc` and `docs` that the report below prints.
voc, docs = get_vocabulary(corpus)

In [26]:
# Report the vocabulary, tokenized documents and the module-level TF-IDF
# matrix, then list the documents returned by cosine_similarity.
# docs[1] is the reference document (index 1 was passed to cosine_similarity).
print(f'Vocabulary {voc}')
print('-'*84)
print(f'Documents {docs}')
print('-'*84)
print(f'TF-IDF Matrix: \n\n{tf_idf_matrix}')
print('-'*84)
print(f'Most similar documents to {docs[1]}:')
for sd in similar_docs:
    print(sd)

Vocabulary ['de', 'dia', 'el', 'es', 'gracias', 'hoy', 'martes', 'muchas', 'que']
------------------------------------------------------------------------------------
Documents {0: ['que', 'dia', 'es', 'hoy'], 1: ['martes', 'el', 'dia', 'de', 'hoy', 'es', 'martes'], 2: ['martes', 'muchas', 'gracias']}
------------------------------------------------------------------------------------
TF-IDF Matrix: 

[[0.         0.17609126 0.         0.17609126 0.         0.17609126
  0.         0.         0.47712125]
 [0.47712125 0.17609126 0.47712125 0.17609126 0.         0.17609126
  0.35218252 0.         0.        ]
 [0.         0.         0.         0.         0.47712125 0.
  0.17609126 0.47712125 0.        ]]
------------------------------------------------------------------------------------
Most similar documents to ['martes', 'el', 'dia', 'de', 'hoy', 'es', 'martes']:
['que', 'dia', 'es', 'hoy']
['martes', 'muchas', 'gracias']
