# Vectorización de texto
### Creación de los dataframe y limpieza de texto

In [1]:
import pandas as pd
import numpy as np
from pprint import pprint

df = pd.read_csv('titulosEncontrados.csv', sep=',',  encoding='utf-8')
#convierte en string los datos de la columna text
df1 = df['titulo'].apply(str)

# Fase de limpieza

In [15]:
#Limpieza de caracteres no deseados
import nltk
import re

def general(txt: str, bert=False, nums=False) -> str:
    """
    Elimina caracteres no deseados
    Params:
        **txt**:texto a ser limpiado de caracteres no desaeados
    """
    if nums:
        txt = re.sub(r'\d+', ' ', txt)
    if not bert:
        txt = txt.translate(str.maketrans(
            'áéíóúýàèìòùÁÉÍÓÚÀÈÌÒÙÝ', 'aeiouyaeiouAEIOUAEIOUY'))
        txt = re.sub(r'[^\w\s:.)-]', '', txt)

    txt = txt.replace('\r', ' ').replace("\v", ' ').replace(
        "\t", ' ').replace("\f", ' ').replace("\a", ' ').replace("\b", ' ')
    txt = re.sub(' +', ' ', txt)
    txt = txt.strip()
    return txt

dfLimpio = df1.apply(general)


In [16]:
#limpieza de stopwords

stops = nltk.corpus.stopwords.words('spanish')
# for i in stops:
#     stops.append(i.upper())


def remove_stops(texto: str) -> str:
    """
    Función que elimina stopwords
    Params:
        **texto**:texto a ser limpiado de stopwords

    """
    texto = [
        i for i in texto.split() if i not in stops
    ]
    return ' '.join(texto)

dfLimpio = dfLimpio.apply(remove_stops)

In [4]:
dfLimpio[0]

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=1000,
                                 min_df=0.2, stop_words=stops,
                                 use_idf=True, ngram_range=(1,5))

tfidf_matrix = tfidf_vectorizer.fit_transform(dfLimpio) #fit the vectorizer to dfLimpio

print(tfidf_matrix.shape)

In [6]:
terms = tfidf_vectorizer.get_feature_names()

In [None]:
terms

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [9]:
print(dist)

[[ 0.00000000e+00  6.42030272e-01  5.96750987e-01 ...  6.96762526e-01
   8.79703226e-01  6.02108327e-01]
 [ 6.42030272e-01  0.00000000e+00  6.86735595e-01 ...  6.13320209e-01
   9.32547374e-01  5.45434337e-01]
 [ 5.96750987e-01  6.86735595e-01 -2.22044605e-16 ...  7.44581696e-01
   9.08757726e-01  5.54541288e-01]
 ...
 [ 6.96762526e-01  6.13320209e-01  7.44581696e-01 ...  1.11022302e-16
   8.50076491e-01  7.25697005e-01]
 [ 8.79703226e-01  9.32547374e-01  9.08757726e-01 ...  8.50076491e-01
  -2.22044605e-16  9.17020168e-01]
 [ 6.02108327e-01  5.45434337e-01  5.54541288e-01 ...  7.25697005e-01
   9.17020168e-01 -2.22044605e-16]]


# K-means clustering

In [10]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

In [11]:
from __future__ import print_function

print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print() #add whitespace
    print() #add whitespace
    
    print("Cluster %d titles:" % i, end='')
    for title in frame.ix[i]['title'].values.tolist():
        print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace
    
print()
print()

Top terms per cluster:

Cluster 0 words:

NameError: name 'vocab_frame' is not defined