# Vectorización de texto

# MODELO 1.

In [1]:
#  TOPIC MODELING
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import string
from nltk.corpus import stopwords
import json
import glob
import re

def load_data(file):
    """Read a UTF-8 encoded CSV file into a pandas DataFrame.

    Args:
        file: Path of the CSV file to read.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` for that file.
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        frame = pd.read_csv(handle)
    return frame

def write_data(file, data):
    """Serialise *data* as JSON with a 4-space indent into a UTF-8 file.

    Args:
        file: Destination path; an existing file is overwritten.
        data: Any JSON-serialisable object.
    """
    encoder = json.JSONEncoder(indent=4)
    with open(file, mode="w", encoding="utf-8") as sink:
        # Stream the encoded chunks straight to disk, one write per chunk.
        for chunk in encoder.iterencode(data):
            sink.write(chunk)


In [2]:
def remove_stops(text, stops):
    """Normalise a title string and drop its stop words.

    Steps, in order:
      1. Strip upper-case Roman-numeral enumerators (I .. XXXIX) that are
         followed by one of ``- . )`` or a space, e.g. ``"IV- "`` or ``"XI. "``.
      2. Remove every character that is not a word character, whitespace or
         a hyphen, then turn hyphens into spaces.
      3. Lower-case, split on whitespace and drop words found in *stops*.
      4. Remove ASCII punctuation (in practice only ``_`` survives step 2)
         and all digits.
      5. Collapse whitespace runs and trim both ends.

    Args:
        text: The raw title.
        stops: Container of lower-case stop words; anything supporting ``in``
            works, a ``set`` gives constant-time lookups.

    Returns:
        The cleaned, lower-case string with single spaces between words.
    """
    # The numeral must start at a word boundary; otherwise the final letters
    # of ordinary words ("TORAX ", "DNI ") would be eaten together with the
    # separator, gluing two words together. The look-ahead guarantees the
    # numeral part is non-empty, and X{0,3} extends coverage past "X" so that
    # "XI." is removed as a whole instead of leaving a stray "X".
    text = re.sub(r'\b(?=[IVX])X{0,3}(?:IX|IV|V?I{0,3})[-.) ]+', '', text)
    text = re.sub(r'[^\w\s -]', '', text)
    text = text.replace('-', ' ').lower()

    kept = " ".join(word for word in text.split() if word not in stops)
    kept = kept.translate(str.maketrans("", "", string.punctuation))
    kept = "".join(char for char in kept if not char.isdigit())

    # Deleting digits/punctuation can leave doubled, leading or trailing
    # spaces; split/join normalises all of them in one pass.
    return " ".join(kept.split())

def clean_docs(docs):
    """Clean every document in *docs* with ``remove_stops``.

    Args:
        docs: Iterable of raw title strings.

    Returns:
        A list with one cleaned string per input document, in input order.
    """
    # Build the stop-word set once: each membership test inside remove_stops
    # is then a hash lookup instead of a scan over the whole NLTK list.
    stops = set(stopwords.words("spanish"))
    return [remove_stops(doc, stops) for doc in docs]

In [3]:
# Load the titles CSV into a DataFrame; the 'titulo' column is the text used below.
texto = load_data("titulosEncontrados.csv")

# print (descriptions[0])

# Clean every entry of the 'titulo' column with clean_docs; the result is a
# list of strings, one per row.
cleaned_docs = clean_docs(texto['titulo'])
# print (cleaned_docs[0])


In [4]:
# Spanish stop words are also handed to the vectorizer so they are excluded
# when the n-grams are built.
stops = stopwords.words("spanish")

# TF-IDF over 1- to 3-grams. The documents were already lower-cased by
# remove_stops, hence lowercase=False. Terms must appear in at least 50
# documents (min_df) and in at most 99% of them (max_df).
vectorizer = TfidfVectorizer(
    lowercase=False,
    use_idf=True,
    max_features=20000,
    max_df=0.99,
    min_df=50,
    ngram_range=(1, 3),
    stop_words=stops,
)

vectors = vectorizer.fit_transform(cleaned_docs)

# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement and fall back only on releases that predate it. tolist()
# keeps feature_names a plain list of Python strings, as before.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

dense = vectors.todense()
denselist = dense.tolist()

# For every document, collect the vocabulary terms with a non-zero TF-IDF
# weight, in vocabulary order.
all_keywords = [
    [term for term, weight in zip(feature_names, description) if weight > 0]
    for description in denselist
]


In [5]:
# Show the per-document keyword lists; documents with no retained term give [].
print (all_keywords)

[[], ['antecedentes', 'antecedentes interes', 'interes', 'interes medico', 'interes medico legal', 'legal', 'medico', 'medico legal'], ['actor', 'examen', 'examen fisico', 'fisico'], ['consideraciones', 'consideraciones medico', 'consideraciones medico legales', 'legales', 'medico', 'medico legales'], ['conclusiones'], ['incapacidad'], ['contestacion', 'contestacion puntos', 'contestacion puntos pericia', 'pericia', 'puntos', 'puntos pericia'], ['petitorio'], [], ['antecedentes', 'antecedentes interes', 'autos', 'interes', 'interes medico', 'medico'], ['examen', 'perito'], ['consideraciones'], ['contestacion', 'contestacion puntos', 'periciales', 'puntos', 'puntos periciales'], ['bibliografia'], ['conclusiones'], ['petitorio'], ['clinica', 'historia', 'historia clinica'], ['periciales', 'puntos', 'puntos periciales'], ['estudios', 'solicitados'], ['conclusiones', 'conclusiones medico', 'conclusiones medico legales', 'consideraciones', 'legales', 'medico', 'medico legales'], [], ['clini

In [6]:
# Pairwise cosine distance between all TF-IDF rows, computed as
# 1 - cosine similarity.
# NOTE(review): a document with no retained terms is an all-zero row, so its
# distance is 1 to every row including itself (see the first row of the
# output below) -- confirm this is acceptable for downstream use.
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(vectors)

In [7]:
# Display the distance matrix (notebook cell output).
dist

array([[ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00, ...,
         1.00000000e+00,  1.00000000e+00,  1.00000000e+00],
       [ 1.00000000e+00,  0.00000000e+00,  1.00000000e+00, ...,
         1.00000000e+00,  1.00000000e+00,  9.26062696e-01],
       [ 1.00000000e+00,  1.00000000e+00, -2.22044605e-16, ...,
         1.00000000e+00,  1.00000000e+00,  1.00000000e+00],
       ...,
       [ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00, ...,
         0.00000000e+00,  1.00000000e+00,  1.00000000e+00],
       [ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00, ...,
         1.00000000e+00, -2.22044605e-16,  1.00000000e+00],
       [ 1.00000000e+00,  9.26062696e-01,  1.00000000e+00, ...,
         1.00000000e+00,  1.00000000e+00, -2.22044605e-16]])

In [8]:
# Number of clusters to fit.
true_k = 20

# NOTE: no random_state is passed and n_init=1, so the clustering (and the
# report written below) can differ from one run to the next.
model = KMeans(n_clusters=true_k, init="k-means++", max_iter=100, n_init=1)

model.fit(vectors)

# For each centroid, the feature indices sorted by descending weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement and fall back only on releases that predate it. tolist()
# keeps terms a plain list of Python strings, as before.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out().tolist()
else:
    terms = vectorizer.get_feature_names()

# Write the ten highest-weighted terms of every cluster, one term per line,
# with two blank lines after each cluster.
with open("clusters_results.txt", "w", encoding="utf-8") as f:
    for i in range(true_k):
        f.write(f"Cluster {i}\n")
        for ind in order_centroids[i, :10]:
            f.write(f" {terms[ind]}\n")
        f.write("\n\n")

In [9]:
# Display the TF-IDF vocabulary (notebook cell output).
terms

['actor',
 'actora',
 'alicia',
 'alicia victoria',
 'analisis',
 'antecedentes',
 'antecedentes autos',
 'antecedentes autos interes',
 'antecedentes interes',
 'art',
 'autos',
 'autos interes',
 'autos interes medico',
 'bibliografia',
 'clinica',
 'clinica actor',
 'columna',
 'complementarios',
 'conclusiones',
 'conclusiones medico',
 'conclusiones medico legales',
 'consideraciones',
 'consideraciones medico',
 'consideraciones medico legales',
 'consideraciones medicolegales',
 'contesta',
 'contesta puntos',
 'contesta puntos pericia',
 'contestacion',
 'contestacion puntos',
 'contestacion puntos pericia',
 'datos',
 'datos personales',
 'diagnostico',
 'estudios',
 'estudios complementarios',
 'examen',
 'examen actor',
 'examen fisico',
 'fisico',
 'hechos',
 'historia',
 'historia clinica',
 'historia clinica actor',
 'identificacion',
 'incapacidad',
 'integral',
 'integral problematica',
 'interes',
 'interes medico',
 'interes medico legal',
 'legal',
 'legales',
 'medi

In [10]:
# Display, per cluster, the feature indices ordered by descending centroid weight.
order_centroids

array([[13, 75, 27, ..., 53, 54,  0],
       [36,  0, 34, ..., 23, 22, 75],
       [22, 23, 52, ..., 50, 51,  0],
       ...,
       [55, 51, 54, ..., 52, 53, 37],
       [40,  5,  0, ..., 53, 54, 37],
       [25, 27, 26, ..., 50, 51,  0]], dtype=int64)

## CountVectorizer

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# NOTE(review): `text` is not used by the statements below (the vectorizer is
# fitted on cleaned_docs); it is kept in case later cells rely on it.
text = load_data("titulosEncontrados.csv")['titulo']

# Raw term counts restricted to the 25 most frequent terms of the corpus.
coun_vect = CountVectorizer(max_features=25)
count_matrix = coun_vect.fit_transform(cleaned_docs)
count_array = count_matrix.toarray()

# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement and fall back only on releases that predate it.
if hasattr(coun_vect, "get_feature_names_out"):
    count_columns = coun_vect.get_feature_names_out().tolist()
else:
    count_columns = coun_vect.get_feature_names()

# One row per document, one column per retained term.
dfCountVec = pd.DataFrame(data=count_array, columns=count_columns)
dfCountVec

Unnamed: 0,actor,actora,antecedentes,autos,clinica,complementarios,conclusiones,consideraciones,contestacion,datos,...,legal,legales,medico,medicolegales,parte,pericia,personales,petitorio,puntos,solicitados
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,1,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5771,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5772,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5773,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5774,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [12]:
# Show the term -> column-index mapping learned by the CountVectorizer.
print(coun_vect.vocabulary_)

{'antecedentes': 2, 'interes': 14, 'medico': 17, 'legal': 15, 'examen': 11, 'actor': 0, 'consideraciones': 7, 'legales': 16, 'conclusiones': 6, 'contestacion': 8, 'puntos': 23, 'pericia': 20, 'petitorio': 22, 'autos': 3, 'historia': 13, 'clinica': 4, 'estudios': 10, 'solicitados': 24, 'datos': 9, 'personales': 21, 'hechos': 12, 'complementarios': 5, 'actora': 1, 'medicolegales': 18, 'parte': 19}


In [13]:
# Print the retained terms in column order. get_feature_names() was removed
# in scikit-learn 1.2, so use its replacement when available; tolist() keeps
# the printed value a plain list of Python strings.
if hasattr(coun_vect, "get_feature_names_out"):
    print(coun_vect.get_feature_names_out().tolist())
else:
    print(coun_vect.get_feature_names())

['actor', 'actora', 'antecedentes', 'autos', 'clinica', 'complementarios', 'conclusiones', 'consideraciones', 'contestacion', 'datos', 'estudios', 'examen', 'hechos', 'historia', 'interes', 'legal', 'legales', 'medico', 'medicolegales', 'parte', 'pericia', 'personales', 'petitorio', 'puntos', 'solicitados']
