#### Se cargan los datos al dataframe


In [2]:
import pandas as pd
import numpy as np
from pprint import pprint

df = pd.read_csv('pericias_medicas.csv', sep=',',  encoding='utf-8')
#convierte en string los datos de la columna text
df1 = df['text'].apply(str)

#### Funciones

In [3]:
import re
# Limpieza general
def general(txt: str, bert=False, nums=True) -> str:
    if nums:
        txt = re.sub(r'\d+', ' ', txt)
    if not bert:
        txt = txt.translate(str.maketrans(
            'áéíóúýàèìòùÁÉÍÓÚÀÈÌÒÙÝ', 'aeiouyaeiouAEIOUAEIOUY'))
        txt = re.sub(r'[^\w\s]', ' ', txt)
    txt = re.sub(' +', ' ', txt)
    txt = re.sub('_',' ',txt)
    txt = re.sub('__',' ',txt)
    txt = re.sub('___',' ',txt)
    txt = re.sub('____',' ',txt)
    txt = re.sub('_____',' ',txt)
    txt = re.sub('______',' ',txt)
    #txt = re.sub('aaaa',' ',txt)
    txt = re.sub('aaec',' ',txt)
    txt = re.sub('[a]+','a',txt)
    txt = re.sub('aaccuueerrddoo',' ',txt)
    txt = re.sub('aacs',' ',txt)
    txt = re.sub('aactora',' ',txt)
    txt = re.sub('aafp',' ',txt)
    txt = re.sub('aaoo',' ',txt)
    txt = re.sub('aaos',' ',txt)
    txt = re.sub('aaot',' ',txt)
    txt = re.sub('aaron',' ',txt)
    txt = re.sub('aarrtt',' ',txt)
    txt = re.sub('aartolome',' ',txt)
    txt = re.sub('aasseegguurraannddoolleess',' ',txt)
    txt = re.sub('abbott',' ',txt)
  
    txt = txt.strip()
    return txt

In [4]:
# Eliminar palabras cortas de titulos
def limpiar_palabras(text):
    text = re.sub('-',' ',text)
    text = re.sub(r'(I{1,3}|IV|V|VI{1,4}|IX|X)[). -]|[^\w\s]',' ',text)
    text=text.lower()
    text = [
        i for i in text.split() if len(i) > 3
    ]
    return ' '.join(text)

In [5]:
# Eliminar stowords
import nltk
import re

stops = nltk.corpus.stopwords.words('spanish')

def remove_stops(texto: str) -> str:
    """
    Función que elimina stopwords
    Params:
        **texto**:texto a ser limpiado de stopwords

    """
    #texto = limpiar_palabras(texto)
    texto = [
        i for i in texto.split() if i not in stops
    ]
    return ' '.join(texto)

#Limpieza del dataframe completo

In [6]:
# Limpieza de texto
# Primera etapa de la limpieza de texto
import re, string, unicodedata
'''
Se hace la eliminacion de texto entre corchetes, acentos, signos de puntuacion (excepto . y :), palabras con numeros.
Se eliminan los espacios de sobra
Se eliminan \r, \t, \v, \f, \a
'''
def limpiarTexto1(txt: str, bert=False, nums=False) -> str:
    """
    Elimina caracteres no deseados
    Params:
        **txt**:texto a ser limpiado de caracteres no desaeados
    """
    if nums:
        txt = re.sub(r'\d+', ' ', txt)
    if not bert:
        txt = txt.translate(str.maketrans(
            'áéíóúýàèìòùÁÉÍÓÚÀÈÌÒÙÝ', 'aeiouyaeiouAEIOUAEIOUY'))

    '''
    Eliminamos caracteres especiales: tabulador horizontal(\t), tabulador vertical(\v), 
    retorno de carro(\r), avance de pagina(\f), 
    caracter de retroceso: Marca el límite de una palabra(\b), 
    '''
    txt = txt.replace('\r', ' ').replace("\v", ' ').replace(
        "\t", ' ').replace("\f", ' ').replace("\a", ' ').replace("\b", ' ')
    txt = re.sub(' +', ' ', txt)
    txt = txt.strip()
    return txt
df1 = df1.apply(str)
df1 = df1.apply(limpiarTexto1)

In [7]:
# Segunda fase de limpieza
# Se eliminan todos los elementos que meten ruido al texto y que no fueron eliminados en la fase de limpieza 1.
import re

def limpiarTexto2(text):
    text = re.sub('^ ',' ',text)
    text = re.sub('\n +\n','\n',text)
    text = re.sub(' +\n\n','\n',text)
    text = re.sub('\n\n+','\n',text)
    text = re.sub(' \n','\n',text)
    text = re.sub('\d\n',' ',text)
    text = re.sub('\x0c',' ',text)
    text = re.sub('\u200b\n',' ',text)
    text = re.sub('\d{3,100}',' ',text)
    text = re.sub('\d+-\d+',' ',text)
    text = re.sub('\x0c',' ',text)
    text = re.sub('[nN]º|[nN][. ]º',' ',text)
    text = re.sub('[º<>/]',' ',text)
    text = re.sub('\d{3,100}',' ',text)
    text = re.sub('[a-zA-z-.]+@[a-zA-Z]+.com',' ',text)
    return text

df1 = df1.apply(limpiarTexto2)

In [8]:
# Tercera fase de limpieza
# Eliminamos las lineas que no son de utilidad para el analisis o que van a afectar los resultados del mismo.
# Ejemplo de linea eliminada: las lineas que comienzan con "Se encuentra contestada en....."
import re

def limpiarTexto3(text):
    text = re.sub('[a-z1-9.]+[).-] [s|S]e encuentra contestad[a|o] .+[. \n]',' ',text)
    text = re.sub('[0-9]+[. ]+[yY]a fue contestado.+[.\n]',' ',text)
    text = re.sub('[fF]oja [1-9].+\n', ' ', text)
    text = re.sub('[pP]regunta[ 0-9]+[)].+\n|[rR]espuesta[ 0-9]+[)].+\n',' ',text)#elimina oraciones comenzadas en preguta/respuesta.
    text = re.sub('V[. ]+[S\n\.]+', ' ', text)
    #text = re.sub('[IV]+.[A-Z]{1,3}[\n.]', '', text)
    text = re.sub('[I][.][P][.]',' ',text)
    text = re.sub('[I][.][T][.]',' ',text)
    text = re.sub('[I][.][A][.]',' ',text)
    text = re.sub('[I][.][L][.]',' ',text)
    text = re.sub('[I][.][B][.]',' ',text)
    text = re.sub('[I][.][N][.]',' ',text)
    text = re.sub('[I][.][V][.]',' ',text)
    text = re.sub('[V][.][M][.]',' ',text)
    text = re.sub('[V][.][A][.]',' ',text)

    return text

dfLimpio = df1.apply(limpiarTexto3)
dfLimpio = dfLimpio.apply(limpiarTexto2)
#pprint(dfLimpio[0])

#BÚSQUEDA DE TÍTULOS

## Títulos con números romanos y en mayúsculas

In [9]:
# Busca títulos en mayusculas
def buscarTitulosMayusculas(text):
    # Expresión regular para encontrar títulos en mayusculas.
    tituloMayusculas =re.compile(r'(I{1,3}|IV|V|VI{1,3}|IX|X)[-.) ]+[A-Z -]+[\:\.\n]')
    titulosMayusculasEncontrados = []

    for m in tituloMayusculas.finditer(text):
        if len(m.group()) > 8:
            titulosMayusculasEncontrados.append(m.group())

    return titulosMayusculasEncontrados
# titulosMAyuscula: lista que guarda los títulos en mayusculas
titulosMayusculas=[]
for expediente in dfLimpio:
    titulosMayusculas.append(buscarTitulosMayusculas(expediente))

dfTitulosMayusculasConStops= pd.DataFrame(titulosMayusculas)

In [10]:
# Creamos un dataframe para guardar expedientes que solamente tienen títulos en mayúsculas
# Guardo en dfTitulosMayusculas los expedientes y su índice dentro del dataframe limpio (con todos los expedientes)
expConTitulosMayusculas = []
expSinTitulosEncontrados = []
for id,titulo in  enumerate(dfLimpio):
    if len(titulosMayusculas[id]) > 0:
        expConTitulosMayusculas.append((id,titulo))
    else:
        expSinTitulosEncontrados.append((id,titulo))

dfTitulosMayusculas = pd.DataFrame(expConTitulosMayusculas, columns=['id','expediente'])
#Dataframe de los expedientes que no tienen titulos en mayusculas
dfSinTitulosEncontrados = pd.DataFrame(expSinTitulosEncontrados, columns=['id','expediente'])

In [11]:
dfTitulosMayusculas['expediente'] = dfTitulosMayusculas['expediente'].apply(general)
dfTitulosMayusculas['expediente'] = dfTitulosMayusculas['expediente'].apply(limpiar_palabras)
dfTitulosMayusculas['expediente'] = dfTitulosMayusculas['expediente'].apply(remove_stops)
#dfTitulosMayusculas.expediente[0]

####Dataframe con la posición de los titulos en romano y mayusculas

In [12]:
# Encontrar la ubicacion del titulo en el documento
titulosPosicion1=[]
tipo='encontrado'
for i in range(len(dfLimpio)):
    for titulo in titulosMayusculas[i]:
        inicioTitulo = dfLimpio[i].index(titulo)
        caracteres = len(titulo)
        titulo = re.sub('[-]',' ',titulo)
        titulo = limpiar_palabras(titulo)
        palabrasPorTitulo = len(titulo.split())
        finalTitulo = inicioTitulo + caracteres
        titulosPosicion1.append((i, tipo, titulo, inicioTitulo, finalTitulo, palabrasPorTitulo))

dfUbicacionTitulos = pd.DataFrame(titulosPosicion1,columns=['id','tipo','titulo','inicio','fin','longitud'])


In [13]:
dfUbicacionTitulos['titulo'] = dfUbicacionTitulos['titulo'].apply(general)
dfUbicacionTitulos['titulo'] = dfUbicacionTitulos['titulo'].apply(limpiar_palabras)
dfUbicacionTitulos['titulo'] = dfUbicacionTitulos['titulo'].apply(remove_stops)
#dfUbicacionTitulos

In [14]:
dfUbicacionTitulosLimpio = dfUbicacionTitulos.drop(dfUbicacionTitulos[dfUbicacionTitulos['titulo'] ==''].index)
#Aplicando esto, de 5810 filas, se reduce a 5408. 

####Counvectorizer de los titulos en romanos y mayusculas

In [15]:
#Se convierte a lista
corpusTitulosMayusculas = dfUbicacionTitulosLimpio['titulo'].tolist()
len(corpusTitulosMayusculas)
#ELIMINAR LOS ESPACIOS VACIOS

5408

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vecTitulosMayusculas = CountVectorizer(ngram_range = (1,1))
vectorMayusculas = vecTitulosMayusculas.fit_transform(corpusTitulosMayusculas)

print("Vocabulario: ", vecTitulosMayusculas.vocabulary_) 
vecTitulosMayusculas.transform(corpusTitulosMayusculas) 

pd.DataFrame(vectorMayusculas.toarray(), index = ['titulo' + str(i)
  for i in range(1, 1+len(corpusTitulosMayusculas))],
  columns = vecTitulosMayusculas.get_feature_names())

#PROEMIO OCUPA LA POSICION 761
#EN REALIDAD LA MAQUINA NO ENTIENDE ESTAS PALABRAS. INTERNAMENTE LO NUMERA CON LA POSICIÓN QUE OCUPA

In [17]:
#Para saber en que posición se encuentra x palabra
vecTitulosMayusculas.vocabulary_.get('proemio')

760

In [None]:
#Con un min_df = 3. Por columna (palabra) tiene que estar al menos tres veces.
#Disminuye la dimensionalidad
from sklearn.feature_extraction.text import CountVectorizer
vecTitulosMayusculasMin = CountVectorizer(ngram_range = (1,3), min_df = 10)#,max_features=20
vectorMayusculasMin = vecTitulosMayusculasMin.fit_transform(corpusTitulosMayusculas)

print("Vocabulario: ", vecTitulosMayusculasMin.vocabulary_) 
vecTitulosMayusculasMin.transform(corpusTitulosMayusculas) 

pd.DataFrame(vectorMayusculasMin.toarray(), index = ['titulo' + str(i)
  for i in range(1, 1+len(corpusTitulosMayusculas))],
  columns = vecTitulosMayusculasMin.get_feature_names())

In [19]:
#Bigramas
from sklearn.feature_extraction.text import CountVectorizer
vecTitulosMayusculasBig = CountVectorizer(ngram_range = (2,2), min_df= 3, max_features=10)
vectorMayusculasBig = vecTitulosMayusculasBig.fit_transform(corpusTitulosMayusculas)

print("Vocabulario: ", vecTitulosMayusculasBig.vocabulary_) 
vecTitulosMayusculasBig.transform(corpusTitulosMayusculas) 

pd.DataFrame(vectorMayusculasBig.toarray(), index = ['titulo' + str(i)
  for i in range(1, 1+len(corpusTitulosMayusculas))],
  columns = vecTitulosMayusculasBig.get_feature_names())

Vocabulario:  {'interes medico': 6, 'medico legal': 7, 'consideraciones medico': 3, 'medico legales': 8, 'contestacion puntos': 4, 'puntos pericia': 9, 'historia clinica': 5, 'conclusiones medico': 2, 'antecedentes autos': 0, 'autos interes': 1}


Unnamed: 0,antecedentes autos,autos interes,conclusiones medico,consideraciones medico,contestacion puntos,historia clinica,interes medico,medico legal,medico legales,puntos pericia
titulo1,0,0,0,0,0,0,0,0,0,0
titulo2,0,0,0,0,0,0,1,1,0,0
titulo3,0,0,0,0,0,0,0,0,0,0
titulo4,0,0,0,1,0,0,0,0,1,0
titulo5,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
titulo5404,0,0,0,0,0,0,0,0,0,0
titulo5405,0,0,0,0,0,1,0,0,0,0
titulo5406,0,0,0,0,0,0,0,0,0,0
titulo5407,0,0,0,0,0,0,0,0,0,0


In [20]:
#Trigramas
from sklearn.feature_extraction.text import CountVectorizer
vecTitulosMayusculasTrig = CountVectorizer(ngram_range = (3,3), min_df= 2,max_features=10)
vectorMayusculasTrig = vecTitulosMayusculasTrig.fit_transform(corpusTitulosMayusculas)

print("Vocabulario: ", vecTitulosMayusculasTrig.vocabulary_) 
vecTitulosMayusculasTrig.transform(corpusTitulosMayusculas) 

pd.DataFrame(vectorMayusculasTrig.toarray(), index = ['titulo' + str(i)
  for i in range(1, 1+len(corpusTitulosMayusculas))],
  columns = vecTitulosMayusculasTrig.get_feature_names())

Vocabulario:  {'interes medico legal': 8, 'consideraciones medico legales': 4, 'contestacion puntos pericia': 6, 'conclusiones medico legales': 3, 'analisis integral problematica': 0, 'antecedentes autos interes': 1, 'autos interes medico': 2, 'historia clinica actor': 7, 'respuesta puntos pericia': 9, 'contesta puntos pericia': 5}


Unnamed: 0,analisis integral problematica,antecedentes autos interes,autos interes medico,conclusiones medico legales,consideraciones medico legales,contesta puntos pericia,contestacion puntos pericia,historia clinica actor,interes medico legal,respuesta puntos pericia
titulo1,0,0,0,0,0,0,0,0,0,0
titulo2,0,0,0,0,0,0,0,0,1,0
titulo3,0,0,0,0,0,0,0,0,0,0
titulo4,0,0,0,0,1,0,0,0,0,0
titulo5,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
titulo5404,0,0,0,0,0,0,0,0,0,0
titulo5405,0,0,0,0,0,0,0,0,0,0
titulo5406,0,0,0,0,0,0,0,0,0,0
titulo5407,0,0,0,0,0,0,0,0,0,0


In [21]:
#Cuatrigrama
from sklearn.feature_extraction.text import CountVectorizer
vecTitulosMayusculasCuat = CountVectorizer(ngram_range = (4,4), min_df= 10, max_features=10)
vectorMayusculasCuat = vecTitulosMayusculasCuat.fit_transform(corpusTitulosMayusculas)

print("Vocabulario: ", vecTitulosMayusculasCuat.vocabulary_) 
vecTitulosMayusculasCuat.transform(corpusTitulosMayusculas) 

pd.DataFrame(vectorMayusculasCuat.toarray(), index = ['titulo' + str(i)
  for i in range(1, 1+len(corpusTitulosMayusculas))],
  columns = vecTitulosMayusculasCuat.get_feature_names())

Vocabulario:  {'antecedentes interes medico legal': 2, 'consideraciones conclusiones medico legales': 5, 'procedimientos llevados cabo realizacion': 8, 'analisis integral problematica tecnicas': 0, 'contestacion puntos pericia solicitados': 6, 'antecedentes autos interes medico': 1, 'autos interes medico legal': 3, 'puntos pericia medica parte': 9, 'discusion consideraciones medico legales': 7, 'autos interes medico legales': 4}


Unnamed: 0,analisis integral problematica tecnicas,antecedentes autos interes medico,antecedentes interes medico legal,autos interes medico legal,autos interes medico legales,consideraciones conclusiones medico legales,contestacion puntos pericia solicitados,discusion consideraciones medico legales,procedimientos llevados cabo realizacion,puntos pericia medica parte
titulo1,0,0,0,0,0,0,0,0,0,0
titulo2,0,0,1,0,0,0,0,0,0,0
titulo3,0,0,0,0,0,0,0,0,0,0
titulo4,0,0,0,0,0,0,0,0,0,0
titulo5,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
titulo5404,0,0,0,0,0,0,0,0,0,0
titulo5405,0,0,0,0,0,0,0,0,0,0
titulo5406,0,0,0,0,0,0,0,0,0,0
titulo5407,0,0,0,0,0,0,0,0,0,0


In [22]:
#Pentagramas
from sklearn.feature_extraction.text import CountVectorizer
vecTitulosMayusculasPen = CountVectorizer(ngram_range = (5,5), min_df=2, max_features=10)
vectorMayusculasPen = vecTitulosMayusculasPen.fit_transform(corpusTitulosMayusculas)

print("Vocabulario: ", vecTitulosMayusculasPen.vocabulary_) 
vecTitulosMayusculasPen.transform(corpusTitulosMayusculas) 

pd.DataFrame(vectorMayusculasPen.toarray(), index = ['titulo' + str(i)
  for i in range(1, 1+len(corpusTitulosMayusculas))],
  columns = vecTitulosMayusculasPen.get_feature_names())

Vocabulario:  {'valoracion secuela fisica segun baremo': 9, 'secuela fisica segun baremo enfermedades': 8, 'antecedentes autos interes medico legal': 1, 'puntos pericia medica parte actora': 5, 'puntos pericia psicologica parte actora': 7, 'puntos pericia medica parte demandada': 6, 'antecedentes autos importancia medico legal': 0, 'antecedentes autos interes medico legales': 2, 'documentacion importancia medicolegal obrante autos': 3, 'documental interes medico legal analizada': 4}


Unnamed: 0,antecedentes autos importancia medico legal,antecedentes autos interes medico legal,antecedentes autos interes medico legales,documentacion importancia medicolegal obrante autos,documental interes medico legal analizada,puntos pericia medica parte actora,puntos pericia medica parte demandada,puntos pericia psicologica parte actora,secuela fisica segun baremo enfermedades,valoracion secuela fisica segun baremo
titulo1,0,0,0,0,0,0,0,0,0,0
titulo2,0,0,0,0,0,0,0,0,0,0
titulo3,0,0,0,0,0,0,0,0,0,0
titulo4,0,0,0,0,0,0,0,0,0,0
titulo5,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
titulo5404,0,0,0,0,0,0,0,0,0,0
titulo5405,0,0,0,0,0,0,0,0,0,0
titulo5406,0,0,0,0,0,0,0,0,0,0
titulo5407,0,0,0,0,0,0,0,0,0,0


In [23]:
vectorMayusculasPen.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

####TfIdfVectorizer de los titulos en romanos y mayusculas

In [24]:
#CON TFIDF SE HACEN LOS TRES PASOS A LA VEZ (CALCULA EL CONTEO DE LAS PALABRAS (tf), LOS VALORES DE FRECUENCIA DE DOCUMENTO INVERSA (dft) Y LAS PUNTUACIONES DE TFIDF)
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfTitulosMayusculas = TfidfVectorizer(ngram_range=(1,3),min_df=10, max_features=15)
vectorTfidfMayusculas = tfidfTitulosMayusculas.fit_transform(corpusTitulosMayusculas)

print("Vocabulario: ", tfidfTitulosMayusculas.vocabulary_) 
tfidfTitulosMayusculas.transform(corpusTitulosMayusculas) 

pd.DataFrame(vectorTfidfMayusculas.toarray(), index = ['titulo'+str(i)
  for i in range(1, 1+len(corpusTitulosMayusculas))], 
  columns=tfidfTitulosMayusculas.get_feature_names())


Vocabulario:  {'antecedentes': 0, 'medico': 8, 'legal': 6, 'medico legal': 9, 'consideraciones': 3, 'legales': 7, 'consideraciones medico': 4, 'medico legales': 10, 'consideraciones medico legales': 5, 'conclusiones': 2, 'puntos': 13, 'pericia': 11, 'puntos pericia': 14, 'petitorio': 12, 'autos': 1}


Unnamed: 0,antecedentes,autos,conclusiones,consideraciones,consideraciones medico,consideraciones medico legales,legal,legales,medico,medico legal,medico legales,pericia,petitorio,puntos,puntos pericia
titulo1,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
titulo2,0.485978,0.0,0.000000,0.000000,0.000000,0.000000,0.546104,0.000000,0.409103,0.546104,0.000000,0.0,0.0,0.0,0.0
titulo3,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
titulo4,0.000000,0.0,0.000000,0.398723,0.450564,0.452103,0.000000,0.400945,0.333613,0.000000,0.401946,0.0,0.0,0.0,0.0
titulo5,0.000000,0.0,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
titulo5404,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,1.0,0.0,0.0
titulo5405,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
titulo5406,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
titulo5407,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.0,0.0


In [25]:
#Bigrama
#Sin el min muestra 1100 columnas
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfTitulosMayusculasBig = TfidfVectorizer(ngram_range=(2,2), min_df=3, max_features=10)
vectorTfidfMayusculasBig = tfidfTitulosMayusculasBig.fit_transform(corpusTitulosMayusculas)

pd.DataFrame(vectorTfidfMayusculasBig.toarray(), index = ['titulo'+str(i)
  for i in range(1, 1+len(corpusTitulosMayusculas))], 
  columns=tfidfTitulosMayusculasBig.get_feature_names())


Unnamed: 0,antecedentes autos,autos interes,conclusiones medico,consideraciones medico,contestacion puntos,historia clinica,interes medico,medico legal,medico legales,puntos pericia
titulo1,0.0,0.0,0.00000,0.00000,0.0,0.0,0.000000,0.00000,0.00000,0.0
titulo2,0.0,0.0,0.00000,0.00000,0.0,0.0,0.728618,0.68492,0.00000,0.0
titulo3,0.0,0.0,0.00000,0.00000,0.0,0.0,0.000000,0.00000,0.00000,0.0
titulo4,0.0,0.0,0.00000,0.74622,0.0,0.0,0.000000,0.00000,0.66570,0.0
titulo5,0.0,0.0,0.00000,0.00000,0.0,0.0,0.000000,0.00000,0.00000,0.0
...,...,...,...,...,...,...,...,...,...,...
titulo5404,0.0,0.0,0.00000,0.00000,0.0,0.0,0.000000,0.00000,0.00000,0.0
titulo5405,0.0,0.0,0.00000,0.00000,0.0,1.0,0.000000,0.00000,0.00000,0.0
titulo5406,0.0,0.0,0.00000,0.00000,0.0,0.0,0.000000,0.00000,0.00000,0.0
titulo5407,0.0,0.0,0.00000,0.00000,0.0,0.0,0.000000,0.00000,0.00000,0.0


In [26]:
#Trigrama

from sklearn.feature_extraction.text import TfidfVectorizer

tfidfTitulosMayusculasTrig = TfidfVectorizer(ngram_range=(3,3), min_df=3, max_features=10)
vectorTfidfMayusculasTrig = tfidfTitulosMayusculasTrig.fit_transform(corpusTitulosMayusculas)

pd.DataFrame(vectorTfidfMayusculasTrig.toarray(), index = ['titulo'+str(i)
  for i in range(1, 1+len(corpusTitulosMayusculas))], 
  columns=tfidfTitulosMayusculasTrig.get_feature_names_out())

AttributeError: 'TfidfVectorizer' object has no attribute 'get_feature_names_out'

In [None]:
#Cuatrigrama
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfTitulosMayusculasCuat = TfidfVectorizer(ngram_range=(4,4), min_df=3)
vectorTfidfMayusculasCuat = tfidfTitulosMayusculasCuat.fit_transform(corpusTitulosMayusculas)

pd.DataFrame(vectorTfidfMayusculasCuat.toarray(), index = ['titulo'+str(i)
  for i in range(1, 1+len(corpusTitulosMayusculas))], 
  columns=tfidfTitulosMayusculasCuat.get_feature_names_out())

Unnamed: 0,analisis funciones psiquicas evaluado,analisis integral problematica tecnicas,antecedentes autos importancia medico,antecedentes autos interes medico,antecedentes autos interes medicolegal,antecedentes importancia medico legal,antecedentes interes medico legal,antecedentes obrantes autos interes,antecedentes valor medico legal,autos importancia medico legal,...,puntos pericia psicologica parte,puntos periciales parte actora,reaccion vivencial anormal neurotica,respuesta puntos pericia partes,respuesta puntos pericia propuestos,respuesta puntos pericia solicitados,secuela fisica segun baremo,tobillo izquierdo frente perfil,valoracion secuela fisica segun,vengo presentar pericia medica
titulo1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
titulo2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
titulo3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
titulo4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
titulo5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
titulo5806,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
titulo5807,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
titulo5808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
titulo5809,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##Títulos con números latinos y romanos en mayúsculas. Escritos en mayúsculas y minúsculas.

In [None]:
# Busca Títulos con números latinos y romanos en mayúsculas. Escritos en mayúsculas y minúsculas.
def buscarTitulosLatinosMayusculas(text):
    # Expresión regular para encontrar títulos en mayusculas.
    tituloMayusculas =re.compile(r'((I{1,3}|IV|V|VI{1,3}|IX|X)|[0-9])[.)-]+[A-Za-z -]+[:\.\n]') #[1-9]|
    titulosMayusculasEncontrados = []

    for m in tituloMayusculas.finditer(text):
        if len(m.group()) > 8 and len(m.group().split()) <= 5:# and re.findall(str(i),m.group()):
            titulosMayusculasEncontrados.append(m.group())
            #i=i+1

    return titulosMayusculasEncontrados
# titulosMAyuscula: lista que guarda los títulos en mayusculas
titulosLatinosMayusculas=[]
for expediente in dfSinTitulosEncontrados['expediente']:
    #if len(buscarTitulosLatinosMayusculas(expediente)) > 3:
        titulosLatinosMayusculas.append(buscarTitulosLatinosMayusculas(expediente))

In [None]:
# Encontrar la ubicación del titulo candidato en el documento
titulosCandidatos=[]
tipo='candidato'
expSinTitulosEncontrados = dfSinTitulosEncontrados['expediente'].apply(str)
for i in range(len(expSinTitulosEncontrados)):
    for titulo in titulosLatinosMayusculas[i]:
        inicioTitulo = expSinTitulosEncontrados[i].index(titulo)
        caracteres = len(titulo)
        titulo = re.sub('[-]',' ',titulo)
        titulo = limpiar_palabras(titulo)
        palabrasPorTitulo = len(titulo.split())
        finalTitulo = inicioTitulo + caracteres
        titulosCandidatos.append((i, tipo, titulo, inicioTitulo, finalTitulo, palabrasPorTitulo))

dfTitulosCandidatos = pd.DataFrame(titulosCandidatos,columns=['id','tipo','titulo','inicio','fin','longitud'])
#dfTitulosCandidatos

In [None]:
dfTitulosCandidatos['titulo'] = dfTitulosCandidatos['titulo'].apply(general)
dfTitulosCandidatos['titulo'] = dfTitulosCandidatos['titulo'].apply(limpiar_palabras)
dfTitulosCandidatos['titulo'] = dfTitulosCandidatos['titulo'].apply(remove_stops)
#dfTitulosCandidatos

####CountVectorizer de los titulos candidatos

In [None]:
#Se convierte a lista
corpusTitulosCandidatos = dfTitulosCandidatos['titulo'].tolist()
len(corpusTitulosCandidatos)
#ELIMINAR LOS ESPACIOS VACIOS

11182

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vecTitulosCandidatos = CountVectorizer(ngram_range = (1,3), min_df= 10 ,max_features=15)
vectorCandidatos = vecTitulosCandidatos.fit_transform(corpusTitulosCandidatos)

print("Vocabulario: ", vecTitulosCandidatos.vocabulary_) 
vecTitulosCandidatos.transform(corpusTitulosCandidatos) 

pd.DataFrame(vectorCandidatos.toarray(), index = ['titulo' + str(i)
  for i in range(1, 1+len(corpusTitulosCandidatos))],
  columns = vecTitulosCandidatos.get_feature_names())

Vocabulario:  {'examen': 5, 'fisico': 7, 'examen fisico': 6, 'consideraciones': 2, 'medico': 11, 'legales': 10, 'consideraciones medico': 3, 'medico legales': 12, 'consideraciones medico legales': 4, 'conclusiones': 1, 'informe': 9, 'psicodiagnostico': 13, 'antecedentes': 0, 'incapacidad': 8, 'respuesta': 14}


Unnamed: 0,antecedentes,conclusiones,consideraciones,consideraciones medico,consideraciones medico legales,examen,examen fisico,fisico,incapacidad,informe,legales,medico,medico legales,psicodiagnostico,respuesta
titulo1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0
titulo2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
titulo3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
titulo4,0,0,1,1,1,0,0,0,0,0,1,1,1,0,0
titulo5,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
titulo11178,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
titulo11179,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
titulo11180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
titulo11181,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
#Minimo de 3

from sklearn.feature_extraction.text import CountVectorizer
vecTitulosCandidatosMin = CountVectorizer(ngram_range = (1,1), min_df=3)
vectorCandidatosMin = vecTitulosCandidatosMin.fit_transform(corpusTitulosCandidatos)

pd.DataFrame(vectorCandidatosMin.toarray(), index = ['titulo' + str(i)
  for i in range(1, 1+len(corpusTitulosCandidatos))],
  columns = vecTitulosCandidatosMin.get_feature_names_out())

Unnamed: 0,abduccion,abombamiento,accidente,acredita,actitud,actividad,actor,actora,actual,actualmente,...,vicente,villasenor,vinculante,visual,visum,vivencial,volcado,wiesel,york,zimmerman
titulo1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
titulo2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
titulo3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
titulo4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
titulo5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
titulo11178,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
titulo11179,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
titulo11180,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
titulo11181,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#Bigrama
#Si no le pongo un minimo, son 2423 columnas

from sklearn.feature_extraction.text import CountVectorizer
vecTitulosCandidatosBig = CountVectorizer(ngram_range = (2,2), min_df=3,max_features=10)
vectorCandidatosBig = vecTitulosCandidatosBig.fit_transform(corpusTitulosCandidatos)

pd.DataFrame(vectorCandidatosBig.toarray(), index = ['titulo' + str(i)
  for i in range(1, 1+len(corpusTitulosCandidatos))],
  columns = vecTitulosCandidatosBig.get_feature_names())

Unnamed: 0,antecedentes personales,asociacion ideas,consideraciones medico,datos personales,estudios complementarios,examen fisico,examenes complementarios,historia clinica,medico legales,remitir informe
titulo1,0,0,0,0,0,1,0,0,0,0
titulo2,0,0,0,0,0,0,0,0,0,0
titulo3,0,0,0,0,0,0,1,0,0,0
titulo4,0,0,1,0,0,0,0,0,1,0
titulo5,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
titulo11178,0,0,0,0,0,0,0,0,0,0
titulo11179,0,0,0,0,0,0,0,0,0,0
titulo11180,0,0,0,0,0,0,0,0,0,0
titulo11181,0,0,0,0,0,0,0,0,0,0


In [None]:
#Trigrama

from sklearn.feature_extraction.text import CountVectorizer
vecTitulosCandidatosTrig = CountVectorizer(ngram_range = (3,3), min_df=3,max_features=10)
vectorCandidatosTrig = vecTitulosCandidatosTrig.fit_transform(corpusTitulosCandidatos)

pd.DataFrame(vectorCandidatosTrig.toarray(), index = ['titulo' + str(i)
  for i in range(1, 1+len(corpusTitulosCandidatos))],
  columns = vecTitulosCandidatosTrig.get_feature_names())

Unnamed: 0,anamnesis examen fisico,anatomo clinico funcional,antecedentes personales familiares,conclusiones medico legales,consideraciones medico legales,examen anatomo clinico,item examen fisico,orientacion autopsiquica alopsiquica,posibilidades reubicacion laboral,remitirse examen fisico
titulo1,0,0,0,0,0,0,0,0,0,0
titulo2,0,0,0,0,0,0,0,0,0,0
titulo3,0,0,0,0,0,0,0,0,0,0
titulo4,0,0,0,0,1,0,0,0,0,0
titulo5,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
titulo11178,0,0,0,0,0,0,0,0,0,0
titulo11179,0,0,0,0,0,0,0,0,0,0
titulo11180,0,0,0,0,0,0,0,0,0,0
titulo11181,0,0,0,0,0,0,0,0,0,0


In [None]:
#Cuatrigrama

from sklearn.feature_extraction.text import CountVectorizer
vecTitulosCandidatosCuat = CountVectorizer(ngram_range = (4,4), min_df=3,max_features=10)
vectorCandidatosCuat = vecTitulosCandidatosCuat.fit_transform(corpusTitulosCandidatos)

pd.DataFrame(vectorCandidatosCuat.toarray(), index = ['titulo' + str(i)
  for i in range(1, 1+len(corpusTitulosCandidatos))],
  columns = vecTitulosCandidatosCuat.get_feature_names())

Unnamed: 0,antecedentes interes medico legal,antecedentes medico legales importancia,conclusiones consideraciones medico legales,considero actualmente tratamiento quirurgico,examen anatomo clinico funcional,examen clinico semiologico actor,historia clinica hospital italiano,pericia medica incapacidad otorgada,refiere haber requerido tratamiento
titulo1,0,0,0,0,0,0,0,0,0
titulo2,0,0,0,0,0,0,0,0,0
titulo3,0,0,0,0,0,0,0,0,0
titulo4,0,0,0,0,0,0,0,0,0
titulo5,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
titulo11178,0,0,0,0,0,0,0,0,0
titulo11179,0,0,0,0,0,0,0,0,0
titulo11180,0,0,0,0,0,0,0,0,0
titulo11181,0,0,0,0,0,0,0,0,0


In [None]:
#Pentagrama
#Se repite una sola vez

from sklearn.feature_extraction.text import CountVectorizer
vecTitulosCandidatosPen = CountVectorizer(ngram_range = (5,5))
vectorCandidatosPen = vecTitulosCandidatosPen.fit_transform(corpusTitulosCandidatos)

pd.DataFrame(vectorCandidatosPen.toarray(), index = ['titulo' + str(i)
  for i in range(1, 1+len(corpusTitulosCandidatos))],
  columns = vecTitulosCandidatosPen.get_feature_names())

Unnamed: 0,perito presenta informe medico legal
titulo1,0
titulo2,0
titulo3,0
titulo4,0
titulo5,0
...,...
titulo11178,0
titulo11179,0
titulo11180,0
titulo11181,0


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfTitulosCandidatos = TfidfVectorizer(ngram_range=(1,3),min_df=10, max_features=15)
vectorTfidfCandidatos = tfidfTitulosCandidatos.fit_transform(corpusTitulosCandidatos)

print("Vocabulario: ", tfidfTitulosCandidatos.vocabulary_) 
tfidfTitulosCandidatos.transform(corpusTitulosCandidatos) 

pd.DataFrame(vectorTfidfCandidatos.toarray(), index = ['titulo'+str(i)
  for i in range(1, 1+len(corpusTitulosCandidatos))], 
  columns=tfidfTitulosCandidatos.get_feature_names())

Vocabulario:  {'examen': 5, 'fisico': 7, 'examen fisico': 6, 'consideraciones': 2, 'medico': 11, 'legales': 10, 'consideraciones medico': 3, 'medico legales': 12, 'consideraciones medico legales': 4, 'conclusiones': 1, 'informe': 9, 'psicodiagnostico': 13, 'antecedentes': 0, 'incapacidad': 8, 'respuesta': 14}


Unnamed: 0,antecedentes,conclusiones,consideraciones,consideraciones medico,consideraciones medico legales,examen,examen fisico,fisico,incapacidad,informe,legales,medico,medico legales,psicodiagnostico,respuesta
titulo1,0.0,0.000000,0.000000,0.000000,0.000000,0.512305,0.608163,0.606367,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
titulo2,0.0,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
titulo3,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
titulo4,0.0,0.000000,0.400809,0.422726,0.422726,0.000000,0.000000,0.000000,0.0,0.0,0.407383,0.387079,0.407635,0.0,0.0
titulo5,0.0,0.517702,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.502053,0.477031,0.502364,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
titulo11178,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
titulo11179,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
titulo11180,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
titulo11181,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0
