# Código redes de sentencias de la Corte Constitucional

    Autores: Andrés Felipe Patiño y Juan Carlos Rodríguez Raga
    Universidad de los Andes
    2020-20

In [None]:
# paquetes necesarios para la extracción de sentencias
import os
import requests
import urllib.parse
from bs4 import BeautifulSoup
import re
from urllib.request import urlopen
from random import sample
# paquetes necesarios para constituir la red
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import infomap
import numpy as np

## Extracción de sentencias

In [None]:
# Folder where the downloaded rulings are stored.
folder_location = './sentencias_completas'
# exist_ok avoids the check-then-create race and a failure when the folder is already there.
os.makedirs(folder_location, exist_ok=True)

In [None]:
# Values substituted into the search URL of the court's site:
# two-digit year codes for the 1990s (añosn) and for 2000-2020 (añosd),
# plus the result-page indices requested for every year.
añosn = [str(codigo) for codigo in range(92, 100)]
añosd = [str(codigo).zfill(2) for codigo in range(21)]
pages = [str(pagina) for pagina in range(2)]

In [None]:
# Ruling downloader for the years listed in añosn.
# Builds one search URL per (year, page), collects the first link of every
# result cell and saves the linked document into folder_location.
# This cell has no retry logic: any network error aborts it.
urls = []
for i in añosn:
    for p in pages:
        url = "https://www.corteconstitucional.gov.co/relatoria/radicador/buscar.php?ponente=&demandado=&Sentencia=&Tipo=Sentencias&proceso=&busqueda=&conector=AND&segundotema=&anios="+str(i)+"&pg="+str(p)+"&vs=0&accion=Buscar"
        urls.append(url)
links = []
for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    results = soup.find_all('td', attrs={'width': '100%'})
    for result in results:
        hipervinculo = result.find("a")
        if hipervinculo is None:
            continue
        link = hipervinculo.get("href")
        if not link:
            # An anchor without href would make link.split() fail below.
            continue
        nombre_archivo = link.split('/')[-1]
        if not nombre_archivo:
            # A link ending in "/" has no file name to save under.
            continue
        link_completo = urllib.parse.urljoin(url, link)
        links.append(link_completo)
        filename = os.path.join(folder_location, nombre_archivo)
        with open(filename, 'wb') as f:
            f.write(requests.get(link_completo).content)

In [None]:
# Ruling downloader for the years listed in añosd, with retries on network errors.
# NOTE(review): `links` is reset here, so after this cell it only holds the
# links gathered for añosd — confirm that is intended.
MAX_INTENTOS = 5  # attempts per search page before giving up on it

urls = []
for i in añosd:
    for p in pages:
        url = "https://www.corteconstitucional.gov.co/relatoria/radicador/buscar.php?ponente=&demandado=&Sentencia=&Tipo=Sentencias&proceso=&busqueda=&conector=AND&segundotema=&anios="+str(i)+"&pg="+str(p)+"&vs=0&accion=Buscar"
        urls.append(url)
links = []
for url in urls:
    for intento in range(MAX_INTENTOS):
        # Links are buffered per attempt so that a retried page is not
        # appended to `links` more than once.
        enlaces_pagina = []
        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.text, "html.parser")
            results = soup.find_all('td', attrs={'width': '100%'})
            for result in results:
                hipervinculo = result.find("a")
                if hipervinculo is None:
                    continue
                link = hipervinculo.get("href")
                if not link:
                    continue
                nombre_archivo = link.split('/')[-1]
                if not nombre_archivo:
                    continue
                link_completo = urllib.parse.urljoin(url, link)
                enlaces_pagina.append(link_completo)
                filename = os.path.join(folder_location, nombre_archivo)
                with open(filename, 'wb') as f:
                    f.write(requests.get(link_completo).content)
        except KeyboardInterrupt:
            # Re-raise so the user can actually stop the download.
            print("Someone closed the program")
            raise
        except requests.ConnectionError as e:
            print("OOPS!! Connection Error. Make sure you are connected to Internet. Technical Details given below.\n")
            print(str(e))
            continue
        except requests.Timeout as e:
            print("OOPS!! Timeout Error")
            print(str(e))
            continue
        except requests.RequestException as e:
            print("OOPS!! General Error")
            print(str(e))
            continue
        links.extend(enlaces_pagina)
        break
    else:
        # Every attempt failed: report it and move on instead of looping forever.
        print("Giving up on %s after %d failed attempts" % (url, MAX_INTENTOS))

In [None]:
# Number of ruling links collected by the download cell that ran last.
print(len(links))

In [None]:
# código para extraer solamente una muestra (debe reemplazarse en el cuadro anterior)
#muestra = sample(links,1022)
#for link in muestra:
 #   filename=os.path.join(folder_location,link.split('/')[-1])
  #  with open(filename, 'wb') as f:
   #     f.write(requests.get(link).content)

##  Extracción partes

In [None]:
# Names of the entries in the folder that holds the downloaded rulings.
files = os.listdir("./sentencias_completas/")

In [None]:
# Split every downloaded ruling into sections according to the headings of
# interest (Antecedentes; Consideraciones|Fundamentos; Decisión|Resuelve).
#
# Results:
#   sentdct  maps each ruling id to a dict holding "id" plus whichever of
#            "RES", "CUERPO", "ENC" and "ANT" could be delimited.
#   noencon  lists the files in which no ruling identifier was found.
#
# The splitting is done in memory. Each file is decoded once as Windows-1252,
# so the text being split uses the same characters as the parsed headings.
PATRON_ANT = re.compile(r"ANTECEDENTES|Antecedentes|A.N.T.E.C.E.D.E.N.T.E.S|A.n.t.e.c.e.d.e.n.t.e.s|INFORMACI.N|Informaci.n")
PATRON_CONS = re.compile(r"CONSIDERACIONES|Consideraciones|FUNDAMENTO|Fundamento|F.U.N.D.A.M.E.N.T.O|F.u.n.d.a.m.e.n.t.o")
PATRON_RES = re.compile(r"DECISI.N|Decisi.n|D.E.C.I.S.I...N|D.e.c.i.s.i...|RESUELVE|Resuelve|R.E.S.U.E.L.V.E|R.e.s.u.e.l.v.e")
PATRON_SENT = re.compile(r"\w+.\d{3}-\d{2}|\w+.\d{4}-\d{2}|\w+.\d{3}\w+-\d{2}|\w+.\d{4}\w+-\d{2}")


def buscar_marcadores(patron, fragmentos):
    """Return every match of `patron` found in each string of `fragmentos`, in order."""
    encontrados = []
    for fragmento in fragmentos:
        encontrados.extend(patron.findall(fragmento))
    return encontrados


def partir_en_marcador(marcador, texto):
    """Split `texto` at the first literal occurrence of `marcador`.

    Returns (before, after) with `marcador` prefixed to both halves, which is
    what the section dictionaries have always stored, or None when the marker
    does not occur in `texto`.
    """
    antes, separador, despues = texto.partition(marcador)
    if not separador:
        return None
    return marcador + antes, marcador + despues


sentdct = {}
noencon = []
for nombre_archivo in files:
    ruta = "./sentencias_completas/" + nombre_archivo
    if not os.path.isfile(ruta):
        # Sub-folders (e.g. notebook checkpoints) cannot be opened as rulings.
        continue
    with open(ruta, encoding="Windows-1252", errors='ignore') as f:
        contenido = f.read()
    soup = BeautifulSoup(contenido, "html.parser")

    rawsent = re.sub(r"\.htm", "", nombre_archivo)
    if not PATRON_SENT.search(str(soup)):
        noencon.append("identificador %s" % (rawsent))
        continue
    # File names such as SU111-97 become SU-111-97.
    nsent = rawsent.replace("SU", "SU-")

    # Candidate heading text: bold tags, heading-like tags, "Estilo"
    # paragraphs and MsoHeading paragraphs, with newlines flattened.
    # NOTE(review): the tag-name pattern "h[0-9]|h" is unanchored, so it
    # presumably also selects tags whose name merely contains "h"
    # (html, head, th) — confirm that is intended.
    fragmentos = [
        str(soup.find_all("b")).replace("\n", " "),
        str(soup.find_all(re.compile("h[0-9]|h"))).replace("\n", " "),
        str(soup.find_all("p", class_="Estilo")).replace("\n", " "),
        str(soup.find_all("p", class_=re.compile("MsoHeading[0-9]|MsoHeading"))).replace("\n", " "),
    ]
    ANTt = buscar_marcadores(PATRON_ANT, fragmentos)
    CONSt = buscar_marcadores(PATRON_CONS, fragmentos)
    RESt = buscar_marcadores(PATRON_RES, fragmentos)
    # The decision marker used is the last one found.
    RESt.reverse()

    # The markers were matched on newline-flattened text, so the ruling is
    # flattened the same way before being split.
    text = contenido.replace("\n", " ")
    DICTU = {"id": nsent}

    corte_res = partir_en_marcador(RESt[0], text) if RESt else None
    if corte_res is None:
        # No usable decision heading: keep the whole ruling as body.
        DICTU["CUERPO"] = text
        print("%s_error5" % (nsent))
    else:
        antes_res, DICTU["RES"] = corte_res
        corte_cons = partir_en_marcador(CONSt[0], antes_res) if CONSt else None
        if corte_cons is None:
            # No considerations heading: everything before the decision is body.
            DICTU["CUERPO"] = antes_res
        else:
            antes_cons, DICTU["CUERPO"] = corte_cons
            corte_ant = partir_en_marcador(ANTt[0], antes_cons) if ANTt else None
            if corte_ant is None:
                DICTU["ENC"] = antes_cons
            else:
                DICTU["ENC"], DICTU["ANT"] = corte_ant
    sentdct['%s' % (nsent)] = DICTU

In [None]:
# Number of files in which no ruling identifier was found.
print(len(noencon))

## Recolección de precedentes

In [None]:
# Four-digit years recognised in citations; they are shortened to two digits
# so that cited rulings use the same year format as the ruling ids.
nyears = [str(anio) for anio in range(1991, 2000)]
dyears = [str(anio) for anio in range(2000, 2021)]

In [None]:
# Extract the precedents cited by every ruling from the part of interest:
# the body ("CUERPO") and the footnotes of the "RES" part that the body links to.
# netdict holds, in parallel lists, each ruling id ("Id"), its label for Gephi
# ("Label"), its type ("Tipo", first letter of the id, used to filter the
# network) and the list of rulings it cites ("conexiones").
PATRONES_CITA = (
    r"(C|T|SU)[-| ](\d+A|\d+) \w+ \d+ de \w+ \w+ (\d+)",
    r"(C|T|SU)[-| ](\d+A|\d+)/(\d+)",
    r"(C|T|SU)[-| ](\d+) \w+ ([0-9][0-9][0-9][0-9])",
)
PATRON_ID_CITA = r"\w+-\d{3}-\d{2}|\w+-\d{4}-\d{2}|\w+-\d{3}A-\d{2}|\w+-\d{4}A-\d{2}"


def normalizar_cita(tipo, numero, anio):
    """Join a citation as TYPE-NUMBER-YY, shortening a listed four-digit year.

    Only the year group is shortened, by keeping its last two digits. This
    handles "2020" correctly and never alters a ruling number that happens
    to look like a year.
    """
    if anio in nyears or anio in dyears:
        anio = anio[-2:]
    return "-".join((tipo, numero, anio))


netdict = {}
conexiones = []
ids = []
tipos = []
for key in sentdct.keys():
    pint1 = sentdct[key].get("CUERPO") or ""
    pint2 = sentdct[key].get("RES")

    # Footnote paragraphs of the RES part that are referenced from the body.
    resultsdef = []
    if pint2 is not None:
        soupcu = BeautifulSoup(pint1, "html.parser")
        soupres = BeautifulSoup(pint2, "html.parser")
        clearnames = []
        for ancla in soupcu.find_all("a"):
            destino = ancla.get("href")
            if destino is not None:
                clearnames.append(destino.replace("#", ""))
        for nota in soupres.find_all("p", class_="MsoFootnoteText"):
            apart = nota.find("a")
            if apart is not None:
                # A footnote is kept once per body link pointing at it.
                resultsdef.extend([nota] * clearnames.count(apart.get("name")))

    # Each citation format is searched first in the body and then in the
    # selected footnotes (an empty footnote list yields no matches).
    textos = (pint1, str(resultsdef))
    precedentes = []
    for patron in PATRONES_CITA:
        for texto in textos:
            for tipo, numero, anio in re.findall(patron, texto):
                precedentes.append(normalizar_cita(tipo, numero, anio))

    # Keep only the strings that look like a ruling id.
    precedentesf = re.findall(PATRON_ID_CITA, str(precedentes))
    ids.append(key)
    conexiones.append(precedentesf)
    tipos.append(key[0])
netdict["Label"] = ids
netdict["Tipo"] = tipos
netdict["Id"] = ids
netdict["conexiones"] = conexiones
In [None]:
# Number of rulings registered in netdict.
print(len(netdict["Id"]))

## Conformación de la red

### Creación de nodos y arcos

In [None]:
# Build the node dataframe from netdict (columns Label, Tipo, Id, conexiones).
nodos = pd.DataFrame(netdict)
nodos.head()

In [None]:
# Flatten the citation lists into one (source, target) pair per citation.
# Local names are used for the per-ruling list so the module-level
# `conexiones` built by the extraction cell is not overwritten.
sources = []
target = []
for origen, citadas in zip(nodos["Id"].values, nodos["conexiones"].values):
    for citada in citadas:
        sources.append(origen)
        target.append(citada)

In [None]:
# Build the edge dataframe: one row per citation, from the citing ruling (Source) to the cited one (Target).
arcos = pd.DataFrame(list(zip(sources, target)), 
               columns =['Source', 'Target'])
arcos.head()

In [None]:
# Export to Excel without the index column, to ease importing into Gephi.
arcos.to_excel("arcos.xlsx", index = None)
nodos.to_excel("nodos.xlsx", index = None)

In [None]:
# Export to comma-separated files, also without the index column.
nodos.to_csv("nodos_vf.csv", index = None)
arcos.to_csv("arcos_vf.csv", index = None)

### Red

In [None]:
# Reload nodes and edges from the exported CSV files, so the network can be
# built without re-running the previous steps.
nodos = pd.read_csv('nodos_vf.csv', sep=',')
arcos = pd.read_csv('arcos_vf.csv', sep=',')

In [None]:
# Build the directed citation network from the node and edge dataframes.
G = nx.DiGraph()

# Register every ruling as a node, copying each column other than 'Id'
# onto the node as an attribute.
columnas_atributo = [columna for columna in nodos if columna != 'Id']
for fila in range(len(nodos['Id'])):
    atributos = {columna: nodos.at[fila, columna] for columna in columnas_atributo}
    G.add_node(nodos.at[fila, 'Id'], **atributos)

# Add one directed edge per row of the edge dataframe (citing -> cited).
G.add_edges_from(
    (arcos.at[fila, 'Source'], arcos.at[fila, 'Target'])
    for fila in range(len(arcos['Source']))
)


In [None]:
# Total number of edges and of nodes in the network.
print(G.number_of_edges())
print(G.number_of_nodes())

### Primeros acercamientos al análisis de redes

In [None]:
# Plot of the whole network (expensive in terms of time).
fig=plt.figure(figsize=(100,100))

# Draw the graph on the figure created above, without node labels.
k = nx.draw(G, node_size=50, node_color='lightgreen', edge_color='lightgray', with_labels=False)
plt.axis("tight")
plt.show()

In [None]:
# Degree distribution of the network.
grado = [grado_nodo for _, grado_nodo in G.degree()]

# Histogram of the node degrees.
a = plt.hist(grado)
plt.title('Histograma del grado de la red')
plt.xlabel("Grado k")
plt.ylabel("N(k)")
plt.show()

# Degree frequencies on log-log axes.
degree_freq = nx.degree_histogram(G)
degrees = range(len(degree_freq))
plt.loglog(degrees, degree_freq, 'o')
plt.title('Distribución logarítmica del grado de la red')
plt.xlabel('Degree')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Find the most cited ruling, i.e. the node with the highest in-degree.
dic_grados = G.in_degree()
mas_conectado = ''
max_grado = 0
for nodo, grado_entrada in dic_grados:
    if grado_entrada > max_grado:
        max_grado = grado_entrada
        # Rulings that only appear as a citation target were added by
        # add_edge without attributes, so fall back to the node id.
        mas_conectado = G.nodes[nodo].get('Label', nodo)

print('El nodo más conectado es', mas_conectado, 'con un grado de', max_grado)

In [None]:
# The 10 most cited rulings: (node, in-degree) pairs sorted by in-degree, highest first.
sorted_deg = sorted(dic_grados, key=lambda item: item[1], reverse=True)
# Slice [0:10] yields exactly ten rows.
pd.DataFrame(sorted_deg[0:10])

In [None]:
# Average in-degree over all nodes of the network.
degrees = [grado_entrada for _, grado_entrada in G.in_degree()]
np.mean(degrees)

In [None]:
# Density of the network.
nx.density(G)

### Detección de comunidades (No se ha probado)

#### Infomap

In [None]:
def findCommunities(G):
    """
    Partition network with the Infomap algorithm.

    Every node that takes part in at least one edge is annotated with a
    'community' attribute holding its module index.

    Parameters:
        G: NetworkX graph; node labels may be any hashable value.

    Returns:
        Number of top-level modules found by Infomap.
    """
    infomapX = infomap.Infomap("--two-level")

    # Nodes are handed to Infomap as consecutive integers and mapped back
    # afterwards, so graphs with string labels (ruling ids) work as well.
    # NOTE(review): presumably Infomap only accepts integer node ids —
    # confirm against the installed infomap version.
    nodo_de = list(G.nodes())
    indice_de = {nodo: indice for indice, nodo in enumerate(nodo_de)}

    print("Building Infomap network from a NetworkX graph...")
    red = infomapX.network()
    for origen, destino in G.edges():
        red.addLink(indice_de[origen], indice_de[destino])

    print("Find communities with Infomap...")
    infomapX.run()

    print("Found {} modules with codelength: {}".format(infomapX.numTopModules(), infomapX.codelength()))

    communities = {}
    for node in infomapX.iterLeafNodes():
        communities[nodo_de[node.physicalId]] = node.moduleIndex()

    nx.set_node_attributes(G, values=communities, name='community')
    return infomapX.numTopModules()

In [None]:
def drawNetwork(G):
    """
    Draw G with its nodes coloured by their 'community' attribute.

    Only nodes that carry a 'community' attribute are drawn and labelled;
    every edge of G is drawn.

    Parameters:
        G: NetworkX graph previously annotated with 'community' node attributes.

    Raises:
        ValueError: if no node of G has a 'community' attribute.
    """
    # position map
    pos = nx.spring_layout(G)
    # community id of each annotated node, looked up by node label
    comunidad_de = nx.get_node_attributes(G, 'community')
    if not comunidad_de:
        raise ValueError("no node has a 'community' attribute; run findCommunities first")
    # Explicit node list keeps nodes and colours aligned even when some
    # nodes have no community.
    nodelist = [n for n in G.nodes() if n in comunidad_de]
    communities = [comunidad_de[n] for n in nodelist]
    numCommunities = max(communities) + 1
    # color map from http://colorbrewer2.org/
    cmapLight = colors.ListedColormap(['#a6cee3', '#b2df8a', '#fb9a99', '#fdbf6f', '#cab2d6'], 'indexed', numCommunities)
    cmapDark = colors.ListedColormap(['#1f78b4', '#33a02c', '#e31a1c', '#ff7f00', '#6a3d9a'], 'indexed', numCommunities)

    # Draw edges
    nx.draw_networkx_edges(G, pos)

    # Draw nodes
    nodeCollection = nx.draw_networkx_nodes(G,
        pos = pos,
        nodelist = nodelist,
        node_color = communities,
        cmap = cmapLight
    )
    # Set node border color to the darker shade
    darkColors = [cmapDark(v) for v in communities]
    nodeCollection.set_edgecolor(darkColors)

    # Draw node labels, coloured with the dark shade of the node's community
    for n in nodelist:
        plt.annotate(n,
            xy = pos[n],
            textcoords = 'offset points',
            horizontalalignment = 'center',
            verticalalignment = 'center',
            xytext = [0, 0],
            color = cmapDark(comunidad_de[n])
        )

    plt.axis('off')
    # plt.savefig("karate.png")
    plt.show()

In [None]:
# Detect communities on the citation network and draw them.
# `K` is not defined anywhere in this notebook; the network built above is `G`.
findCommunities(G)

drawNetwork(G)

#### Louvain

In [None]:
import community

In [None]:
# Undirected version of the citation network, used for the Louvain partition below.
P= nx.to_undirected(G)

In [None]:
# Compute the best partition of P and list each node's community in node order.
partition = community.best_partition(P)
values = [partition.get(node) for node in P.nodes()]

In [None]:
# Draw P with a spring layout, colouring every node by its community.
spring_pos=nx.spring_layout(P)
plt.figure(figsize=(10,10))
nx.draw(P,pos=spring_pos,node_color = values, node_size=30)

## Para hacer pruebas, no usar

#### Códigos para hacer pruebas individuales a cada sentencia

In [None]:
# Scratch cell: look for the section markers of a single ruling (files[1]).
# WARNING: this cell resets sentdct and noencon, discarding the results of
# the full extraction.
patron_ant_prueba = "ANTECEDENTES|Antecedentes|A.N.T.E.C.E.D.E.N.T.E.S|A.n.t.e.c.e.d.e.n.t.e.s|INFORMACI.N|Informaci.n"
patron_cons_prueba = "CONSIDERACIONES|Consideraciones|FUNDAMENTO|Fundamento|F.U.N.D.A.M.E.N.T.O|F.u.n.d.a.m.e.n.t.o"
patron_res_prueba = "DECISI.N|Decisi.n|D.E.C.I.S.I...N|D.e.c.i.s.i...|RESUELVE|Resuelve|R.E.S.U.E.L.V.E|R.e.s.u.e.l.v.e|R\nE.S.U.E.L.V.E|R\ne.s.u.e.l.v.e"
patron_sent_prueba = "([CcTtSs].[0-9][0-9][0-9]-[0-9][0-9]|[CcTtSs].[0-9][0-9][0-9][Aa]-[0-9][0-9]|[CcTtSs].[0-9][0-9][0-9][0-9]-[0-9][0-9]|[CcTtSs].[0-9][0-9][0-9][0-9][Aa]-[0-9][0-9])"

sentdct = {}
noencon = []
archivo_prueba = files[1]
with open(str("./sentencias_completas/"+archivo_prueba), encoding="Windows-1252") as f:
    soup = BeautifulSoup(f, "html.parser")

# Candidate heading text: bold tags, heading-like tags, "Estilo" paragraphs
# and MsoHeading paragraphs (newlines are kept in this cell).
etiquetas_prueba = [
    str(soup.find_all("b")),
    str(soup.find_all(re.compile("h[0-9]|h"))),
    str(soup.find_all("p", class_="Estilo")),
    str(soup.find_all("p", class_=re.compile("MsoHeading[0-9]|MsoHeading"))),
]
sent = re.findall(patron_sent_prueba, str(soup))
nsent = str(re.sub(r"\.htm", "", archivo_prueba))
if sent == []:
    # Report the file by name, as the full extraction does.
    noencon.append("identificador %s" % (nsent))
else:
    # File names such as SU111-97 become SU-111-97.
    nsent = nsent.replace("SU", "SU-")
    DICTU = {"id":nsent}
    ANTt = [m for etiqueta in etiquetas_prueba for m in re.findall(patron_ant_prueba, etiqueta)]
    CONSt = [m for etiqueta in etiquetas_prueba for m in re.findall(patron_cons_prueba, etiqueta)]
    RESt = [m for etiqueta in etiquetas_prueba for m in re.findall(patron_res_prueba, etiqueta)]
    # The decision marker used afterwards is the last one found.
    RESt.reverse()

In [None]:
# Scratch cell: split one ruling into sections using the markers (RESt, CONSt,
# ANTt) and the DICTU/nsent left by the previous scratch cell.
# The split is done in memory instead of through RESm_*/CONSm_*/ANTm_* files.
# The considerations part is stored under "CUERPO", the key the next cell reads.
# NOTE(review): this reads files[3] from ./sample_sentencias/ while the
# markers came from files[1] in ./sentencias_completas/ — confirm which
# ruling is meant.
def partir_prueba(marcador, texto):
    """Split `texto` at the first literal `marcador`; both halves get the marker as prefix.

    Returns None when the marker does not occur in `texto`.
    """
    antes, separador, despues = texto.partition(marcador)
    if not separador:
        return None
    return marcador + antes, marcador + despues


with open("./sample_sentencias/"+files[3], mode="r") as bigfile:
    reader = bigfile.read()

if RESt != [] and CONSt == [] and ANTt != []:
    print("%s_error4" % (nsent))
else:
    corte_res = partir_prueba(RESt[0], reader) if RESt != [] else None
    if corte_res is None:
        print("%s_error5" % (nsent))
    else:
        antes_res, DICTU["RES"] = corte_res
        corte_cons = partir_prueba(CONSt[0], antes_res) if CONSt != [] else None
        if corte_cons is None:
            DICTU["CUERPO"] = antes_res
        else:
            antes_cons, DICTU["CUERPO"] = corte_cons
            corte_ant = partir_prueba(ANTt[0], antes_cons) if ANTt != [] else None
            if corte_ant is None:
                DICTU["ENC"] = antes_cons
            else:
                DICTU["ENC"], DICTU["ANT"] = corte_ant
sentdct['%s' % (nsent)] = DICTU

In [None]:
# Scratch cell: extract the precedents cited by a single ruling (C-1195-01)
# and store them in ardict["sent"].
patrones_cita_prueba = (
    r"(C|T|S)[-| ](\d+A|\d+) del [0-9][0-9] de \w+ de (\d+)",
    r"(C|T|S)[-| ](\d+A|\d+)/(\d+)",
    r"(C|T|S)[-| ](\d+A|\d+) d[e|el] (\d+)",
)


def normalizar_cita_prueba(tipo, numero, anio):
    """Join a citation as TYPE-NUMBER-YY, shortening a listed four-digit year.

    Only the year group is shortened, by keeping its last two digits. This
    handles "2020" correctly and never alters a ruling number that happens
    to look like a year.
    """
    if anio in nyears or anio in dyears:
        anio = anio[-2:]
    return "-".join((tipo, numero, anio))


ardict = {}
pint1 = sentdct["C-1195-01"].get("CUERPO") or ""
pint2 = sentdct["C-1195-01"].get("RES")
soupcu = BeautifulSoup(pint1, "html.parser")
# A ruling without a RES part simply has no footnotes to look at.
soupres = BeautifulSoup(pint2 or "", "html.parser")
results1 = soupcu.find_all("a")
results2 = soupres.find_all("p", class_= "MsoFootnoteText")

if results2 != []:
    # Footnote paragraphs of the RES part that the body links to
    # (kept once per body link pointing at them).
    clearnames = []
    for result1 in results1:
        foot = result1.get("href")
        if foot is not None:
            clearnames.append(foot.replace("#", ""))
    resultsdef = []
    for result2 in results2:
        apart = result2.find("a")
        if apart is not None:
            resultsdef.extend([result2] * clearnames.count(apart.get("name")))
    # The first format is searched in the whole RES text, the other two
    # only in the selected footnotes.
    fuentes_por_patron = (
        (pint1, pint2),
        (pint1, str(resultsdef)),
        (pint1, str(resultsdef)),
    )
else:
    fuentes_por_patron = ((pint1,), (pint1,), (pint1,))

precedentes = []
for patron, fuentes in zip(patrones_cita_prueba, fuentes_por_patron):
    for fuente in fuentes:
        for tipo, numero, anio in re.findall(patron, fuente):
            precedentes.append(normalizar_cita_prueba(tipo, numero, anio))

# Keep only the strings that look like a ruling id.
precedentesf = re.findall(r"[C|T|SU]-\d{3}-\d{2}|[C|T|SU]-\d{4}-\d{2}", str(precedentes))
ardict["sent"] = precedentesf