## Web scrape all trials
**Author:** Daniel Jaramillo

In [None]:
import pandas as pd
import re
from pathlib import Path

from selenium import webdriver
from selenium.common.exceptions import ElementNotInteractableException
from scrap_juicios import scrap_juicios

from fake_useragent import UserAgent

In [None]:
# Define Paths
# `db` is the directory two levels above the current working directory;
# the data files read and written further down are addressed relative to it.
db = Path.cwd().parents[1]

***
### Define functions

First, a function that obtains the scraped data for a single query

In [None]:
def juicios(driver, dep_judicial, year, n_attempt):
    """Scrape a single process number and return the outcome as a DataFrame.

    Args:
        driver: Browser driver, forwarded unchanged to ``scrap_juicios``.
        dep_judicial: Dependency code, forwarded unchanged to ``scrap_juicios``.
        year: Year of the process, forwarded unchanged to ``scrap_juicios``.
        n_attempt: Sequential process number (non-negative int). It is
            left-padded with zeros to five characters before being sent.

    Returns:
        pandas.DataFrame with one row per item returned by ``scrap_juicios``
        (each item is expanded with ``pd.DataFrame(item, index=[0])``).
        An empty DataFrame when ``scrap_juicios`` returns no items.
    """
    # Create id_proceso: the number as a zero-padded, five-character string
    id_proceso = str(n_attempt).zfill(5)

    # Call function to webscrap
    result_list = scrap_juicios(driver, dep_judicial, year, id_proceso, delay=2)

    # Convert results to pandas: build every one-row frame first and
    # concatenate once, instead of growing a DataFrame inside the loop.
    frames = [pd.DataFrame(instancia, index=[0]) for instancia in result_list]
    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

A function to start the driver and collect all the information

In [None]:
def obtener_datos(dflistos, dependencia):
    """Scrape the processes of one dependency, resuming after ``dflistos``.

    Starting point: with an empty ``dflistos`` the scrape begins at year 2014,
    number 1. Otherwise the last row's ``id_proceso`` is parsed (characters
    5-8 are the year, the digits from character 9 onwards are the number) and
    the scrape resumes at the following number of that year.

    For every year up to and including 2020 the numbers are tried in
    ascending order until the scraper reports 'No existe este proceso', at
    which point the next year starts again from number 1.

    Args:
        dflistos: DataFrame with the rows already collected. When it is not
            empty it must have a string column ``id_proceso``.
        dependencia: Dependency code, forwarded to ``juicios``.

    Returns:
        dict with two keys:
            'estado': True when every year was completed, False when the
                run was cut short by ``ElementNotInteractableException``.
            'df': DataFrame with the rows collected during this call only
                (empty when nothing was collected).
    """
    # 1 - Figure out last id of proceso: If it is the first iteration, start from 2014, and 1
    if dflistos.shape[0] == 0:
        yr_last = 2014
        num_last = 1

    else:
        # Positional access: do not rely on the index labels being 0..n-1
        last_proceso = dflistos['id_proceso'].iloc[-1]
        yr_last = int(last_proceso[5:9])
        # Raw string: '\D' in a plain literal is an invalid escape sequence
        num_last = int(re.sub(r'\D', '', last_proceso[9:])) + 1

    # 2 - Run Webscraper
    # Define options for browser
    options = webdriver.FirefoxOptions()
    options.headless = True  # do not show browser window
    options.page_load_strategy = 'none'  # do not wait for the page to be loaded
    options.set_preference("general.useragent.override", UserAgent().random)

    # Start Driver
    # NOTE(review): `executable_path` and `options.headless` are tied to the
    # installed Selenium version - confirm before upgrading Selenium.
    gecko_path = Path.home()/'Documents/geckodriver.exe'
    driver = webdriver.Firefox(executable_path=gecko_path, options=options)
    url = 'http://consultas.funcionjudicial.gob.ec/informacionjudicial/public/informacion.jsf'

    # Frames collected in this call; concatenated once when returning
    frames = []

    def _resultado(estado):
        """Build the return value from the frames collected so far."""
        df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
        return {'estado': estado, 'df': df}

    try:
        driver.get(url)

        # Loop over years
        for year in range(yr_last, 2021):

            # Only the year being resumed starts at `num_last`; every later
            # year must start from 1 or its first processes would be skipped.
            first_number = num_last if year == yr_last else 1

            # Loop over possible trials
            for n_attempt in range(first_number, 99999+1):

                try:
                    # Scrap the data
                    results_df = juicios(driver, dependencia, str(year), n_attempt)

                except ElementNotInteractableException:
                    # If we cannot get the data, return the result up to that point
                    print('El proceso se interrumpio, seguir corriendo')
                    return _resultado(False)

                # Check if id_proceso exists; the first missing one ends the year
                if 'No existe este proceso' in results_df.causa[0]:
                    break

                frames.append(results_df)

    finally:
        # Always shut the browser session down (success, interruption or
        # unexpected error) so no browser/driver process is left behind.
        driver.quit()

    # If all works good
    return _resultado(True)

***
### Loop until the data collection is finished

In [None]:
# Read the dependency codes; the CSV has no header row, so the column is
# named 'id_dependencia' here.
db_depen = pd.read_csv(
    db/'data/raw/codigos_dependencias_satje.csv',
    header=None,
    names=['id_dependencia'],
)

# The dependency code has 5 digits. Convert every code to a string and
# prepend a zero to the ones that are only 4 characters long.
db_depen['id_dependencia'] = [
    "0" + str(code) if len(str(code)) == 4 else str(code)
    for code in db_depen['id_dependencia']
]
db_depen = db_depen.sort_values(by='id_dependencia').reset_index(drop=True)

# Codes starting with '09' (kept in `guayas`; presumably the Guayas
# province prefix - confirm against the code list)
guayas = [code for code in db_depen['id_dependencia'] if code.startswith('09')]

In [None]:
# Load dataset
# Work on the second entry of the `guayas` list
depnumber = guayas[1]
try:
    # Rows saved for this dependency by an earlier run, if the file exists
    procesos_listos = pd.read_excel(db/f'data/raw/resumen_{depnumber}.xls')
except FileNotFoundError:
    # No saved file yet: start from an empty table
    procesos_listos = pd.DataFrame()


In [None]:
# Run the scraper until it reports completion, saving after every attempt.
# `dfrestantes` accumulates everything collected so far: each attempt resumes
# from its last row, and its new rows are appended to it. (Appending to the
# original `procesos_listos` instead would drop the rows of earlier attempts.)
dfrestantes = procesos_listos

while True:
    res = obtener_datos(dfrestantes, depnumber)
    dfrestantes = pd.concat([dfrestantes, res['df']], ignore_index=True)

    # Save on every attempt so an interruption never loses collected rows
    dfrestantes.to_excel(db/f'data/raw/resumen_{depnumber}.xls', index=False)

    # 'estado' is True only when the whole range was scraped
    if res['estado']:
        break

print('Finally!!!')
