In [1]:
import json
import os
import re
import time
from datetime import date, datetime

import django
import pandas as pd
from django.conf import settings
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Tell Django which settings module to use; this must be set before django.setup() runs.
os.environ['DJANGO_SETTINGS_MODULE'] = 'Homebase.settings'
# Initialize Django for standalone (non-manage.py) use inside this notebook.
django.setup()

In [2]:
def find_element_with_wait(by=None, value=None, web_driver=None, timeout=10):
    """Wait until one element matching the locator is present in the DOM and return it.

    Parameters
    ----------
    by : str, optional
        Locator strategy (e.g. ``By.TAG_NAME``).
    value : str, optional
        Locator value interpreted according to ``by``.
    web_driver : WebDriver, optional
        Browser session to wait on. When omitted, the notebook-level global
        ``driver`` is used, as before. No visible cell defines that global
        (the session created below lives at ``bs.driver``), so pass
        ``web_driver=bs.driver`` when calling this helper from this notebook.
    timeout : int or float, optional
        Maximum number of seconds to wait. Defaults to 10.

    Returns
    -------
    WebElement
        The first element matching the locator.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If no matching element is present within ``timeout`` seconds.
    NameError
        If ``web_driver`` is omitted and no global ``driver`` exists.
    """
    # Only touch the global when no explicit session was supplied.
    session = driver if web_driver is None else web_driver
    return WebDriverWait(session, timeout).until(
        EC.presence_of_element_located((by, value))
    )

def find_elements_with_wait(by=None, value=None, web_driver=None, timeout=10):
    """Wait until at least one element matching the locator is present and return all matches.

    Parameters
    ----------
    by : str, optional
        Locator strategy (e.g. ``By.XPATH``).
    value : str, optional
        Locator value interpreted according to ``by``.
    web_driver : WebDriver, optional
        Browser session to wait on. When omitted, the notebook-level global
        ``driver`` is used, as before. No visible cell defines that global
        (the session created below lives at ``bs.driver``), so pass
        ``web_driver=bs.driver`` when calling this helper from this notebook.
    timeout : int or float, optional
        Maximum number of seconds to wait. Defaults to 10.

    Returns
    -------
    list of WebElement
        Every element matching the locator once at least one is present.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If no matching element is present within ``timeout`` seconds.
    NameError
        If ``web_driver`` is omitted and no global ``driver`` exists.
    """
    # Only touch the global when no explicit session was supplied.
    session = driver if web_driver is None else web_driver
    # Selenium's plural condition is named presence_of_ALL_elements_located;
    # "presence_of_elements_located" does not exist and raised AttributeError.
    return WebDriverWait(session, timeout).until(
        EC.presence_of_all_elements_located((by, value))
    )

In [3]:
# Three-letter Spanish month abbreviations mapped to their calendar month
# number, inserted from December (12) down to January (1).
month_translation_dict = dict(zip(
    ("DIC", "NOV", "OCT", "SEP", "AGO", "JUL", "JUN", "MAY", "ABR", "MAR", "FEB", "ENE"),
    range(12, 0, -1),
))

In [4]:
# Parse the JSON credentials file from the working directory ...
with open("credentials.conf") as conf_file:
    credentials = json.loads(conf_file.read())

# ... and publish the parsed object on Django's settings as CREDENTIALS
settings.CREDENTIALS = credentials


In [5]:
# Import the project's BBVA scraper and create the instance (`bs`) used by all later cells.
# NOTE(review): this import sits below the credentials cell; presumably the scraper
# depends on settings.CREDENTIALS being set first — confirm before moving it to the
# top import cell.
from models.BBVA_scraper import BBVAScraper
bs = BBVAScraper()

In [6]:
# Create the browser session and store it on the scraper instance.
# NOTE(review): the positional False is presumably a "headless" flag — confirm
# against BBVAScraper.get_driver.
bs.driver = bs.get_driver(False)



Current google-chrome version is 114.0.5735
Get LATEST chromedriver version for 114.0.5735 google-chrome
Driver [/Users/eliasmattson/.wdm/drivers/chromedriver/mac64/114.0.5735.90/chromedriver] found in cache


In [7]:
# Load the BBVA public home page in the scraper's browser session.
bs.driver.get("https://www.bbva.es/")

In [8]:
# Run the scraper's login routine.
# NOTE(review): presumably it uses settings.CREDENTIALS loaded above — confirm in BBVAScraper.log_in.
bs.log_in()

In [9]:
# Dismiss the modal dialog (presumably one shown after login — confirm) so the
# navigation to the account movements in the next cell is not blocked.
bs.close_modal()

In [14]:
# Keep only the part of the current URL that precedes the '#' fragment,
# then load the same page with the '#cuentas/1/ficha' fragment appended.
start = bs.driver.current_url.partition("#")[0]
bs.driver.get(f"{start}#cuentas/1/ficha")

In [15]:
# Look up a <tbody> element through the scraper's waiting helper; the next cell
# searches inside it for the movement rows.
tbody = bs.find_element_with_wait(by=By.TAG_NAME, value="tbody")

In [16]:
# Collect every descendant <tr role="row"> of the table body; each one is
# treated as a single movement by the parsing helpers below.
movement_elements = tbody.find_elements(by=By.XPATH, value=".//tr[@role='row']")

In [17]:
def get_date_from_mov_element(mov_element):
    """Build the date shown in a movement row.

    The date is read from the ``<b>`` element inside
    ``div[@class='contieneFechas']``: the first ``<span>`` holds the day, an
    optional second ``<span>`` holds the year, and the month appears as an
    upper-case Spanish abbreviation (a key of ``month_translation_dict``)
    somewhere in the ``<b>`` element's inner HTML.

    Parameters
    ----------
    mov_element : WebElement
        Row element for a single movement.

    Returns
    -------
    datetime.date
        The movement date. When the row shows no year, the current year is used.

    Raises
    ------
    ValueError
        If no known month abbreviation is found, or the day/year text is not
        an integer.
    IndexError
        If the date element contains no ``<span>`` at all.
    """
    date_el = mov_element.find_element(by=By.XPATH, value=".//div[@class='contieneFechas']")\
        .find_element(by=By.TAG_NAME, value="b")

    span_elements = date_el.find_elements(by=By.TAG_NAME, value="span")
    day = int(span_elements[0].get_attribute("innerHTML").replace(" ", ""))

    if len(span_elements) > 1:
        year = int(span_elements[1].get_attribute("innerHTML").replace(" ", ""))
    else:
        # Rows without an explicit year are taken to belong to the current year.
        year = datetime.today().year

    # Search for the known abbreviations themselves. The previous pattern
    # "[A-Z]{3}." used an unescaped dot, so it also captured whatever character
    # followed the month (space, "<", ...), which made the dictionary lookup
    # return None and date() fail with an opaque TypeError.
    date_html = date_el.get_attribute("innerHTML")
    month_pattern = "|".join(re.escape(abbr) for abbr in month_translation_dict)
    month_match = re.search(month_pattern, date_html)
    if month_match is None:
        raise ValueError(f"No known month abbreviation found in {date_html!r}")
    month = month_translation_dict[month_match.group(0)]

    return date(year, month, day)

In [18]:
def get_category_from_mov_element(mov_element):
    """Return the category label of a movement row.

    Locates the first ``td[@role='gridcell']`` below ``mov_element``, then the
    first ``i[@role='img']`` inside that cell, and returns the value of the
    icon's ``aria-label`` attribute.
    """
    grid_cell = mov_element.find_element(by=By.XPATH, value=".//td[@role='gridcell']")
    icon = grid_cell.find_element(by=By.XPATH, value=".//i[@role='img']")
    return icon.get_attribute("aria-label")

In [19]:
def get_amount_from_mov_element(mov_element):
    """Return the amount text of a movement row as a string.

    Reads the inner HTML of the first ``span[@class='mensaje']`` below
    ``mov_element`` and strips every space and euro sign from it. The result
    is not converted to a number.
    """
    amount_span = mov_element.find_element(by=By.XPATH, value=".//span[@class='mensaje']")
    amount_text = amount_span.get_attribute("innerHTML")
    for unwanted in (" ", "€"):
        amount_text = amount_text.replace(unwanted, "")
    return amount_text

In [20]:
def get_data_from_mov_element(mov_element):
    """Gather the date, category and amount of one movement row into a dict.

    Returns a dictionary with the keys ``"date"`` (the movement date converted
    with ``str``), ``"category"`` and ``"amount"``, each obtained from the
    corresponding ``get_*_from_mov_element`` helper.
    """
    # Evaluate the helpers in the same order as before: date, category, amount.
    movement_date = get_date_from_mov_element(mov_element)
    movement_category = get_category_from_mov_element(mov_element)
    movement_amount = get_amount_from_mov_element(mov_element)

    return dict(
        date=str(movement_date),
        category=movement_category,
        amount=movement_amount,
    )

In [21]:
# Column order of the rows collected below
columns = ["date", "category", "amount"]

# One [date, category, amount] row per movement element found on the page
data = []
for mov in movement_elements:
    mov_data = get_data_from_mov_element(mov)
    data.append([mov_data.get(column) for column in columns])

In [22]:
# Build the movements table: one row per scraped movement, with the columns named in `columns`.
df = pd.DataFrame(data, columns=columns)