# Web Scraping and Introductory Data Analysis
// purpose of assignment


// summary of what we're going to do

In [None]:
%pip install -r ./../requirements.txt

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import bs4
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
# Transaction-list page that the scraper's browser session opens first.
ETHERESCAN_URL = "https://etherscan.io/txs"
# Block-depth limit handed to EthereumScraping as ``number_block``.
NUMBER_BLOCKS = 3

In [None]:
class EthereumScraping:
    """Scrape the transaction table at ``url`` into a pandas DataFrame.

    A Chrome WebDriver session opens ``url``, parses the transaction table
    on the page, and keeps clicking the "Next" pagination link until the
    newest block seen on the current page is at least ``number_block``
    blocks behind the newest block seen on the first page.

    Attributes:
        url: Address of the first page to load.
        columns: Column labels of the resulting DataFrame, in the order the
            values are read from each table row.
        number_block: Block-depth limit that stops the pagination loop.
        driver: Active WebDriver while scraping, otherwise ``None``.
        df: Accumulated transactions; successive ``get_data`` calls append.
    """

    url = ETHERESCAN_URL
    columns = ['tnx_hash', 'method', 'block', 'date', 'from', 'to', 'value', 'tnx_fee']
    # Row parsing reads <td> indices up to 11, so a usable row needs 12 cells.
    _MIN_CELLS = 12

    def __init__(self, number_block: int = 10) -> None:
        """Store the block-depth limit and start with an empty result frame."""
        self.number_block = number_block
        self.driver = None
        self.df = pd.DataFrame(columns=self.columns)

    def __del__(self) -> None:
        """Safety net: close the browser if it is still open."""
        self._quit_driver()

    def _quit_driver(self) -> None:
        """Quit the WebDriver, if any, and reset ``self.driver`` to ``None``."""
        # getattr: __del__ may run on an instance whose __init__ never
        # reached the ``self.driver`` assignment.
        driver = getattr(self, "driver", None)
        if driver is not None:
            self.driver = None
            driver.quit()

    def _get_data_from_td_tag(self, element: bs4.element.Tag) -> str:
        """Return the last path segment of the first link inside ``element``.

        Falls back to the cell's stripped text when the cell holds no link
        (or the link has no ``href``) instead of raising ``AttributeError``.
        """
        anchor = element.find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            return element.text.strip()
        return href.split('/')[-1]

    def _collect_data_from_tr_tag(
        self, elements: bs4.element.ResultSet
    ) -> "tuple[pd.Series, int]":
        """Convert the ``<td>`` cells of one table row into a record.

        Args:
            elements: The row's cells; indices 1, 2, 3, 4, 7, 9, 10 and 11
                are read, so at least 12 cells are required.

        Returns:
            A ``(series, block)`` pair: the row as a Series labelled with
            ``columns`` and the row's block number as an ``int``.

        Raises:
            IndexError: If fewer than 12 cells are supplied.
            ValueError: If the block cell does not contain an integer.
        """
        block_text = elements[3].text.strip()
        record = pd.Series(
            [
                elements[1].text.strip(),
                elements[2].text.strip(),
                block_text,
                elements[4].text.strip(),
                self._get_data_from_td_tag(elements[7]),
                self._get_data_from_td_tag(elements[9]),
                elements[10].text.strip(),
                elements[11].text.strip(),
            ],
            index=self.columns,
        )
        return record, int(block_text)

    def _extract_data_from_html(self, html_content: str) -> int:
        """Append every transaction row found in ``html_content`` to ``df``.

        Rows with fewer than 12 cells are skipped rather than crashing the
        whole scrape.

        Returns:
            The highest block number among the parsed rows, or ``0`` when
            no row could be parsed.
        """
        soup = BeautifulSoup(html_content, "html.parser")
        records = []
        block_number = 0
        for row in soup.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) < self._MIN_CELLS:
                continue
            series, block = self._collect_data_from_tr_tag(cells)
            block_number = max(block_number, block)
            records.append(series)

        if records:
            # One concat per page instead of one per row: re-copying the
            # accumulated frame for every row is quadratic.
            page = pd.DataFrame(records, columns=self.columns)
            if self.df.empty:
                # Skip concatenating with an empty frame.
                self.df = page
            else:
                self.df = pd.concat([self.df, page], ignore_index=True)

        return block_number

    def _extract_data_from_url(self) -> int:
        """Parse the table body of the page currently loaded in the driver.

        Returns:
            The highest block number found on the page (see
            ``_extract_data_from_html``).
        """
        table_body = self.driver.find_element(
            By.CSS_SELECTOR, "tbody.align-middle.text-nowrap"
        )
        return self._extract_data_from_html(table_body.get_attribute("outerHTML"))

    def _click_next_button(self) -> None:
        """Wait up to 10 seconds for the "Next" link to be clickable, then click.

        Raises:
            Exception: Whatever the wait or the click raised; the error is
                printed and re-raised unchanged.
        """
        try:
            WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "a[aria-label='Next']"))
            ).click()
        except Exception as e:
            print(f"Error clicking the 'Next' button: {e}")
            raise

    def _extract_data(self) -> None:
        """Run one full scraping session and accumulate the rows in ``df``.

        The browser is always closed when this method returns or raises, and
        ``self.driver`` is ``None`` afterwards.

        Raises:
            RuntimeError: If the first page yields no parsable transaction
                row, since the stop condition would then be meaningless.
        """
        # A previous run may have left a browser open; do not leak it.
        self._quit_driver()
        self.driver = webdriver.Chrome()
        try:
            self.driver.get(self.url)
            first_block = current_block = self._extract_data_from_url()
            if first_block == 0:
                raise RuntimeError(
                    f"No transaction rows could be parsed from {self.url}"
                )
            # Later pages list older transactions, so the gap grows as we page.
            while (first_block - current_block) < self.number_block:
                self._click_next_button()
                current_block = self._extract_data_from_url()
        finally:
            self._quit_driver()

    def get_data(self) -> pd.DataFrame:
        """Scrape the transactions and return the accumulated DataFrame."""
        self._extract_data()
        return self.df
        
      





In [None]:
# Build a scraper configured with NUMBER_BLOCKS and run it; get_data() drives
# the browser session and returns the accumulated transactions DataFrame.
scripEther = EthereumScraping(NUMBER_BLOCKS)
df = scripEther.get_data()


In [None]:
# Show the scraped transactions DataFrame as this cell's output.
df