In [25]:
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver

# Build the Chrome options for a headless browser session.
# The flags are applied in the order listed.
chrome_options = webdriver.ChromeOptions()
for _flag in (
    "--headless",
    "--window-size=1920,1080",
    "--disable-gpu",
    "--disable-dev-shm-usage",
    '--ignore-certificate-errors',
    '--allow-running-insecure-content',
    "--disable-extensions",
    "--proxy-bypass-list=*",
    "--start-maximized",
    '--disable-dev-shm-usage',
    "--proxy-server='direct://'",
    '--no-sandbox',
    "--disable-software-rasterizer",
):
    chrome_options.add_argument(_flag)

# Point Chrome's download folder at the project directory.
download_directory = "/Users/germankosenkov/Code projects/Crawling/Crawling Malaysian Data"
chrome_options.add_experimental_option(
    "prefs",
    {"download.default_directory": download_directory},
)

driver = webdriver.Chrome(options=chrome_options)

try:
    # Load the page that holds the index table.
    url = "https://en.stockq.org/index/BCTI.php"
    driver.get(url)

    # Hand the rendered HTML to BeautifulSoup.
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')

    # The table is looked up inside the fifth <center> element.
    table = soup.select_one('center:nth-of-type(5) table')

    if not table:
        print("Table not found")
    else:
        # One list of stripped <td> texts per <tr>.
        rows = table.find_all('tr')
        data = [
            [cell.get_text(strip=True) for cell in row.find_all('td')]
            for row in rows
        ]

        # The page lays out two (Date, Index, Change) triples side by side;
        # the row labelled 0 is discarded.
        df = pd.DataFrame(data)
        df.columns = ['Date', 'Index', 'Change', 'Date', 'Index', 'Change']
        df = df.drop(0)

        # Stack the right-hand triple underneath the left-hand one.
        left_side = df.iloc[:, :3]
        right_side = df.iloc[:, 3:]
        df_stacked = pd.concat(
            [left_side, right_side], ignore_index=True
        ).drop(columns=['Change'])

        # Convert the text columns; unparseable values become NaT / NaN.
        df_stacked = df_stacked.assign(
            Date=pd.to_datetime(df_stacked['Date'], errors='coerce'),
            Index=pd.to_numeric(df_stacked['Index'], errors='coerce').round(2),
        )

        print(df_stacked)

except Exception as e:
    print("Error:", e)
finally:
    # Always shut the browser down, even when scraping failed.
    driver.quit()


         Date  Index
0  2024-08-15  619.0
1  2024-08-14  615.0
2  2024-08-13  599.0
3  2024-08-12  605.0
4  2024-08-09  622.0
5  2024-08-08  647.0
6  2024-08-07  665.0
7  2024-08-06  693.0
8  2024-08-05  724.0
9  2024-08-02  755.0
10 2024-08-01  760.0
11 2024-07-31  778.0
12 2024-07-30  796.0
13 2024-07-29  806.0
14 2024-07-26  814.0
15 2024-07-25  814.0
16 2024-07-24  816.0
17 2024-07-23  817.0
18 2024-07-22  822.0
19 2024-07-19  822.0


In [26]:
from sshtunnel import SSHTunnelForwarder
import pandas as pd
import pymysql
from datetime import datetime, timedelta
import traceback
from dateutil import relativedelta
import json as json 


# Connection settings for the SSH hop (b_ssh_*) and the MySQL server reached
# through it (b_sql_*). A later cell passes them to SSHTunnelForwarder and
# pymysql.connect.
# NOTE(review): every value below is the bare name `X`, which is not defined
# anywhere in the visible notebook, so this cell raises NameError as written.
# These look like redacted placeholders - supply the real values before running,
# preferably from environment variables or a secrets store rather than literals.
b_ssh_host = X
b_ssh_user = X
b_ssh_port = X
b_ssh_private_key = X
b_sql_hostname = X
b_sql_username = X
b_sql_password = X
b_sql_database = X
b_sql_port = X

def query_data(ssh_host, ssh_user, ssh_port, ssh_private_key, sql_hostname, sql_username, sql_password, sql_database, sql_port, query):
    """Run a SQL query against MySQL through an SSH tunnel and return a DataFrame.

    Parameters
    ----------
    ssh_host, ssh_port : address of the SSH server the tunnel is opened to.
    ssh_user, ssh_private_key : SSH login name and private key for that server.
    sql_hostname, sql_port : MySQL address as seen from the SSH server; used as
        the tunnel's remote bind address.
    sql_username, sql_password, sql_database : MySQL credentials and schema.
    query : SQL text handed to ``pandas.read_sql_query``.

    Returns
    -------
    pandas.DataFrame
        The result set of ``query``.

    Any exception raised while opening the tunnel, connecting, or running the
    query propagates to the caller; the connection and the tunnel are closed
    in every case.
    """
    with SSHTunnelForwarder(
            (ssh_host, ssh_port),
            ssh_username=ssh_user,
            ssh_pkey=ssh_private_key,
            remote_bind_address=(sql_hostname, sql_port)) as tunnel:
        conn = pymysql.connect(
            # The tunnel listens on a local port, so the client must connect to
            # the local machine (same address the insert cell in this file uses).
            host='127.0.0.1',
            user=sql_username,
            passwd=sql_password,
            db=sql_database,
            port=tunnel.local_bind_port
        )
        try:
            data = pd.read_sql_query(query, conn)
        finally:
            # Close the connection even when the query fails, so it is not leaked.
            conn.close()
    return data

In [27]:
import logging
from pymysql import IntegrityError, OperationalError
from sshtunnel import SSHTunnelForwarder
import pymysql

# Configure logging
logger = logging.getLogger(__name__)
# The logger's own threshold is INFO, so DEBUG records are dropped here before
# any handler sees them.
logger.setLevel(logging.INFO)

# logging.getLogger returns the same logger object each time this cell is run.
# Detach handlers left over from a previous run; otherwise every message would
# be written once per execution of the cell.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

# File handler: accepts everything the logger lets through (its DEBUG level is
# lower than the logger's INFO threshold, so in effect INFO and above).
file_handler = logging.FileHandler('debug.log')
file_handler.setLevel(logging.DEBUG)
file_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(file_formatter)

# Console handler to log only errors or higher
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.ERROR)
console_formatter = logging.Formatter('%(levelname)s - %(message)s')
console_handler.setFormatter(console_formatter)

# Add handlers to the logger
logger.addHandler(file_handler)
logger.addHandler(console_handler)

# Number of rows sent per executemany() call
BATCH_SIZE = 1000  # Adjust the batch size based on your needs

# Constant values written to the source_id / index_name columns of every row
source_id = 3
index_name = 'BCTI'

def chunker(seq, size):
    """Yield consecutive slices of ``seq``, each holding at most ``size`` items.

    The final slice is shorter when ``len(seq)`` is not a multiple of ``size``.
    """
    starts = range(0, len(seq), size)
    yield from (seq[begin:begin + size] for begin in starts)

# Insert the scraped rows (df_stacked) into the `freight` table through an SSH
# tunnel, committing one batch of BATCH_SIZE rows at a time.
try:
    with SSHTunnelForwarder(
            (b_ssh_host, b_ssh_port),
            ssh_username=b_ssh_user,
            ssh_pkey=b_ssh_private_key,
            remote_bind_address=(b_sql_hostname, b_sql_port)) as tunnel:

        logger.info("SSH Tunnel established successfully.")

        # Bound before the try block so the finally clause can tell whether
        # connect() succeeded; closing an unbound name would raise NameError
        # and be misreported below as an SSH tunnel failure.
        b_conn = None
        try:
            b_conn = pymysql.connect(
                host='127.0.0.1',
                user=b_sql_username,
                passwd=b_sql_password,
                db=b_sql_database,
                port=tunnel.local_bind_port
            )
            logger.info("Database connection established successfully.")
            b_cursor = b_conn.cursor()

            try:
                # INSERT IGNORE: rows that would violate a unique key are
                # skipped by MySQL instead of aborting the statement.
                inserting_query = '''INSERT IGNORE INTO freight 
                                     (date, index_size, index_name, source_id) 
                                     VALUES (%s, %s, %s, %s)'''

                # Prepare the list of values: one (date, index, name, source) tuple per row
                values = [
                    (
                        item['Date'], item['Index'], index_name, source_id
                    )
                    for _, item in df_stacked.iterrows()
                ]

                # Ceiling division, so an exact multiple of BATCH_SIZE is not
                # reported as having one extra batch.
                total_batches = (len(values) + BATCH_SIZE - 1) // BATCH_SIZE

                # Insert in batches, committing after each one
                for i, chunk in enumerate(chunker(values, BATCH_SIZE)):
                    logger.info(f"Inserting batch {i + 1} of {total_batches}")
                    b_cursor.executemany(inserting_query, chunk)
                    b_conn.commit()
                    logger.info(f"Batch {i + 1} committed successfully.")

            except IntegrityError as ie:
                logger.error(f"Integrity error occurred: {ie}")
                b_conn.rollback()
                logger.info("Transaction rolled back due to IntegrityError.")

            except Exception as e:
                logger.error(f"An unexpected error occurred during query execution: {e}")
                b_conn.rollback()
                logger.info("Transaction rolled back due to an unexpected error.")

            finally:
                b_cursor.close()
                logger.info("Cursor closed.")

        except OperationalError as oe:
            logger.error(f"Operational error occurred: {oe}")

        finally:
            # Only close what was actually opened.
            if b_conn is not None:
                b_conn.close()
                logger.info("Database connection closed.")

except Exception as e:
    logger.critical(f"Critical error in establishing SSH Tunnel: {e}")
