In [277]:
#1. CREATE EXCEL FILE CALLED "TBMC_ITEMS_TO_MONITOR.xlsx" AS BASE SOURCE OF DATA FOR PRICE TO MONITOR, URL AND OTHER DATA.
#2. DEVELOP THE APP
#2.X SAVE/UPDATE THE RESULT TO AN EXCEL FILE CALLED "PRICE_RECORDINGS.xlsx"
#3. USE AIRFLOW TO SCHEDULE THE TASK

In [278]:
import pandas as pd
import time
import os
from datetime import datetime
from openpyxl import load_workbook
from bs4 import BeautifulSoup
import requests

In [279]:
pd.__version__

'1.3.4'

In [280]:
#2. DEVELOP THE APP
#2.1 create necessary variables, flags & dataframe for scrapped data "scrapped_data_df"
file_path = "TBMC_ITEMS_TO_MONITOR.xlsx"
columns = ['description', 'type', 'url', 'price', 'date_stamp']
scrapped_data_df = pd.DataFrame(columns=columns)

In [281]:
#2.2 xlsx to dict - initialize the process by storing the required items (products to monitor) in dictionary called "products"
def read_excel_file(file_path):
    try:
        df = pd.read_excel(file_path)
    except FileNotFoundError:
        print("File not found.")
        return {}
    
    expected_headers = ["description", "type", "url", "identifier"]
    actual_headers = df.columns.tolist()
    
    if actual_headers != expected_headers:
        print("Headers are not as expected.")
        return {}
    
    products = []
    for index, row in df.iterrows():
        description = row["description"]
        product_type = row["type"]
        url = row["url"]
        identifier = row["identifier"]
        
        product = {
            "description": description,
            "type": product_type,
            "url": url,
            "identifier": identifier 
        }
        
        products.append(product)
    
    return products

In [None]:
def price_checker(url, identifier):
    # Initialize variables to store price and status
    price = None
    status = None
    
    try:
        # Send a GET request to the URL
        response = requests.get(url)
        
        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find the element with the specified identifier
            element = soup.find(id=identifier)
            
            if element:
                # Extract the price from the element (assuming it's text)
                price = element.text.strip()
                status = 'Success'
            else:
                status = f'Element with identifier "{identifier}" not found'
        else:
            status = f'Request failed with status code {response.status_code}'
            
    except requests.RequestException as e:
        status = f'Request failed: {str(e)}'
    
    return {'price': price, 'status': status}

In [282]:
# #2.3 main_processor - loop through the dict, perform scrapping, and add records to scrapped_data_df
def core_func(products):
    # columns = ["description", "type", "url", "price", "time_stamp", "status"]
    columns = ["description", "type", "url", "identifier", "price", "time_stamp"]
    scrapped_data_df = pd.DataFrame(columns=columns)

    
    
    for i in products:
        #Generate "price", "time_stamp" and "status" here.!!!
        current_time_stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        price = 220 #sample only
        
        
        new_row = [
            i['description'],
            i['type'],
            i['url'],
            i['identifier'],
            price,
            current_time_stamp
        ]

        scrapped_data_df.loc[len(scrapped_data_df)] = new_row
        
    print(scrapped_data_df.dtypes)
    return scrapped_data_df

In [283]:
#2.4 append (or create PRICE_RECORDINGS.xlsx) using the dataframe
def update_price_recordings(scrapped_data_df):
    file_name = "PRICE_RECORDINGS_NEW1.xlsx"
    sheet_name = 'Sheet1'
    
    # # Convert columns to appropriate types
    # scrapped_data_df['time_stamp'] = pd.to_datetime(scrapped_data_df['time_stamp'])
    # scrapped_data_df['url'] = scrapped_data_df['url'].astype(str)
    # scrapped_data_df['description'] = scrapped_data_df['description'].astype(str)
    # scrapped_data_df['type'] = scrapped_data_df['type'].astype('category')
    # # Add other conversions as needed
    
    # Check if the file exists
    if not os.path.exists(file_name):
        # If the file does not exist, create it and write the scrapped_data_df to it
        try:
            with pd.ExcelWriter(file_name, engine='openpyxl') as writer:
                scrapped_data_df.to_excel(writer, index=False, sheet_name=sheet_name)
        except Exception as e:
            print(f"Error creating file: {e}")
            return False
    else:
        try:
            # If the file exists, load it and append the data
            book = load_workbook(file_name)
            
            # Ensure at least one sheet is visible
            if all(sheet.sheet_state == 'hidden' for sheet in book.worksheets):
                book.create_sheet(title=sheet_name)
            
            with pd.ExcelWriter(file_name, engine='openpyxl') as writer:
                writer.book = book
                writer.sheets = {ws.title: ws for ws in book.worksheets}
                
                # Check if the specified sheet exists
                if sheet_name in writer.sheets:
                    # Get the last row in the existing Excel sheet
                    startrow = writer.sheets[sheet_name].max_row
                else:
                    startrow = 0
                
                # Append the data
                scrapped_data_df.to_excel(writer, startrow=startrow, index=False, header=startrow==0, sheet_name=sheet_name)
                
                # Save and close the file
                writer.save()
        except Exception as e:
            print(f"Error appending data: {e}")
            return False
    
    return True

In [284]:
#2.5 create main function 
def main():
    # # 2.1 create necessary variables, flags & dataframe for scrapped data "scrapped_data_df"
    # file_path = "TBMC_ITEMS_TO_MONITOR.xlsx"
    # columns = ['description', 'type', 'url', 'price', 'date_stamp']
 

    # scrapped_data_df = pd.DataFrame(columns=columns)

    # 2.2 xlsx to dict - initialize the process by storing the required items (products to monitor) in dictionary called "products"
    products = read_excel_file(file_path)

    # 2.3 main_processor - loop through the dict, perform scrapping, and add records to scrapped_data_df
    scrapped_data_df = core_func(products)

    # 2.4 append (or create PRICE_RECORDINGS.xlsx) using the dataframe
    update_price_recordings(scrapped_data_df)

if __name__ == "__main__":
    main()

description    object
type           object
url            object
identifier     object
price          object
time_stamp     object
dtype: object


In [285]:
#3. USE AIRFLOW TO SCHEDULE THE TASK

#OTHER NOTES TO CONSIDER
# If getting the data on the url has failed, handle it.


In [286]:
# The warning about unreadable content in Excel files typically occurs when Excel detects something unexpected or inconsistent in the file structure. This can happen due to a few reasons:

# Data Type Mismatch: Excel expects data types to be consistent within columns. If your DataFrame contains mixed data types in a column (e.g., numbers and text mixed), Excel might flag this as unreadable content.

# File Corruption: Errors during file writing or closing can sometimes corrupt the file, leading to unreadable content warnings.