In [45]:
#1. CREATE EXCEL FILE CALLED "TBMC_ITEMS_TO_MONITOR.xlsx" AS BASE SOURCE OF DATA FOR PRICE TO MONITOR, URL AND OTHER DATA.
#2. DEVELOP THE APP
#2.X SAVE/UPDATE THE RESULT TO AN EXCEL FILE CALLED "PRICE_RECORDINGS.xlsx"
#3. USE AIRFLOW TO SCHEDULE THE TASK

In [46]:
import pandas as pd
import time
import os
from datetime import datetime
from openpyxl import load_workbook

In [47]:
pd.__version__

'2.2.2'

In [48]:
#2. DEVELOP THE APP
#2.1 create necessary variables, flags & dataframe for scrapped data "scrapped_data_df"
file_path = "TBMC_ITEMS_TO_MONITOR.xlsx"
columns = ['description', 'type', 'url', 'price', 'date_stamp']
scrapped_data_df = pd.DataFrame(columns=columns)

In [49]:
#2.2 xlsx to dict - initialize the process by storing the required items (products to monitor) in dictionary called "products"
def read_excel_file(file_path):
    try:
        df = pd.read_excel(file_path)
    except FileNotFoundError:
        print("File not found.")
        return {}
    
    expected_headers = ["description", "type", "url"]
    actual_headers = df.columns.tolist()
    
    if actual_headers != expected_headers:
        print("Headers are not as expected.")
        return {}
    
    products = []
    for index, row in df.iterrows():
        description = row["description"]
        product_type = row["type"]
        url = row["url"]
        
        product = {
            "description": description,
            "type": product_type,
            "url": url
        }
        
        products.append(product)
    
    return products

In [50]:
# #2.3 main_processor - loop through the dict, perform scrapping, and add records to scrapped_data_df
def core_func(products):
    # columns = ["description", "type", "url", "price", "time_stamp", "status"]
    columns = ["description", "type", "url", "time_stamp"]
    scrapped_data_df = pd.DataFrame(columns=columns)

    
    
    for i in products:
        #Generate "price", "time_stamp" and "status" here.!!!
        current_time_stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        new_row = [
            i['description'],
            i['type'],
            i['url'],
            current_time_stamp
        ]

        scrapped_data_df.loc[len(scrapped_data_df)] = new_row
    
    return scrapped_data_df

In [51]:
#2.4 append (or create PRICE_RECORDINGS.xlsx) using the dataframe
# def update_price_recordings(scrapped_data_df):
#     file_name="PRICE_RECORDINGS4.xlsx"
#     sheet_name = 'Sheet1'
    
#     # Check if the file exists
#     if not os.path.exists(file_name):
#         # If the file does not exist, create it and write the scrapped_data_df to it
#         with pd.ExcelWriter(file_name, engine='openpyxl') as writer:
#             scrapped_data_df.to_excel(writer, index=False, sheet_name=sheet_name)
#     else:
#         # If the file exists, load it and append the data
#         book = load_workbook(file_name)
#         writer = pd.ExcelWriter(file_name, engine='openpyxl')
#         writer.book = book
        
#         # Check if the specified sheet exists
#         if sheet_name in book.sheetnames:
#             # Get the last row in the existing Excel sheet
#             startrow = book[sheet_name].max_row
#         else:
#             startrow = 0
        
#         # Append the data
#         scrapped_data_df.to_excel(writer, startrow=startrow, index=False, header=startrow==0, sheet_name=sheet_name)
        
#         # Save and close the file
#         writer.close()

def update_price_recordings(scrapped_data_df):
    file_name = "PRICE_RECORDINGS6.xlsx"
    sheet_name = 'Sheet1'
    
    # Check if the file exists
    if not os.path.exists(file_name):
        # If the file does not exist, create it and write the scrapped_data_df to it
        with pd.ExcelWriter(file_name, engine='openpyxl') as writer:
            scrapped_data_df.to_excel(writer, index=False, sheet_name=sheet_name)
    else:
        # If the file exists, load it and append the data
        book = load_workbook(file_name)
        
        # Ensure at least one sheet is visible
        if all(sheet.sheet_state == 'hidden' for sheet in book.worksheets):
            book.create_sheet(title=sheet_name)
        
        with pd.ExcelWriter(file_name, engine='openpyxl') as writer:
            writer.book = book
            writer.sheets = {ws.title: ws for ws in book.worksheets}
            
            # Check if the specified sheet exists
            if sheet_name in writer.sheets:
                # Get the last row in the existing Excel sheet
                startrow = writer.sheets[sheet_name].max_row
            else:
                startrow = 0
            
            # Append the data
            scrapped_data_df.to_excel(writer, startrow=startrow, index=False, header=startrow==0, sheet_name=sheet_name)
            
            # Save and close the file
            writer.save()

In [52]:
#2.5 create main function 
def main():
    # # 2.1 create necessary variables, flags & dataframe for scrapped data "scrapped_data_df"
    # file_path = "TBMC_ITEMS_TO_MONITOR.xlsx"
    # columns = ['description', 'type', 'url', 'price', 'date_stamp']
 

    # scrapped_data_df = pd.DataFrame(columns=columns)

    # 2.2 xlsx to dict - initialize the process by storing the required items (products to monitor) in dictionary called "products"
    products = read_excel_file(file_path)

    # 2.3 main_processor - loop through the dict, perform scrapping, and add records to scrapped_data_df
    scrapped_data_df = core_func(products)

    # 2.4 append (or create PRICE_RECORDINGS.xlsx) using the dataframe
    update_price_recordings(scrapped_data_df)

if __name__ == "__main__":
    main()

BadZipFile: File is not a zip file

In [86]:
#3. USE AIRFLOW TO SCHEDULE THE TASK

#OTHER NOTES TO CONSIDER
# If getting the data on the url has failed, handle it.


In [87]:
# def update_price_recordings(scrapped_data_df):
#     # Define the file name
#     file_name = 'PRICE_RECORDINGS.xlsx'

#     # Check if the file exists
#     file_exists = os.path.exists(file_name)

#     if file_exists:
#         # Load existing data to check for duplicate records
#         existing_df = pd.read_excel(file_name)
#         new_records = scrapped_data_df #!!!!!!
#         # Identify new records that are not in the existing data
# #         new_records = scrapped_data_df[~scrapped_data_df['ID'].isin(existing_df['ID'])]
#     else:
#         # If file doesn't exist, all records are new
#         new_records = scrapped_data_df

#     # Append new records to the Excel file
#     with pd.ExcelWriter(file_name, mode='a', engine='openpyxl', if_sheet_exists='overlay') as writer:

#         if file_exists:
#             startrow = writer.sheets['Sheet1'].max_row
#         else:
#             startrow = 0
#         new_records.to_excel(writer, index=False, header=not file_exists, startrow=startrow)

#     print(f"Records have been updated and saved to {file_name}")



# def update_price_recordings(scrapped_data_df):
#     # Define the file name
#     file_name = 'PRICE_RECORDINGS3.xlsx'

#     # Check if the file exists
#     file_exists = os.path.exists(file_name)

#     if file_exists:
#         # Load existing data to check for duplicate records
#         existing_df = pd.read_excel(file_name, engine='openpyxl')
#         # Identify new records that are not in the existing data
# #         new_records = scrapped_data_df[~scrapped_data_df['ID'].isin(existing_df['ID'])]
#     else:
#         # If file doesn't exist, all records are new
#         new_records = scrapped_data_df

#     # Append new records to the Excel file
#     with pd.ExcelWriter(file_name, engine='openpyxl', mode='a', if_sheet_exists='new') as writer:
#         if file_exists:
#             # Write existing data and get the last row
#             existing_df.to_excel(writer, index=False, sheet_name='Sheet1', startrow=0)
#             startrow = len(existing_df) + 1
#         else:
#             startrow = 0
#         new_records.to_excel(writer, index=False, header=not file_exists, startrow=startrow, sheet_name='Sheet1')

#     print(f"Records have been updated and saved to {file_name}")

