In [1]:
# refactored_file.py

import json
# from commmon_functions_gyg import GYG_Scraper
import logging
import traceback
import sys
import os
import pandas as pd

# # Get the current working directory instead of using __file__
# current_dir = os.getcwd()

# # Add the parent directory to the system path
# sys.path.append(os.path.abspath(os.path.join(current_dir, '..')))

# Now you can import the modules
import common_functions
import Azure_stopVM



In [2]:
# Locator table keyed by a short logical name; main() hands this mapping to
# common_functions.ScraperGYG. Every value is a CSS selector string except
# 'js_script_for_shadow_root', which holds a JavaScript expression.
css_selectors = {
    # Currency controls
    "currency": 'div[data-test*="dropdown-currency"]',
    "currency_list": 'section[class*="row-start-center"]',
    # Result count and "load more" buttons
    "products_count": 'span[data-test-id*="search-component-activity-count-text"]',
    "view_more_button": 'button[data-test-id="search-component-test-btn"]',
    "show_more_button": 'a[data-qa-marker*="loading-button"]',
    # Product card and the fields inside it
    "product_card": 'div[data-test*="ActivityCard"]',
    "tour_price": 'span[data-test="realPrice"]',
    "tour_price_discount": 'div[class="tour-scratch-price"]',
    "ratings": 'div[data-test="reviewTest"]',
    "review_count": 'p[class*="reviewsNumber"]',
    "category_label": 'div[data-test="main-category"]',
    # Cookie banner (the first entry is JavaScript, not a selector)
    "js_script_for_shadow_root": ' document.querySelector("msm-cookie-banner").shadowRoot',
    "cookies_banner": 'button[data-test*="decline-cookies"]',
    # Sort dropdown and its options
    "sort_by": 'select[data-test-id="search-component-sort-selector"]',
    "option_rating": 'option[value*="rating"]',
    "option_popularity": 'option[value*="relevance-city"]',
}

In [3]:
def main():
    """
    Run the GYG scraping workflow end to end.

    Steps, in order:
      1. Build the file-path manager and the logger for site "GYG".
      2. Read the link CSV whose path is ``get_file_paths()['link_file']``.
         If that file does not exist, log an error and return.
      3. Call ``scraper.daily_run_gyg`` repeatedly until it returns "Done".
         An exception raised by an attempt is passed to
         ``scraper.handle_error_and_rerun`` and the attempt is repeated.
      4. Call ``upload_excel_to_azure_storage_account`` and then
         ``transform_upload_to_refined`` on the blob uploader. A failure in
         either is logged and does not abort the workflow.
      5. If the current working directory contains 'backup', stop the VM
         unless 'Viator_daily.py' is reported as running.

    Any exception escaping steps 1-4 is reported through the standard
    ``logging`` module together with its traceback. Returns None.
    """
    # Bound before the try so the VM-shutdown step at the end can tell whether
    # logger setup succeeded instead of failing on an unbound name.
    logger = None

    try:
        # Initialize site and file manager
        site = "GYG"
        file_manager = common_functions.FilePathManager(site, "NA")  # 'NA' can be a default city or placeholder
        logger = common_functions.LoggerManager(file_manager)

        logger.logger_info.info(f"Starting scraping process for site: {site}")

        # Load all links and categories from the link file
        link_file_path = file_manager.get_file_paths()['link_file']
        if not os.path.exists(link_file_path):
            logger.logger_err.error(f"Link file '{link_file_path}' does not exist. Exiting.")
            # NOTE(review): this return also skips the VM-shutdown step at the
            # end of the function — confirm that is intended.
            return

        df_links = pd.read_csv(link_file_path)
        logger.logger_info.info(f"Loaded {len(df_links)} links from '{link_file_path}'.")

        # Initialize the scraper with the file manager and logger
        scraper = common_functions.ScraperGYG("N/A", "N/A", css_selectors, file_manager, logger)

        # Execute the daily scraping run with the loaded links.
        # NOTE(review): there is no retry cap; the loop only ends once
        # daily_run_gyg returns "Done".
        while True:
            # Reset on every attempt: if daily_run_gyg raises, `result` must
            # still be bound for the check below, otherwise a failure on the
            # very first attempt aborts the whole workflow.
            result = None
            try:
                result = scraper.daily_run_gyg(df_links=df_links)
            except Exception as e:
                scraper.handle_error_and_rerun(e)
                logger.logger_err.error("An error occurred during the scraping process.")

            if result == "Done":
                break

        blob_uploader = common_functions.AzureBlobUploader(file_manager, logger)

        # After scraping all links, upload the consolidated Excel file to Azure
        try:
            blob_uploader.upload_excel_to_azure_storage_account()
            logger.logger_info.info("Uploaded the consolidated Excel file to Azure Blob Storage (raw container).")
        except Exception as e:
            scraper.handle_error_and_rerun(e)
            logger.logger_err.error("Failed to upload the Excel file to Azure Blob Storage (raw container).")

        # Transform the Excel file and upload the refined version to Azure
        try:
            blob_uploader.transform_upload_to_refined()
            logger.logger_info.info("Transformed and uploaded the refined Excel file to Azure Blob Storage (refined container).")
        except Exception as e:
            scraper.handle_error_and_rerun(e)
            logger.logger_err.error("Failed to transform and upload the refined Excel file to Azure Blob Storage.")

        logger.logger_done.info("All scraping and uploading tasks completed successfully.")

    except Exception as e:
        # Catch any unforeseen errors in the main workflow
        logging.basicConfig(level=logging.ERROR)
        logging.error(f"An unexpected error occurred in the main workflow: {e}")
        logging.error(traceback.format_exc())

    if 'backup' in os.getcwd():
        script_name = 'Viator_daily.py'
        # If setup failed before the project logger existed, fall back to the
        # stdlib root logger so the shutdown decision below still runs.
        done_log = logger.logger_done if logger is not None else logging.getLogger()

        check_if_viator_running = Azure_stopVM.check_if_script_is_running(script_name)
        if check_if_viator_running:
            done_log.info(f"{script_name} is currently running.")
        else:
            done_log.info(f"{script_name} is not running. Stoping VM")
            Azure_stopVM.stop_vm()

if __name__ == "__main__":
    main()
    



2024-10-02 09:51:14,609 - Info_logger - INFO - Starting scraping process for site: GYG
2024-10-02 09:51:14,629 - Info_logger - INFO - Loaded 1913 links from 'G:/.shortcut-targets-by-id/1ER8hilqZ2TuX2C34R3SMAtd1Xbk94LE2/MyOTAs/Baza Excel/Resource/GYG_links.csv'.
2024-10-02 09:51:14,630 - Info_logger - INFO - Initializing the Chrome driver
2024-10-02 09:51:15,829 - Info_logger - INFO - Successfully initiated Scraper for city: N/A
2024-10-02 09:51:15,830 - Info_logger - INFO - @@@@@@@@@@@@@@@@@@ 2024-10-02 -- {'output': 'G:/.shortcut-targets-by-id/1ER8hilqZ2TuX2C34R3SMAtd1Xbk94LE2/MyOTAs/Baza Excel/GYG/Daily', 'archive_folder': 'G:/.shortcut-targets-by-id/1ER8hilqZ2TuX2C34R3SMAtd1Xbk94LE2/MyOTAs/Baza Excel/GYG/Daily/Archive', 'file_path_done': 'G:/.shortcut-targets-by-id/1ER8hilqZ2TuX2C34R3SMAtd1Xbk94LE2/MyOTAs/Baza Excel/GYG/Daily/2024-10-02-DONE-GYG.csv', 'file_path_done_city': 'G:/.shortcut-targets-by-id/1ER8hilqZ2TuX2C34R3SMAtd1Xbk94LE2/MyOTAs/Baza Excel/GYG/Daily/2024-10-02-NA-GYG.cs

In [None]:
# Interactive setup cell: builds the same objects main() creates (file manager,
# logger, links frame, scraper) as notebook-level variables for inspection.

# Initialize site and file manager
site = "GYG"
file_manager = common_functions.FilePathManager(site, "NA")  # 'NA' can be a default city or placeholder
logger = common_functions.LoggerManager(file_manager)

logger.logger_info.info(f"Starting scraping process for site: {site}")

# Load all links and categories from the link file
link_file_path = file_manager.get_file_paths()['link_file']
if not os.path.exists(link_file_path):
    logger.logger_err.error(f"Link file '{link_file_path}' does not exist. Exiting.")
    # Stop the cell here with an explicit error; falling through would only
    # fail one line later inside pd.read_csv on the same missing file.
    raise FileNotFoundError(link_file_path)

df_links = pd.read_csv(link_file_path)
logger.logger_info.info(f"Loaded {len(df_links)} links from '{link_file_path}'.")

# Initialize the scraper the same way main() does. The GYG_Scraper name used
# here previously is never imported in this notebook (its import is commented
# out), so the cell could not run.
scraper = common_functions.ScraperGYG("N/A", "N/A", css_selectors, file_manager, logger)

In [5]:
# scraper.driver.get()

In [None]:
# exclude_sheets = ['Sheet1', 'Data', 'Re-Run', 'DONE']
# excel_data = pd.read_excel(file_manager.get_file_paths()['file_path_output'], sheet_name=None)
# for sheet_name, df in excel_data.items():
#     if sheet_name in exclude_sheets:
#         continue
#     # Read the Excel file into a Pandas DataFrame
#     # Check 'Data zestawienia' for valid date formats
#     df['Data zestawienia'] = df['Data zestawienia'].astype(str)

#     # Filter rows where 'Data zestawienia' does not have a valid date
#     invalid_rows = df[~df['Data zestawienia'].apply(is_valid_date)]

#     # Log sheet name and number of invalid rows if found
#     if not invalid_rows.empty:
#         scraper.logger.logger_err.error(f"Sheet {sheet_name} has {len(invalid_rows)} invalid date entries in 'Data zestawienia' column.")
#         raise ValueError(f"Sheet {sheet_name} has {len(invalid_rows)} invalid date entries in 'Data zestawienia' column.")
    

#     # Convert 'Data zestawienia' to YYYY-MM-DD format if valid
#     df['Data zestawienia'] = pd.to_datetime(df['Data zestawienia']).dt.strftime('%Y-%m-%d')

#     # Transform the DataFrame (add your transformation logic here)
#     df['Data zestawienia'] = df['Data zestawienia'].astype('str')
#     df['IloscOpini'] = df['IloscOpini'].astype(str)
#     df['IloscOpini'] = df['IloscOpini'].fillna(0)
#     df['IloscOpini'] = df['IloscOpini'].str.replace('(', '').str.replace(')','')
#     df['IloscOpini'] = df['IloscOpini'].apply(lambda x: int(float(x.replace('K', '')) * 1000) if isinstance(x, str) and 'K' in x else x)

#     df['Opinia'] = df['Opinia'].astype(str)
#     df['Opinia'] = df['Opinia'].fillna('N/A')
#     df['Opinia'] = df['Opinia'].map(lambda x: x.replace("NEW", '') if isinstance(x, str) else x)

#     df = df[df['Tytul'] != 'Tytul']
#     df = df[df['Data zestawienia'] != 'Data zestawienia']
#     df = df[df['Data zestawienia'].str.len() > 4]

#     df['Cena'] = df['Cena'].str.lower()
#     df['Cena'] = df['Cena'].map(lambda x: x.split('from')[-1] if isinstance(x, str) and 'from' in x else x)
#     df['Cena'] = df['Cena'].apply(lambda x: str(x).replace('€', '').replace('$', '').replace('£', '').strip() if isinstance(x, str) else x)
#     df['Cena'] = df['Cena'].map(lambda x: x.split('per person')[0] if isinstance(x, str) and 'per person' in x.lower() else x)
#     df['Cena'] = df['Cena'].map(lambda x: x.split('per group')[0] if isinstance(x, str) and 'per group' in x.lower() else x)

#     df['Przecena'] = df['Przecena'].apply(lambda x: str(x).replace('€', '').replace('$', '').replace('£', '').strip() if isinstance(x, str) else x)
#     df['Przecena'] = df['Przecena'].map(lambda x: x.split('per person')[0] if isinstance(x, str) and 'per person' in x.lower() else x)
#     df['Przecena'] = df['Przecena'].map(lambda x: x.split('per group')[0] if isinstance(x, str) and 'per group' in x.lower() else x)


#     # Apply str.replace only if the value is a string
#     df