## Extract erroneous scraped symbols
We use this code to extract symbols that encountered errors during the scraping process. Subsequently, we will re-scrape these symbols

In [10]:
import re
import os
import pandas as pd

def extract_symbols(log_folder_path):
    warning_symbols = set()
    info_symbols = set()

    # Regex patterns to identify WARNING and INFO logs
    warning_pattern = re.compile(r"WARNING - Not enough reports (\w+):")
    info_pattern = re.compile(r"INFO - Scraping Successfully (\w+):")

    # Loop through each log file in the folder
    for filename in os.listdir(log_folder_path):
        if filename.endswith(".log"):
            with open(os.path.join(log_folder_path, filename), 'r') as file:
                for line in file:
                    # Match WARNING logs
                    warning_match = warning_pattern.search(line)
                    if warning_match:
                        symbol = warning_match.group(1)
                        warning_symbols.add(symbol)

                    # Match INFO logs
                    info_match = info_pattern.search(line)
                    if info_match:
                        symbol = info_match.group(1)
                        info_symbols.add(symbol)

    # Exclude symbols that appear in INFO logs from WARNING symbols
    final_warning_symbols = warning_symbols - info_symbols

    # Create a DataFrame to display the results
    warning_df = pd.DataFrame(final_warning_symbols, columns=["symbol"])
    print("Symbols with warnings (excluding those in INFO logs):")
    print(warning_df)

    return warning_df



In [11]:
# Example usage
log_folder_path = "./logging"  # Replace with your log folder path
warnings_df = extract_symbols(log_folder_path)

  symbol
0    SAB
1    NVL
2    CTG


In [None]:
# warnings_df.to_csv("error_symbols.csv")

In [17]:
import json
for symbol in list(warnings_df["symbol"]):
    checkpoint_path = f"./checkpoints/checkpoint_{symbol}.json"
    checkpoint = {"current_page": 1, "last_row_index": 0}

    # Save the updated checkpoints back to the file
    with open(checkpoint_path, 'w') as f:
        json.dump(checkpoint, f)

In [21]:
len("CBTT BCTC riêng lẻ Quý IV/2023; giải trình biến động lợi nhuận Quý IV/2023 và điều chỉnh hồi tố số liệu BCTC kiểm toán năm 2022 theo Kiểm toán Nhà nước")

151