In [172]:
# Import các thư viện cần thiết
from datetime import datetime
import os
import re
import json
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from webdriver_manager.chrome import ChromeDriverManager

# ***CHANGED***: constants updated from the config.
# Keys are the exact `class` attribute strings of the <td> cells that the
# scraping loop in this cell keeps (it tests `class_name in ALLOWED_ELEMENT_TYPES`).
# The values are short field labels; the code in this cell never reads them.
ALLOWED_ELEMENT_TYPES = {
    "calendar__cell": "date",
    "calendar__cell calendar__date": "date",
    "calendar__cell calendar__time": "time",
    "calendar__cell calendar__currency": "currency",
    "calendar__cell calendar__impact": "impact",
    "calendar__cell calendar__event event": "event"
}

# Cell classes meant to be ignored. NOTE(review): nothing in this cell
# references this list; cells are already dropped by not being in
# ALLOWED_ELEMENT_TYPES.
EXCLUDED_ELEMENT_TYPES = [
    "calendar__cell calendar__graph"
]

# Maps the `class` attribute of the <span> inside an impact cell to a color name.
ICON_COLOR_MAP = {
    "icon icon--ff-impact-yel": "yellow",
    "icon icon--ff-impact-ora": "orange",
    "icon icon--ff-impact-red": "red",
    "icon icon--ff-impact-gra": "gray"
}

# ***CHANGED***: full list of currency codes; reformat_scraped_data drops
# rows whose currency is not listed here.
ALLOWED_CURRENCY_CODES = ['AUD', 'CAD', 'CHF', 'CNY', 'EUR', 'GBP', 'JPY', 'NZD', 'USD']

# ***CHANGED***: impact colors to keep; "yellow" is absent, so yellow-impact
# rows are filtered out by reformat_scraped_data.
ALLOWED_IMPACT_COLORS = ['red', 'orange', 'gray']

# Định nghĩa các hàm tiện ích từ utils.py
def read_json(path):
    """
    Read JSON data from a file.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding.

    Args: path (str): The path to the JSON file.
    Returns: The decoded JSON document (dict, list, or scalar).
    Raises: OSError if the file cannot be opened; json.JSONDecodeError if the
        content is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def contains_day_or_month(text):
    """
    Check if the given text contains a day of the week or a month.

    Matching is case-insensitive and only recognises three-letter
    abbreviations as whole words. The leftmost match wins, so for
    "Thu\\nAug 1" the weekday "Thu" is returned.

    Args:
        text (str): The input text to check.

    Returns:
        tuple: ``(True, matched_text)`` when a weekday or month abbreviation
        is found, otherwise ``(False, None)``. A 2-tuple is always returned,
        so callers can safely unpack the result.
    """
    # Empty or missing text cannot contain a match.
    if not text:
        return False, None

    # Regular expressions for days of the week and months
    days_of_week = r'\b(Mon|Tue|Wed|Thu|Fri|Sat|Sun)\b'
    months = r'\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\b'

    match = re.search(f'({days_of_week}|{months})', text, re.IGNORECASE)
    if match is None:
        return False, None

    # Previously only weekday hits were returned and month-only text fell
    # through to an implicit None, which broke tuple unpacking in callers.
    return True, match.group(0)

def find_pattern_category(text):
    """
    Classify the first time-like token found in ``text``.

    Args:
        text (str): The input text to analyze.

    Returns:
        tuple: ``(found, category, matched_text)``. ``category`` is one of
        "time", "day_reference", "date_range" or "tentative" ("Unknown" is a
        fallback); when nothing matches the result is ``(False, None, None)``.
    """
    # Ordered (category, regex) pairs; order decides both the alternation
    # in the combined pattern and which category is tried first.
    categories = (
        ("time", r'\d{1,2}:\d{2}(am|pm)'),
        ("day_reference", r'Day\s+\d+'),
        ("date_range", r'\d{1,2}(st|nd|rd|th)\s*-\s*\d{1,2}(st|nd|rd|th)'),
        ("tentative", r'\bTentative\b'),
    )
    combined = '(' + '|'.join(rx for _, rx in categories) + ')'

    found = re.search(combined, text, re.IGNORECASE)
    if found is None:
        return False, None, None

    hit = found.group(0)
    label = next(
        (name for name, rx in categories if re.match(rx, hit, re.IGNORECASE)),
        "Unknown",
    )
    return True, label, hit

def reformat_scraped_data(data, month, output_dir=None):
    """
    Reformat scraped data and save it as a DataFrame and a CSV file.

    Each scraped row is a list whose last three items are currency, impact
    and event. Rows with 4 items start with a time, rows with 5 items start
    with a date cell followed by a time, and rows with a single item hold
    only a date cell. The most recent date and time are carried forward to
    rows that omit them.

    Args:
        data (list): The scraped data as a list of lists.
        month (str): The month for naming the output CSV file.
        output_dir (str, optional): Directory for the CSV file. Defaults to
            the project's "bronze/monthly" folder used by the original code.

    Returns:
        pd.DataFrame: The reformatted data as a DataFrame with columns
        date, time, currency, impact, event.
    """
    if output_dir is None:
        # ***CHANGED***: default save location is the "bronze" folder.
        output_dir = "/Users/datpro/Documents/gitdatpro/ff-transform-data/data/bronze/monthly"

    current_date = ''
    current_time = ''
    structured_rows = []

    for row in data:
        if len(row) in (1, 5):
            # The helper may return None instead of a tuple for some inputs,
            # so do not unpack blindly.
            result = contains_day_or_month(row[0])
            if result and result[0]:
                current_date = row[0].replace(result[1], "").replace("\n", "")

        if len(row) == 4:
            current_time = row[0]
        elif len(row) == 5:
            current_time = row[1]

        # Fewer than three fields cannot hold currency/impact/event
        # (date-only rows, or malformed two-field rows).
        if len(row) < 3:
            continue

        currency, impact, event = row[-3:]

        # ***CHANGED***: skip currencies not in ALLOWED_CURRENCY_CODES and
        # impacts not in ALLOWED_IMPACT_COLORS.
        if currency not in ALLOWED_CURRENCY_CODES or impact not in ALLOWED_IMPACT_COLORS:
            continue

        structured_rows.append([current_date, current_time, currency, impact, event])

    df = pd.DataFrame(structured_rows, columns=['date', 'time', 'currency', 'impact', 'event'])

    os.makedirs(output_dir, exist_ok=True)

    # The file name uses the current calendar year: "<year>-<month>_news.csv".
    year = datetime.now().strftime("%Y")
    output_path = os.path.join(output_dir, f"{year}-{month}_news.csv")
    df.to_csv(output_path, index=False)

    return df

# Start a Chrome session and scrape the calendar table from the website.
from selenium.common.exceptions import WebDriverException

try:
    driver = webdriver.Chrome()
except WebDriverException:
    # Fall back to a driver binary fetched by webdriver_manager. Selenium 4
    # takes the binary path through a Service object, not positionally.
    from selenium.webdriver.chrome.service import Service

    print("AF: No Chrome webdriver installed")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

month = datetime.now().strftime("%B")

data = []
previous_row_count = 0

try:
    driver.get("https://www.forexfactory.com/calendar?month=this")

    table = driver.find_element(By.CLASS_NAME, "calendar__table")

    # Scroll down in 500px steps until the scroll offset stops changing,
    # i.e. the bottom of the page has been reached.
    while True:
        before_scroll = driver.execute_script("return window.pageYOffset;")
        driver.execute_script("window.scrollTo(0, window.pageYOffset + 500);")

        # Wait for a short moment to allow content to load
        time.sleep(2)

        after_scroll = driver.execute_script("return window.pageYOffset;")
        if before_scroll == after_scroll:
            break

    # Now that we've scrolled to the end, collect the data
    for row in table.find_elements(By.TAG_NAME, "tr"):
        row_data = []
        for element in row.find_elements(By.TAG_NAME, "td"):
            class_name = element.get_attribute('class')
            if class_name not in ALLOWED_ELEMENT_TYPES:
                continue

            text = element.text  # read once; each access is a browser round-trip
            if text:
                row_data.append(text)
            elif "calendar__impact" in class_name:
                # Impact cells carry no text; the color is encoded in the
                # class of the inner <span>. Reset per cell so a value from
                # a previous row can never leak in, and ignore unknown classes.
                color = None
                for impact in element.find_elements(By.TAG_NAME, "span"):
                    impact_class = impact.get_attribute("class")
                    color = ICON_COLOR_MAP.get(impact_class, color)
                row_data.append(color or "impact")

        if row_data:
            data.append(row_data)
finally:
    # Always close the browser, even when scraping fails half-way.
    driver.quit()

# Reshape the scraped rows and save them via reformat_scraped_data.
df = reformat_scraped_data(data, month)

# Show the resulting DataFrame.
df.head()


Unnamed: 0,date,time,currency,impact,event
0,Aug 1,1:00am,USD,red,Federal Funds Rate
1,Aug 1,1:00am,USD,red,FOMC Statement
2,Aug 1,1:30am,USD,red,FOMC Press Conference
3,Aug 1,All Day,CHF,gray,Bank Holiday
4,Aug 1,6:00pm,GBP,red,BOE Monetary Policy Report


In [173]:
data

[['Thu\nAug 1', '1:00am', 'USD', 'red', 'Federal Funds Rate'],
 ['USD', 'red', 'FOMC Statement'],
 ['1:30am', 'USD', 'red', 'FOMC Press Conference'],
 ['7:30am', 'JPY', 'yellow', 'Final Manufacturing PMI'],
 ['8:30am', 'AUD', 'yellow', 'Goods Trade Balance'],
 ['AUD', 'yellow', 'Import Prices q/q'],
 ['8:45am', 'CNY', 'yellow', 'Caixin Manufacturing PMI'],
 ['All Day', 'CHF', 'gray', 'Bank Holiday'],
 ['1:00pm', 'GBP', 'yellow', 'Nationwide HPI m/m'],
 ['1:30pm', 'AUD', 'yellow', 'Commodity Prices y/y'],
 ['2:15pm', 'EUR', 'yellow', 'Spanish Manufacturing PMI'],
 ['2:45pm', 'EUR', 'yellow', 'Italian Manufacturing PMI'],
 ['2:50pm', 'EUR', 'yellow', 'French Final Manufacturing PMI'],
 ['2:55pm', 'EUR', 'yellow', 'German Final Manufacturing PMI'],
 ['3:00pm', 'EUR', 'yellow', 'ECB Economic Bulletin'],
 ['EUR', 'yellow', 'Final Manufacturing PMI'],
 ['EUR', 'yellow', 'Italian Monthly Unemployment Rate'],
 ['3:30pm', 'GBP', 'yellow', 'Final Manufacturing PMI'],
 ['3:50pm', 'EUR', 'yellow',

In [98]:
# Running the function on the mocked data
df_result = reformat_scraped_data(data, "August")

df_result.head()

Unnamed: 0,date,time,currency,impact,event
0,Aug 1,1:00am,USD,red,Federal Funds Rate
1,Aug 1,1:00am,USD,red,FOMC Statement
2,Aug 1,1:30am,USD,red,FOMC Press Conference
3,Aug 1,All Day,CHF,gray,Bank Holiday
4,Aug 1,6:00pm,GBP,red,BOE Monetary Policy Report


In [94]:
print(class_name)  # In ra class_name để xác minh nó có khớp với ALLOWED_ELEMENT_TYPES hay không


calendar__cell


In [95]:
print(row_data)  # Kiểm tra từng hàng dữ liệu để đảm bảo dữ liệu được thu thập đúng cách


['Sat\nAug 31']


In [96]:
print(structured_rows)  # Kiểm tra danh sách dữ liệu trước khi tạo DataFrame


NameError: name 'structured_rows' is not defined

In [97]:
print(currency, impact)  # Kiểm tra giá trị tiền tệ và tác động trước khi lọc


NameError: name 'currency' is not defined

# Test 2: OK — code chạy được với data và lấy đủ data

In [184]:
# Import các thư viện cần thiết
from datetime import datetime
import os
import re
import json
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from webdriver_manager.chrome import ChromeDriverManager

# ***CHANGED***: constants updated from the config.
# Keys are the exact `class` attribute strings of the <td> cells that the
# scraping loop in this cell keeps (it tests `class_name in ALLOWED_ELEMENT_TYPES`).
# The values are short field labels; the code in this cell never reads them.
ALLOWED_ELEMENT_TYPES = {
    "calendar__cell": "date",
    "calendar__cell calendar__date": "date",
    "calendar__cell calendar__time": "time",
    "calendar__cell calendar__currency": "currency",
    "calendar__cell calendar__impact": "impact",
    "calendar__cell calendar__event event": "event",
    # Three newly added entries
    "calendar__cell calendar__actual": "actual",
    "calendar__cell calendar__forecast": "forecast",
    "calendar__cell calendar__previous": "previous"
}

# Cell classes meant to be ignored. NOTE(review): nothing in this cell
# references this list; cells are already dropped by not being in
# ALLOWED_ELEMENT_TYPES.
EXCLUDED_ELEMENT_TYPES = [
    "calendar__cell calendar__graph"
]

# Maps the `class` attribute of the <span> inside an impact cell to a color name.
ICON_COLOR_MAP = {
    "icon icon--ff-impact-yel": "yellow",
    "icon icon--ff-impact-ora": "orange",
    "icon icon--ff-impact-red": "red",
    "icon icon--ff-impact-gra": "gray"
}

# ***CHANGED***: full list of currency codes; reformat_scraped_data drops
# rows whose currency is not listed here.
ALLOWED_CURRENCY_CODES = ['AUD', 'CAD', 'CHF', 'CNY', 'EUR', 'GBP', 'JPY', 'NZD', 'USD']

# ***CHANGED***: impact colors to keep; "yellow" is absent, so yellow-impact
# rows are filtered out by reformat_scraped_data.
ALLOWED_IMPACT_COLORS = ['red', 'orange', 'gray']

# Định nghĩa các hàm tiện ích từ utils.py
def read_json(path):
    """
    Read JSON data from a file.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding.

    Args: path (str): The path to the JSON file.
    Returns: The decoded JSON document (dict, list, or scalar).
    Raises: OSError if the file cannot be opened; json.JSONDecodeError if the
        content is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def contains_day_or_month(text):
    """
    Check if the given text contains a day of the week or a month.

    Matching is case-insensitive and only recognises three-letter
    abbreviations as whole words. The leftmost match wins, so for
    "Thu\\nAug 1" the weekday "Thu" is returned.

    Args:
        text (str): The input text to check.

    Returns:
        tuple: ``(True, matched_text)`` when a weekday or month abbreviation
        is found, otherwise ``(False, None)``. A 2-tuple is always returned,
        so callers can safely unpack the result.
    """
    # Empty or missing text cannot contain a match.
    if not text:
        return False, None

    # Regular expressions for days of the week and months
    days_of_week = r'\b(Mon|Tue|Wed|Thu|Fri|Sat|Sun)\b'
    months = r'\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\b'

    match = re.search(f'({days_of_week}|{months})', text, re.IGNORECASE)
    if match is None:
        return False, None

    # Previously only weekday hits were returned and month-only text fell
    # through to an implicit None, which broke tuple unpacking in callers.
    return True, match.group(0)

def find_pattern_category(text):
    """
    Classify the first time-like token found in ``text``.

    Args:
        text (str): The input text to analyze.

    Returns:
        tuple: ``(found, category, matched_text)``. ``category`` is one of
        "time", "day_reference", "date_range" or "tentative" ("Unknown" is a
        fallback); when nothing matches the result is ``(False, None, None)``.
    """
    # Ordered (category, regex) pairs; order decides both the alternation
    # in the combined pattern and which category is tried first.
    categories = (
        ("time", r'\d{1,2}:\d{2}(am|pm)'),
        ("day_reference", r'Day\s+\d+'),
        ("date_range", r'\d{1,2}(st|nd|rd|th)\s*-\s*\d{1,2}(st|nd|rd|th)'),
        ("tentative", r'\bTentative\b'),
    )
    combined = '(' + '|'.join(rx for _, rx in categories) + ')'

    found = re.search(combined, text, re.IGNORECASE)
    if found is None:
        return False, None, None

    hit = found.group(0)
    label = next(
        (name for name, rx in categories if re.match(rx, hit, re.IGNORECASE)),
        "Unknown",
    )
    return True, label, hit

def reformat_scraped_data(data, month):
    """
    Turn raw scraped rows into a DataFrame with one event per row.

    Every usable row ends with six fields: currency, impact, event, actual,
    forecast, previous. A 7-field row is prefixed with a time; an 8-field
    row is prefixed with a date cell and a time. The most recent date and
    time are carried forward to rows that omit them. Rows of any other
    length (for example a date-only row) carry no event and are skipped.

    Args:
        data (list): The scraped data as a list of lists of strings.
        month (str): Unused here; kept so existing calls keep working.

    Returns:
        pd.DataFrame: Columns date, time, currency, impact, event, actual,
        forecast, previous. Empty actual/forecast/previous become "N/A".
    """
    current_date = ''
    current_time = ''
    structured_rows = []

    for row in data:
        width = len(row)
        # Without this guard, a row of unexpected length reused the previous
        # row's values (duplicating that event) or hit unbound variables.
        if width not in (6, 7, 8):
            continue

        if width == 8:
            # The helper may return None instead of a tuple for some inputs.
            result = contains_day_or_month(row[0])
            if result and result[0]:
                current_date = row[0].replace(result[1], "").replace("\n", "")
        if width >= 7:
            # The time sits immediately before the six trailing fields.
            current_time = row[-7]

        currency, impact, event = row[-6:-3]
        if currency not in ALLOWED_CURRENCY_CODES or impact not in ALLOWED_IMPACT_COLORS:
            continue

        actual, forecast, previous = (value or "N/A" for value in row[-3:])
        structured_rows.append(
            [current_date, current_time, currency, impact, event, actual, forecast, previous]
        )

    return pd.DataFrame(
        structured_rows,
        columns=['date', 'time', 'currency', 'impact', 'event', 'actual', 'forecast', 'previous'],
    )

# Start a Chrome session and scrape the calendar table from the website.
from selenium.common.exceptions import WebDriverException

try:
    driver = webdriver.Chrome()
except WebDriverException:
    # Fall back to a driver binary fetched by webdriver_manager. Selenium 4
    # takes the binary path through a Service object, not positionally.
    from selenium.webdriver.chrome.service import Service

    print("AF: No Chrome webdriver installed")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

month = datetime.now().strftime("%B")

data = []
previous_row_count = 0

try:
    driver.get("https://www.forexfactory.com/calendar?month=this")

    table = driver.find_element(By.CLASS_NAME, "calendar__table")

    # Scroll down in 500px steps until the scroll offset stops changing,
    # i.e. the bottom of the page has been reached.
    while True:
        before_scroll = driver.execute_script("return window.pageYOffset;")
        driver.execute_script("window.scrollTo(0, window.pageYOffset + 500);")

        # Wait for a short moment to allow content to load
        time.sleep(2)

        after_scroll = driver.execute_script("return window.pageYOffset;")
        if before_scroll == after_scroll:
            break

    # Now that we've scrolled to the end, collect the data
    for row in table.find_elements(By.TAG_NAME, "tr"):
        row_data = []
        for element in row.find_elements(By.TAG_NAME, "td"):
            class_name = element.get_attribute('class')
            if class_name not in ALLOWED_ELEMENT_TYPES:
                continue

            text = element.text  # read once; each access is a browser round-trip
            if text:
                row_data.append(text)
            elif "calendar__impact" in class_name:
                # Impact cells carry no text; the color is encoded in the
                # class of the inner <span>. Reset per cell so a value from
                # a previous row can never leak in, and ignore unknown classes.
                color = None
                for impact in element.find_elements(By.TAG_NAME, "span"):
                    impact_class = impact.get_attribute("class")
                    color = ICON_COLOR_MAP.get(impact_class, color)
                row_data.append(color or "impact")
            elif ALLOWED_ELEMENT_TYPES[class_name] in ("actual", "forecast", "previous"):
                # Empty value cells still get a placeholder so every event
                # row keeps its six trailing fields.
                row_data.append("N/A")

        if row_data:
            data.append(row_data)
finally:
    # Always close the browser, even when scraping fails half-way.
    driver.quit()

# Reshape the scraped rows into a DataFrame.
df = reformat_scraped_data(data, month)

# Show the resulting DataFrame.
df


Unnamed: 0,date,time,currency,impact,event,actual,forecast,previous
0,Aug 1,1:00am,USD,red,Federal Funds Rate,5.50%,5.50%,5.50%
1,Aug 1,1:00am,USD,red,FOMC Statement,,,
2,Aug 1,1:30am,USD,red,FOMC Press Conference,,,
3,Aug 1,All Day,CHF,gray,Bank Holiday,,,
4,Aug 1,6:00pm,GBP,red,BOE Monetary Policy Report,,,
...,...,...,...,...,...,...,...,...
123,Aug 30,4:00pm,EUR,red,CPI Flash Estimate y/y,,,
124,Aug 30,7:30pm,CAD,red,GDP m/m,,,
125,Aug 30,7:30pm,USD,red,Core PCE Price Index m/m,,,
126,Aug 30,8:45pm,USD,orange,Chicago PMI,,,


In [183]:
data

[['Thu\nAug 1',
  '1:00am',
  'USD',
  'red',
  'Federal Funds Rate',
  '5.50%',
  '5.50%',
  '5.50%'],
 ['USD', 'red', 'FOMC Statement', 'N/A', 'N/A', 'N/A'],
 ['1:30am', 'USD', 'red', 'FOMC Press Conference', 'N/A', 'N/A', 'N/A'],
 ['7:30am',
  'JPY',
  'yellow',
  'Final Manufacturing PMI',
  '49.1',
  '49.2',
  '49.2'],
 ['8:30am', 'AUD', 'yellow', 'Goods Trade Balance', '5.59B', '5.08B', '5.05B'],
 ['AUD', 'yellow', 'Import Prices q/q', '1.0%', '-0.9%', '-1.8%'],
 ['8:45am',
  'CNY',
  'yellow',
  'Caixin Manufacturing PMI',
  '49.8',
  '51.4',
  '51.8'],
 ['All Day', 'CHF', 'gray', 'Bank Holiday', 'N/A', 'N/A', 'N/A'],
 ['1:00pm', 'GBP', 'yellow', 'Nationwide HPI m/m', '0.3%', '0.1%', '0.2%'],
 ['1:30pm', 'AUD', 'yellow', 'Commodity Prices y/y', '-3.0%', 'N/A', '-3.5%'],
 ['2:15pm',
  'EUR',
  'yellow',
  'Spanish Manufacturing PMI',
  '51.0',
  '52.5',
  '52.3'],
 ['2:45pm',
  'EUR',
  'yellow',
  'Italian Manufacturing PMI',
  '47.4',
  '46.0',
  '45.7'],
 ['2:50pm',
  'EUR',
 

In [87]:
class_name

'calendar__cell'

In [90]:
print(currency, impact)

NameError: name 'currency' is not defined

In [88]:
row_data

['Sat\nAug 31']

In [89]:
structured_rows

NameError: name 'structured_rows' is not defined

In [91]:
if actual is None: print("Actual is None")
if forecast is None: print("Forecast is None")
if previous is None: print("Previous is None")


Actual is None
Forecast is None
Previous is None


Test 3: thử nghiệm với tháng khác

In [187]:
# Import các thư viện cần thiết
from datetime import datetime
import os
import re
import json
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from webdriver_manager.chrome import ChromeDriverManager

# ***CHANGED***: constants updated from the config.
# Keys are the exact `class` attribute strings of the <td> cells that the
# scraping loop in this cell keeps (it tests `class_name in ALLOWED_ELEMENT_TYPES`).
# The values are short field labels; the code in this cell never reads them.
ALLOWED_ELEMENT_TYPES = {
    "calendar__cell": "date",
    "calendar__cell calendar__date": "date",
    "calendar__cell calendar__time": "time",
    "calendar__cell calendar__currency": "currency",
    "calendar__cell calendar__impact": "impact",
    "calendar__cell calendar__event event": "event",
    # Three newly added entries
    "calendar__cell calendar__actual": "actual",
    "calendar__cell calendar__forecast": "forecast",
    "calendar__cell calendar__previous": "previous"
}

# Cell classes meant to be ignored. NOTE(review): nothing in this cell
# references this list; cells are already dropped by not being in
# ALLOWED_ELEMENT_TYPES.
EXCLUDED_ELEMENT_TYPES = [
    "calendar__cell calendar__graph"
]

# Maps the `class` attribute of the <span> inside an impact cell to a color name.
ICON_COLOR_MAP = {
    "icon icon--ff-impact-yel": "yellow",
    "icon icon--ff-impact-ora": "orange",
    "icon icon--ff-impact-red": "red",
    "icon icon--ff-impact-gra": "gray"
}

# ***CHANGED***: full list of currency codes; reformat_scraped_data drops
# rows whose currency is not listed here.
ALLOWED_CURRENCY_CODES = ['AUD', 'CAD', 'CHF', 'CNY', 'EUR', 'GBP', 'JPY', 'NZD', 'USD']

# ***CHANGED***: impact colors to keep; this variant also keeps "yellow".
ALLOWED_IMPACT_COLORS = ['red', 'orange', 'yellow', 'gray']

# Định nghĩa các hàm tiện ích từ utils.py
def read_json(path):
    """
    Read JSON data from a file.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding.

    Args: path (str): The path to the JSON file.
    Returns: The decoded JSON document (dict, list, or scalar).
    Raises: OSError if the file cannot be opened; json.JSONDecodeError if the
        content is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def contains_day_or_month(text):
    """
    Check if the given text contains a day of the week or a month.

    Matching is case-insensitive and only recognises three-letter
    abbreviations as whole words. The leftmost match wins, so for
    "Thu\\nAug 1" the weekday "Thu" is returned.

    Args:
        text (str): The input text to check.

    Returns:
        tuple: ``(True, matched_text)`` when a weekday or month abbreviation
        is found, otherwise ``(False, None)``. A 2-tuple is always returned,
        so callers can safely unpack the result.
    """
    # Empty or missing text cannot contain a match.
    if not text:
        return False, None

    # Regular expressions for days of the week and months
    days_of_week = r'\b(Mon|Tue|Wed|Thu|Fri|Sat|Sun)\b'
    months = r'\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\b'

    match = re.search(f'({days_of_week}|{months})', text, re.IGNORECASE)
    if match is None:
        return False, None

    # Previously only weekday hits were returned and month-only text fell
    # through to an implicit None, which broke tuple unpacking in callers.
    return True, match.group(0)

def find_pattern_category(text):
    """
    Classify the first time-like token found in ``text``.

    Args:
        text (str): The input text to analyze.

    Returns:
        tuple: ``(found, category, matched_text)``. ``category`` is one of
        "time", "day_reference", "date_range" or "tentative" ("Unknown" is a
        fallback); when nothing matches the result is ``(False, None, None)``.
    """
    # Ordered (category, regex) pairs; order decides both the alternation
    # in the combined pattern and which category is tried first.
    categories = (
        ("time", r'\d{1,2}:\d{2}(am|pm)'),
        ("day_reference", r'Day\s+\d+'),
        ("date_range", r'\d{1,2}(st|nd|rd|th)\s*-\s*\d{1,2}(st|nd|rd|th)'),
        ("tentative", r'\bTentative\b'),
    )
    combined = '(' + '|'.join(rx for _, rx in categories) + ')'

    found = re.search(combined, text, re.IGNORECASE)
    if found is None:
        return False, None, None

    hit = found.group(0)
    label = next(
        (name for name, rx in categories if re.match(rx, hit, re.IGNORECASE)),
        "Unknown",
    )
    return True, label, hit

def reformat_scraped_data(data, month):
    """
    Turn raw scraped rows into a DataFrame with one event per row.

    Rows of 6, 7 or 8 fields all end with currency, impact, event, actual,
    forecast, previous. A 7-field row starts with a time; an 8-field row
    starts with a date cell and a time. The latest date and time are carried
    forward. Rows of any other length are skipped.

    Args:
        data (list): The scraped data as a list of lists of strings.
        month (str): Not used by this function.

    Returns:
        pd.DataFrame: Columns date, time, currency, impact, event, actual,
        forecast, previous. Empty actual/forecast/previous become "N/A".
    """
    columns = ['date', 'time', 'currency', 'impact', 'event', 'actual', 'forecast', 'previous']
    last_date = ''
    last_time = ''
    records = []

    for fields in data:
        width = len(fields)
        if width == 8:
            found, weekday = contains_day_or_month(fields[0])
            if found:
                last_date = fields[0].replace(weekday, "").replace("\n", "")
            last_time = fields[1]
        elif width == 7:
            last_time = fields[0]
        elif width != 6:
            # No currency/impact available, so the row can never pass the filters.
            continue

        currency, impact, event = fields[-6:-3]
        if currency not in ALLOWED_CURRENCY_CODES or impact not in ALLOWED_IMPACT_COLORS:
            continue

        actual, forecast, previous = [value or "N/A" for value in fields[-3:]]
        records.append([last_date, last_time, currency, impact, event, actual, forecast, previous])

    return pd.DataFrame(records, columns=columns)

# Start a Chrome session and scrape the calendar table from the website.
from selenium.common.exceptions import WebDriverException

try:
    driver = webdriver.Chrome()
except WebDriverException:
    # Fall back to a driver binary fetched by webdriver_manager. Selenium 4
    # takes the binary path through a Service object, not positionally.
    from selenium.webdriver.chrome.service import Service

    print("AF: No Chrome webdriver installed")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# NOTE(review): this is the current month's name, not the month requested in
# the URL below; the reformat_scraped_data defined in this cell ignores it.
month = datetime.now().strftime("%B")

data = []
previous_row_count = 0

try:
    driver.get("https://www.forexfactory.com/calendar?month=jan.2023")

    table = driver.find_element(By.CLASS_NAME, "calendar__table")

    # Scroll down in 500px steps until the scroll offset stops changing,
    # i.e. the bottom of the page has been reached.
    while True:
        before_scroll = driver.execute_script("return window.pageYOffset;")
        driver.execute_script("window.scrollTo(0, window.pageYOffset + 500);")

        # Wait for a short moment to allow content to load
        time.sleep(2)

        after_scroll = driver.execute_script("return window.pageYOffset;")
        if before_scroll == after_scroll:
            break

    # Now that we've scrolled to the end, collect the data
    for row in table.find_elements(By.TAG_NAME, "tr"):
        row_data = []
        for element in row.find_elements(By.TAG_NAME, "td"):
            class_name = element.get_attribute('class')
            if class_name not in ALLOWED_ELEMENT_TYPES:
                continue

            text = element.text  # read once; each access is a browser round-trip
            if text:
                row_data.append(text)
            elif "calendar__impact" in class_name:
                # Impact cells carry no text; the color is encoded in the
                # class of the inner <span>. Reset per cell so a value from
                # a previous row can never leak in, and ignore unknown classes.
                color = None
                for impact in element.find_elements(By.TAG_NAME, "span"):
                    impact_class = impact.get_attribute("class")
                    color = ICON_COLOR_MAP.get(impact_class, color)
                row_data.append(color or "impact")
            elif ALLOWED_ELEMENT_TYPES[class_name] in ("actual", "forecast", "previous"):
                # Empty value cells still get a placeholder so every event
                # row keeps its six trailing fields.
                row_data.append("N/A")

        if row_data:
            data.append(row_data)
finally:
    # Always close the browser, even when scraping fails half-way.
    driver.quit()

# Reshape the scraped rows into a DataFrame.
df = reformat_scraped_data(data, month)

# Show the resulting DataFrame.
df


Unnamed: 0,date,time,currency,impact,event,actual,forecast,previous
0,Jan 2,All Day,NZD,gray,Bank Holiday,,,
1,Jan 2,All Day,AUD,gray,Bank Holiday,,,
2,Jan 2,All Day,JPY,gray,Bank Holiday,,,
3,Jan 2,All Day,CNY,gray,Bank Holiday,,,
4,Jan 2,All Day,CHF,gray,Bank Holiday,,,
...,...,...,...,...,...,...,...,...
364,Jan 31,8:30pm,USD,yellow,Employment Cost Index q/q,1.0%,1.1%,1.2%
365,Jan 31,9:00pm,USD,yellow,HPI m/m,-0.1%,-0.4%,0.0%
366,Jan 31,9:00pm,USD,yellow,S&P/CS Composite-20 HPI y/y,6.8%,6.8%,8.6%
367,Jan 31,9:45pm,USD,orange,Chicago PMI,44.3,45.1,44.9


In [151]:
# Mocked data similar to the one described by the user for testing.
# Row shapes: 8 items = date cell, time, currency, impact, event, actual,
# forecast, previous; 7 items = the same without the date cell; 6 items =
# without date and time (the event shares the previous row's time).
data_1 = [
    ['Thu\nAug 1', '1:00am', 'USD', 'red', 'Federal Funds Rate', '5.50%', '5.50%', '5.50%'],
    ['USD', 'red', 'FOMC Statement', 'N/A', 'N/A', 'N/A'],
    ['1:30am', 'USD', 'red', 'FOMC Press Conference', 'N/A', 'N/A', 'N/A'],
    ['7:30am', 'JPY', 'yellow', 'Final Manufacturing PMI', '49.1', '49.2', '49.2'],
    ['8:30am', 'AUD', 'yellow', 'Goods Trade Balance', '5.59B', '5.08B', '5.05B'],
    ['AUD', 'yellow', 'Import Prices q/q', '1.0%', '-0.9%', '-1.8%'],
    ['8:45am', 'CNY', 'yellow', 'Caixin Manufacturing PMI', '49.8', '51.4', '51.8'],
    ['All Day', 'CHF', 'gray', 'Bank Holiday', 'N/A', 'N/A', 'N/A'],
    ['1:00pm', 'GBP', 'yellow', 'Nationwide HPI m/m', '0.3%', '0.1%', '0.2%'],
    ['1:30pm', 'AUD', 'yellow', 'Commodity Prices y/y', '-3.0%', 'N/A', '-3.5%']
]


# def reformat_scraped_data(data, month):
#     """
#     Reformat scraped data and save it as a DataFrame and a CSV file.
# 
#     Args:
#         data (list): The scraped data as a list of lists.
#         month (str): The month for naming the output CSV file.
# 
#     Returns:
#         pd.DataFrame: The reformatted data as a DataFrame.
#     """
#     current_date = ''
#     current_time = ''
#     structured_rows = []
# 
#     for row in data:
#         if len(row) == 1 or len(row) == 5:
#             match, day = contains_day_or_month(row[0])
#             if match:
#                 current_date = row[0].replace(day, "").replace("\n", "")
#         if len(row) == 4:
#             current_time = row[0]
# 
#         if len(row) == 5:
#             current_time = row[1]
# 
#         if len(row) > 1:
#             event = row[-1]
#             impact = row[-2]
#             currency = row[-3]
# 
#             # ***THAY ĐỔI***: Bỏ qua các mã tiền tệ không có trong ALLOWED_CURRENCY_CODES
#             if currency not in ALLOWED_CURRENCY_CODES:
#                 continue
# 
#             # ***THAY ĐỔI***: Bỏ qua các tác động (impact) không có trong ALLOWED_IMPACT_COLORS
#             if impact not in ALLOWED_IMPACT_COLORS:
#                 continue
# 
#             structured_rows.append([current_date, current_time, currency, impact, event])
# 
#     df = pd.DataFrame(structured_rows, columns=['date', 'time', 'currency', 'impact', 'event'])
#     return df


# def contains_day_or_month(text):
#     """
#     Check if the given text contains a day of the week or a month.
# 
#     Args:
#         text (str): The input text to check.
# 
#     Returns:
#         tuple: A tuple containing a boolean indicating whether a match was found,
#         and the matched text (day or month) if found.
#     """
# 
#     # Regular expressions for days of the week and months
#     days_of_week = r'\b(Mon|Tue|Wed|Thu|Fri|Sat|Sun)\b'
#     months = r'\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\b'
#     pattern = f'({days_of_week}|{months})'
# 
#     match = re.search(pattern, text, re.IGNORECASE)
# 
#     if not match:
#         return False, None
# 
#     matched_text = match.group(0)
#     if re.match(days_of_week, matched_text, re.IGNORECASE):
#         return True, matched_text

def contains_day_or_month(text):
    """
    Report whether ``text`` contains a weekday or month abbreviation.

    Only three-letter abbreviations standing as whole words are recognised,
    case-insensitively; the leftmost occurrence is the one reported.

    Args:
        text (str): The input text to check.

    Returns:
        tuple: ``(True, matched_text)`` on a hit, ``(False, None)`` otherwise.
    """
    weekday_rx = r'\b(Mon|Tue|Wed|Thu|Fri|Sat|Sun)\b'
    month_rx = r'\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\b'

    hit = re.search('(' + weekday_rx + '|' + month_rx + ')', text, re.IGNORECASE)
    return (True, hit.group(0)) if hit else (False, None)



def reformat_scraped_data_final(data, month):
    """
    Reformat scraped calendar rows into a DataFrame.

    Rows are recognised by their length:
      * 8 items: date, time, currency, impact, event, actual, forecast, previous
      * 7 items: time, currency, impact, event, actual, forecast, previous
      * 6 items: currency, impact, event, actual, forecast, previous

    Rows without a date and/or time reuse the most recent value seen in
    an earlier row. Rows of any other length are skipped. A row is kept
    only when its currency is in ALLOWED_CURRENCY_CODES and its impact is
    in ALLOWED_IMPACT_COLORS.

    Args:
        data (list): The scraped data as a list of lists.
        month (str): Month label. Currently unused; kept so existing
            callers do not break.

    Returns:
        pd.DataFrame: One row per kept event with the columns
        date, time, currency, impact, event, actual, forecast, previous.
    """
    columns = ['date', 'time', 'currency', 'impact', 'event',
               'actual', 'forecast', 'previous']
    current_date = ''
    current_time = ''
    structured_rows = []

    for row in data:
        if len(row) == 8:  # Full row: date, time and all event columns
            match, day = contains_day_or_month(row[0])
            if match:
                # Drop the matched day/month token and the line break,
                # keeping the rest of the date cell.
                current_date = row[0].replace(day, "").replace("\n", "")
            current_time = row[1]
            fields = row[2:]
        elif len(row) == 7:  # Row with its own time but no date
            current_time = row[0]
            fields = row[1:]
        elif len(row) == 6:  # Row that shares date and time with the previous one
            fields = row
        else:
            # Unknown shape: nothing can be mapped to columns reliably.
            continue

        currency, impact, event, actual, forecast, previous = fields

        # The filters apply to every row shape, not only 7-item rows.
        # Date/time state above is updated even when the row is filtered
        # out, so later rows still inherit the right values.
        if currency not in ALLOWED_CURRENCY_CODES:
            continue
        if impact not in ALLOWED_IMPACT_COLORS:
            continue

        structured_rows.append([current_date, current_time, currency, impact,
                                event, actual, forecast, previous])

    return pd.DataFrame(structured_rows, columns=columns)

# Reformat `data` (presumably defined in an earlier cell) with the month
# label "August" and show the resulting DataFrame as the cell output.
# NOTE(review): this calls reformat_scraped_data, not the
# reformat_scraped_data_final defined just above - confirm which is intended.
df_result = reformat_scraped_data(data, "August")
df_result


Unnamed: 0,date,time,currency,impact,event,actual,forecast,previous


### Test data: checking `len(row)` for each row

In [154]:
# Sample scraped rows that include the actual / forecast / previous columns.
# Row shapes: 8 items (date + time + event columns), 7 items (time + event
# columns) and 6 items (event columns only).
data_1 = [
    ['Thu\nAug 1', '1:00am', 'USD', 'red', 'Federal Funds Rate', '5.50%', '5.50%', '5.50%'],
    ['USD', 'red', 'FOMC Statement', 'N/A', 'N/A', 'N/A'],
    ['1:30am', 'USD', 'red', 'FOMC Press Conference', 'N/A', 'N/A', 'N/A'],
    ['7:30am', 'JPY', 'yellow', 'Final Manufacturing PMI', '49.1', '49.2', '49.2'],
    ['8:30am', 'AUD', 'yellow', 'Goods Trade Balance', '5.59B', '5.08B', '5.05B'],
    ['AUD', 'yellow', 'Import Prices q/q', '1.0%', '-0.9%', '-1.8%'],
    ['8:45am', 'CNY', 'yellow', 'Caixin Manufacturing PMI', '49.8', '51.4', '51.8'],
    ['All Day', 'CHF', 'gray', 'Bank Holiday', 'N/A', 'N/A', 'N/A'],
    ['1:00pm', 'GBP', 'yellow', 'Nationwide HPI m/m', '0.3%', '0.1%', '0.2%'],
    ['1:30pm', 'AUD', 'yellow', 'Commodity Prices y/y', '-3.0%', 'N/A', '-3.5%'],
    ['2:15pm', 'EUR', 'yellow', 'Spanish Manufacturing PMI', '51.0', '52.5', '52.3'],
    ['2:45pm', 'EUR', 'yellow', 'Italian Manufacturing PMI', '47.4', '46.0', '45.7'],
    ['2:50pm', 'EUR', 'yellow', 'French Final Manufacturing PMI', '44.0', '44.1', '44.1'],
    ['2:55pm', 'EUR', 'yellow', 'German Final Manufacturing PMI', '43.2', '42.6', '42.6'],
    ['3:00pm', 'EUR', 'yellow', 'ECB Economic Bulletin', 'N/A', 'N/A', 'N/A'],
    ['EUR', 'yellow', 'Final Manufacturing PMI', '45.8', '45.6', '45.6'],
    ['EUR', 'yellow', 'Italian Monthly Unemployment Rate', '7.0%', '6.8%', '6.9%'],
    ['3:30pm', 'GBP', 'yellow', 'Final Manufacturing PMI', '52.1', '51.8', '51.8'],
    ['3:50pm', 'EUR', 'yellow', 'Spanish 10-y Bond Auction', '3.11|1.5', 'N/A', '3.19|1.4'],
    ['4:00pm', 'EUR', 'yellow', 'Unemployment Rate', '6.5%', '6.4%', '6.4%'],
    ['4:02pm', 'EUR', 'yellow', 'French 10-y Bond Auction', '3.01|2.2', 'N/A', '3.23|2.4'],
    ['All Day', 'All', 'orange', 'OPEC-JMMC Meetings', 'N/A', 'N/A', 'N/A'],
    ['6:00pm', 'GBP', 'red', 'BOE Monetary Policy Report', 'N/A', 'N/A', 'N/A'],
    ['GBP', 'red', 'Monetary Policy Summary', 'N/A', 'N/A', 'N/A'],
    ['GBP', 'red', 'MPC Official Bank Rate Votes', '0-5-4', '0-6-3', '0-2-7'],
    ['GBP', 'red', 'Official Bank Rate', '5.00%', '5.00%', '5.25%'],
    ['6:30pm', 'GBP', 'red', 'BOE Gov Bailey Speaks', 'N/A', 'N/A', 'N/A'],
    ['USD', 'yellow', 'Challenger Job Cuts y/y', '9.2%', 'N/A', '19.8%'],
    ['7:30pm', 'USD', 'red', 'Unemployment Claims', '249K', '236K', '235K'],
    ['USD', 'yellow', 'Prelim Nonfarm Productivity q/q', '2.3%', '1.7%', '0.2%'],
    ['USD', 'yellow', 'Prelim Unit Labor Costs q/q', '0.9%', '1.8%', '4.0%'],
    ['8:30pm', 'CAD', 'yellow', 'Manufacturing PMI', '47.8', 'N/A', '49.3'],
    ['8:45pm', 'USD', 'orange', 'Final Manufacturing PMI', '49.6', '49.5', '49.5'],
    ['9:00pm', 'USD', 'red', 'ISM Manufacturing PMI', '46.8', '48.8', '48.5'],
    ['USD', 'orange', 'ISM Manufacturing Prices', '52.9', '51.9', '52.1'],
    ['USD', 'yellow', 'Construction Spending m/m', '-0.3%', '0.2%', '-0.4%'],
    ['All Day', 'USD', 'yellow', 'Wards Total Vehicle Sales', '15.8M', '16.1M', '15.3M'],
    ['9:30pm', 'USD', 'yellow', 'Natural Gas Storage', '18B', '30B', '22B'],
    ['11:15pm', 'GBP', 'orange', 'MPC Member Pill Speaks', 'N/A', 'N/A', 'N/A'],
    ['Fri\nAug 2', '6:50am', 'JPY', 'yellow', 'Monetary Base y/y', '1.0%', '0.9%', '0.6%'],
    ['8:30am', 'AUD', 'yellow', 'PPI q/q', '1.0%', '1.0%', '0.9%'],
    ['1:30pm', 'CHF', 'red', 'CPI m/m', '-0.2%', '-0.2%', '0.0%'],
]

# Number of items in each sample row, to see which row shapes occur.
row_lengths = list(map(len, data_1))
row_lengths


[8,
 6,
 7,
 7,
 7,
 6,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 6,
 6,
 7,
 7,
 7,
 7,
 7,
 7,
 6,
 6,
 6,
 7,
 6,
 7,
 6,
 6,
 7,
 7,
 7,
 6,
 6,
 7,
 7,
 7,
 8,
 7,
 7]

In [131]:
# Sample scraped rows without the actual / forecast / previous columns.
# Row shapes: 5 items (date + time + event columns), 4 items (time + event
# columns) and 3 items (event columns only).
data_2 = [
    ['Thu\nAug 1', '1:00am', 'USD', 'red', 'Federal Funds Rate'],
    ['USD', 'red', 'FOMC Statement'],
    ['1:30am', 'USD', 'red', 'FOMC Press Conference'],
    ['7:30am', 'JPY', 'yellow', 'Final Manufacturing PMI'],
    ['8:30am', 'AUD', 'yellow', 'Goods Trade Balance'],
    ['AUD', 'yellow', 'Import Prices q/q'],
    ['8:45am', 'CNY', 'yellow', 'Caixin Manufacturing PMI'],
    ['All Day', 'CHF', 'gray', 'Bank Holiday'],
    ['1:00pm', 'GBP', 'yellow', 'Nationwide HPI m/m'],
    ['1:30pm', 'AUD', 'yellow', 'Commodity Prices y/y'],
    ['2:15pm', 'EUR', 'yellow', 'Spanish Manufacturing PMI'],
    ['2:45pm', 'EUR', 'yellow', 'Italian Manufacturing PMI'],
]

# Number of items in each sample row, to see which row shapes occur.
row_lengths = list(map(len, data_2))
row_lengths


[5, 3, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4]