# Forex Factory Historical Data Scraper

This script scrapes one month of historical calendar data from the Forex Factory website. You choose the month by editing the URL in the script; the script then automates the collection of that month's calendar events, including the actual, forecast, and previous values of each economic indicator.

## Key Features:
- **Selenium WebDriver**: Automates the browser to navigate to the Forex Factory calendar page.
- **Data Extraction**: Collects and formats data including the date, time, currency, impact, event, actual, forecast, and previous values.
- **Data Saving**: Saves the formatted data to a CSV file for further analysis.

## Usage:
1. Ensure you have Python installed and set up a virtual environment.
2. Install the required libraries:
   ```bash
   pip install selenium pandas webdriver_manager
   ```
3. Update the `url` variable in the script to the desired month (e.g., `https://www.forexfactory.com/calendar?month=jan.2023`).
4. Run the script to scrape the data and save it to a CSV file in the specified directory.

In [None]:
# Import necessary libraries
from datetime import datetime
import os
import re
import json
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from webdriver_manager.chrome import ChromeDriverManager

In [21]:
# Class attribute values of the table cells the scraper keeps, each paired
# with a field label. Only membership in this mapping is checked while scraping.
ALLOWED_ELEMENT_TYPES = {
    # date cells
    "calendar__cell": "date",
    "calendar__cell calendar__date": "date",
    # per-event descriptive cells
    "calendar__cell calendar__time": "time",
    "calendar__cell calendar__currency": "currency",
    "calendar__cell calendar__impact": "impact",
    "calendar__cell calendar__event event": "event",
    # value cells
    "calendar__cell calendar__actual": "actual",
    "calendar__cell calendar__forecast": "forecast",
    "calendar__cell calendar__previous": "previous",
}

# Class attribute values of cells that should be left out of the scrape
EXCLUDED_ELEMENT_TYPES = [
    "calendar__cell calendar__graph",
]

# Impact icon class -> color name; every icon class shares the same prefix
ICON_COLOR_MAP = {
    f"icon icon--ff-impact-{suffix}": color
    for suffix, color in (
        ("yel", "yellow"),
        ("ora", "orange"),
        ("red", "red"),
        ("gra", "gray"),
    )
}

# Currency codes that survive filtering
ALLOWED_CURRENCY_CODES = "AUD CAD CHF CNY EUR GBP JPY NZD USD".split()

# Impact colors that survive filtering
ALLOWED_IMPACT_COLORS = ["red", "orange", "yellow", "gray"]

# Function to read JSON data from a file
def read_json(path):
    """Load and return the parsed contents of the JSON file at ``path``.

    The file is opened explicitly as UTF-8 so that non-ASCII text decodes the
    same way on every platform instead of depending on the locale's default
    encoding.

    Args:
        path: Location of the JSON file; passed straight to ``open``.

    Returns:
        The object produced by ``json.load`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the contents are not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Function to check if the text contains a day or a month
def contains_day_or_month(text):
    """Report whether ``text`` contains a weekday or month abbreviation.

    Matching is case-insensitive and only accepts whole-word three-letter
    abbreviations (``Mon`` ... ``Sun``, ``Jan`` ... ``Dec``).

    The function always returns a 2-tuple. Previously a text whose first match
    was a month name fell off the end of the function and returned ``None``,
    which made ``found, day = contains_day_or_month(...)`` raise ``TypeError``.

    Args:
        text: The string to inspect.

    Returns:
        ``(True, weekday)`` when a weekday abbreviation is present, where
        ``weekday`` is the text exactly as it appears in ``text``;
        ``(True, "")`` when only a month abbreviation is present -- the empty
        string means a caller that strips the weekday with
        ``text.replace(weekday, "")`` leaves the text unchanged;
        ``(False, None)`` when neither is present.
    """
    days_of_week = r'\b(Mon|Tue|Wed|Thu|Fri|Sat|Sun)\b'
    months = r'\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\b'

    # A weekday takes priority: it is the part callers want to remove.
    day_match = re.search(days_of_week, text, re.IGNORECASE)
    if day_match:
        return True, day_match.group(0)

    if re.search(months, text, re.IGNORECASE):
        return True, ""

    return False, None

# Function to find and categorize patterns in text
def find_pattern_category(text):
    """Locate the first time-like token in ``text`` and name its category.

    Four case-insensitive patterns are recognised: a clock time such as
    ``8:30am`` ("time"), ``Day <n>`` ("day_reference"), an ordinal range such
    as ``10th-15th`` ("date_range") and the word ``Tentative`` ("tentative").

    Args:
        text: The string to search.

    Returns:
        ``(True, category, matched_text)`` for the earliest match in ``text``,
        or ``(False, None, None)`` when nothing matches.
    """
    # Order matters: it is both the alternation order and the lookup order.
    labelled_patterns = (
        ("time", r'\d{1,2}:\d{2}(am|pm)'),
        ("day_reference", r'Day\s+\d+'),
        ("date_range", r'\d{1,2}(st|nd|rd|th)\s*-\s*\d{1,2}(st|nd|rd|th)'),
        ("tentative", r'\bTentative\b'),
    )
    combined = "(" + "|".join(regex for _, regex in labelled_patterns) + ")"

    hit = re.search(combined, text, re.IGNORECASE)
    if hit is None:
        return False, None, None

    found = hit.group(0)
    # Re-test the matched text against each pattern to recover its label.
    label = next(
        (name for name, regex in labelled_patterns
         if re.match(regex, found, re.IGNORECASE)),
        "Unknown",
    )
    return True, label, found

# Function to reformat and save scraped data
def reformat_scraped_data(
    data,
    month,
    source_url=None,
    output_dir="/Users/datpro/Documents/gitdatpro/ff-transform-data/data/bronze/monthly",
):
    """Turn raw scraped rows into a DataFrame and save it as a CSV file.

    Each row is a list of cell values. The last six items are always
    currency, impact, event, actual, forecast and previous. A row of seven
    items starts with a time; a row of eight starts with a date cell followed
    by a time. Rows of any other length are skipped. The most recent date and
    time are carried forward onto rows that do not state their own. Rows whose
    currency is not in ``ALLOWED_CURRENCY_CODES`` or whose impact is not in
    ``ALLOWED_IMPACT_COLORS`` are dropped.

    The CSV is written to ``<output_dir>/<year>_<month>_ff_data.csv``, where
    year and month come from the ``calendar?month=<mon>.<year>`` part of the
    URL, or from the current date when the URL has no such part.

    Args:
        data: Iterable of scraped rows (lists of strings).
        month: Unused; kept so existing ``reformat_scraped_data(data, month)``
            calls keep working.
        source_url: URL the data was scraped from. When omitted, the
            module-level ``url`` variable is used if it exists, which is what
            this function previously always read.
        output_dir: Directory for the CSV file; created if missing. Defaults
            to the path that used to be hard-coded.

    Returns:
        The DataFrame that was written, with columns date, time, currency,
        impact, event, actual, forecast and previous.
    """
    if source_url is None:
        # Backward compatibility with callers that rely on the global `url`.
        source_url = globals().get("url") or ""

    # Extract year and month from the URL or use the current month/year
    url_match = re.search(r'calendar\?month=(\w+)\.(\d{4})', source_url)
    if url_match:
        month_str = url_match.group(1).capitalize()
        year = url_match.group(2)

        month_map = {
            'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
            'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
            'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'
        }
        month_num = month_map.get(month_str, '01')
    else:
        now = datetime.now()
        year = now.strftime("%Y")
        month_num = now.strftime("%m")

    current_date = ''
    current_time = ''
    structured_rows = []

    # Iterate through the data and structure it
    for row in data:
        width = len(row)
        if width not in (6, 7, 8):
            continue

        # Date/time are updated before filtering so that a filtered-out row
        # still sets the date and time inherited by the rows after it.
        if width == 8:
            found, day = contains_day_or_month(row[0])
            if found:
                current_date = row[0].replace(day, "").replace("\n", "")
        if width >= 7:
            current_time = row[-7]

        currency, impact, event = row[-6], row[-5], row[-4]
        actual, forecast, previous = (value or "N/A" for value in row[-3:])

        # Filter by allowed currency codes and impact colors
        if currency not in ALLOWED_CURRENCY_CODES:
            continue
        if impact not in ALLOWED_IMPACT_COLORS:
            continue

        structured_rows.append([current_date, current_time, currency, impact, event, actual, forecast, previous])

    # Create a DataFrame and save to CSV
    df = pd.DataFrame(structured_rows, columns=['date', 'time', 'currency', 'impact', 'event', 'actual', 'forecast', 'previous'])
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"{year}_{month_num}_ff_data.csv")
    df.to_csv(output_path, index=False)
    print("Save .csv successfully")
    return df

# Initialize Chrome WebDriver and scrape data from the website
try:
    driver = webdriver.Chrome()
except Exception:
    # Fall back to a driver binary downloaded by webdriver_manager. Selenium 4
    # takes the driver path through a Service object, not as a positional
    # argument.
    print("AF: No Chrome webdriver installed")
    from selenium.webdriver.chrome.service import Service
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Everything below runs inside try/finally so the browser is closed even when
# scraping or saving fails part-way through.
try:
    # URL of the page to scrape
    url = "https://www.forexfactory.com/calendar?month=jul.2024"
    driver.get(url)

    # Name of the *current* month (not necessarily the scraped one); it is
    # only forwarded to reformat_scraped_data.
    month = datetime.now().strftime("%B")
    table = driver.find_element(By.CLASS_NAME, "calendar__table")

    data = []
    previous_row_count = 0

    # Scroll to the end of the page to load all content
    while True:
        before_scroll = driver.execute_script("return window.pageYOffset;")
        driver.execute_script("window.scrollTo(0, window.pageYOffset + 500);")
        time.sleep(2)
        after_scroll = driver.execute_script("return window.pageYOffset;")
        # The offset stops changing once the bottom of the page is reached.
        if before_scroll == after_scroll:
            break

    # Cells that get an explicit "N/A" placeholder when they are empty
    value_cell_classes = (
        "calendar__cell calendar__actual",
        "calendar__cell calendar__forecast",
        "calendar__cell calendar__previous",
    )

    # Collect data from the table rows
    for row in table.find_elements(By.TAG_NAME, "tr"):
        row_data = []
        for element in row.find_elements(By.TAG_NAME, "td"):
            class_name = element.get_attribute('class')
            if class_name not in ALLOWED_ELEMENT_TYPES:
                continue

            cell_text = element.text
            if cell_text:
                row_data.append(cell_text)
            elif "calendar__impact" in class_name:
                # Impact is shown as an icon, so read it from the span class.
                # Start from the placeholder so a cell without any <span>
                # neither raises NameError nor reuses the previous row's color.
                color = "impact"
                for impact in element.find_elements(By.TAG_NAME, "span"):
                    impact_class = impact.get_attribute("class")
                    color = ICON_COLOR_MAP.get(impact_class, "impact")
                row_data.append(color)
            elif class_name in value_cell_classes:
                row_data.append("N/A")
            # Other empty cells (e.g. date/time) are skipped on purpose: the
            # resulting row length tells reformat_scraped_data what is present.

        if row_data:
            data.append(row_data)

    # Process and save the scraped data
    df = reformat_scraped_data(data, month)

    # Display the resulting DataFrame
    print(df)
finally:
    # Close the Chrome browser after completion
    driver.quit()

# Enhancement: automate downloading multiple months

In [15]:
# Import necessary libraries
from datetime import datetime
import os
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from webdriver_manager.chrome import ChromeDriverManager

# Class attribute values of the table cells the scraper keeps, each paired
# with a field label. Only membership in this mapping is checked while scraping.
ALLOWED_ELEMENT_TYPES = {
    # date cells
    "calendar__cell": "date",
    "calendar__cell calendar__date": "date",
    # per-event descriptive cells
    "calendar__cell calendar__time": "time",
    "calendar__cell calendar__currency": "currency",
    "calendar__cell calendar__impact": "impact",
    "calendar__cell calendar__event event": "event",
    # value cells
    "calendar__cell calendar__actual": "actual",
    "calendar__cell calendar__forecast": "forecast",
    "calendar__cell calendar__previous": "previous",
}

# Class attribute values of cells that should be left out of the scrape
EXCLUDED_ELEMENT_TYPES = [
    "calendar__cell calendar__graph",
]

# Impact icon class -> color name; every icon class shares the same prefix
ICON_COLOR_MAP = {
    f"icon icon--ff-impact-{suffix}": color
    for suffix, color in (
        ("yel", "yellow"),
        ("ora", "orange"),
        ("red", "red"),
        ("gra", "gray"),
    )
}

# Currency codes that survive filtering
ALLOWED_CURRENCY_CODES = "AUD CAD CHF CNY EUR GBP JPY NZD USD".split()

# Impact colors that survive filtering
ALLOWED_IMPACT_COLORS = ["red", "orange", "yellow", "gray"]

# Function to reformat and save scraped data
def reformat_scraped_data(
    data,
    year,
    month_num,
    output_dir="/Users/datpro/Documents/gitdatpro/ff-transform-data/data/bronze/monthly",
):
    """Turn raw scraped rows into a DataFrame and save it as a CSV file.

    Each row is a list of cell values. The last six items are always
    currency, impact, event, actual, forecast and previous. A row of seven
    items starts with a time; a row of eight starts with a date cell followed
    by a time. Rows of any other length are skipped. The most recent date and
    time are carried forward onto rows that do not state their own. Rows whose
    currency is not in ``ALLOWED_CURRENCY_CODES`` or whose impact is not in
    ``ALLOWED_IMPACT_COLORS`` are dropped.

    The CSV is written to ``<output_dir>/<year>_<MM>_ff_data.csv``. The month
    is zero-padded to two digits (``2`` -> ``02``) so the file name matches
    the ``<year>_<MM>`` form produced when the month is taken from a
    ``%m``-formatted date.

    Args:
        data: Iterable of scraped rows (lists of strings).
        year: Year used in the output file name.
        month_num: Month number used in the output file name; an int or a
            numeric string. A non-numeric value is used as given.
        output_dir: Directory for the CSV file; created if missing. Defaults
            to the path that used to be hard-coded.

    Returns:
        The DataFrame that was written, with columns date, time, currency,
        impact, event, actual, forecast and previous.
    """
    try:
        month_label = f"{int(month_num):02d}"
    except (TypeError, ValueError):
        month_label = str(month_num)

    current_date = ''
    current_time = ''
    structured_rows = []

    # Iterate through the data and structure it
    for row in data:
        width = len(row)
        if width not in (6, 7, 8):
            continue

        # Date/time are updated before filtering so that a filtered-out row
        # still sets the date and time inherited by the rows after it.
        if width == 8:
            found, day = contains_day_or_month(row[0])
            if found:
                current_date = row[0].replace(day, "").replace("\n", "")
        if width >= 7:
            current_time = row[-7]

        currency, impact, event = row[-6], row[-5], row[-4]
        actual, forecast, previous = (value or "N/A" for value in row[-3:])

        # Filter by allowed currency codes and impact colors
        if currency not in ALLOWED_CURRENCY_CODES:
            continue
        if impact not in ALLOWED_IMPACT_COLORS:
            continue

        structured_rows.append([current_date, current_time, currency, impact, event, actual, forecast, previous])

    # Create a DataFrame and save to CSV
    df = pd.DataFrame(structured_rows, columns=['date', 'time', 'currency', 'impact', 'event', 'actual', 'forecast', 'previous'])
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"{year}_{month_label}_ff_data.csv")
    df.to_csv(output_path, index=False)

    return df

# Initialize Chrome WebDriver
try:
    driver = webdriver.Chrome()
except Exception:
    # Fall back to a driver binary downloaded by webdriver_manager. Selenium 4
    # takes the driver path through a Service object, not as a positional
    # argument.
    print("AF: No Chrome webdriver installed")
    from selenium.webdriver.chrome.service import Service
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Everything below runs inside try/finally so the browser is closed even when
# one of the months fails part-way through.
try:
    # Cells that get an explicit "N/A" placeholder when they are empty
    value_cell_classes = (
        "calendar__cell calendar__actual",
        "calendar__cell calendar__forecast",
        "calendar__cell calendar__previous",
    )

    # Loop through the months from February 2024 to July 2024
    # (range(2, 8) yields month numbers 2..7)
    for month_num in range(2, 8):
        year = "2024"
        # Format the month to a three-letter abbreviation
        month_str = datetime.strptime(str(month_num), "%m").strftime("%b").lower()

        # Construct the URL for the specific month
        url = f"https://www.forexfactory.com/calendar?month={month_str}.{year}"
        driver.get(url)
        print(f"Accessing data for {month_str.capitalize()} {year}")

        table = driver.find_element(By.CLASS_NAME, "calendar__table")
        data = []

        # Scroll to the end of the page to load all content
        while True:
            before_scroll = driver.execute_script("return window.pageYOffset;")
            driver.execute_script("window.scrollTo(0, window.pageYOffset + 500);")
            time.sleep(2)
            after_scroll = driver.execute_script("return window.pageYOffset;")
            # The offset stops changing once the bottom of the page is reached.
            if before_scroll == after_scroll:
                break

        # Collect data from the table rows
        for row in table.find_elements(By.TAG_NAME, "tr"):
            row_data = []
            for element in row.find_elements(By.TAG_NAME, "td"):
                class_name = element.get_attribute('class')
                if class_name not in ALLOWED_ELEMENT_TYPES:
                    continue

                cell_text = element.text
                if cell_text:
                    row_data.append(cell_text)
                elif "calendar__impact" in class_name:
                    # Impact is shown as an icon, so read it from the span
                    # class. Start from the placeholder so a cell without any
                    # <span> neither raises NameError nor reuses the previous
                    # row's color.
                    color = "impact"
                    for impact in element.find_elements(By.TAG_NAME, "span"):
                        impact_class = impact.get_attribute("class")
                        color = ICON_COLOR_MAP.get(impact_class, "impact")
                    row_data.append(color)
                elif class_name in value_cell_classes:
                    row_data.append("N/A")
                # Other empty cells (e.g. date/time) are skipped on purpose:
                # the resulting row length tells reformat_scraped_data what
                # is present.

            if row_data:
                data.append(row_data)

        # Process and save the scraped data
        df = reformat_scraped_data(data, year, month_num)

        # Display the resulting DataFrame
        print(df.head())
finally:
    # Close the Chrome browser after completion
    driver.quit()
