In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import mplfinance as mpf

In [None]:
# Input/output locations used by the notebook.
# The directory name suggests 1-minute (m1) bars covering 2020-01-01..2024-12-31.
data_dir = "../data/raw/bars/m1_20200101_20241231/"
pkl_dir = "../data/pkl"

# Target path for the pickle file inside pkl_dir
pkl_file_path = os.path.join(pkl_dir, "forex_main_m1_20200101_20241231.pkl")

# Create the pickle directory up front; no error if it already exists
os.makedirs(pkl_dir, exist_ok=True)


In [None]:
# Build one wide frame of close prices: one column per CSV file (named after
# the file), aligned on the timestamp via an outer join so no row is dropped.

# None (rather than an empty DataFrame) marks "nothing merged yet": testing
# `.empty` would also be True for a CSV that has a header but no rows, which
# would make the next file silently overwrite it and lose its column.
merged_df = None

# os.listdir() returns entries in arbitrary order; sorting makes the column
# order of the merged frame reproducible across runs and machines.
for file_name in sorted(os.listdir(data_dir)):
    file_path = os.path.join(data_dir, file_name)
    if not (os.path.isfile(file_path) and file_name.endswith(".csv")):
        continue

    # Extract symbol name from file name (assuming file name format: symbol.csv)
    symbol_name = os.path.splitext(file_name)[0]

    # Parse only the two columns that are used. This also yields a fresh frame,
    # so the column assignment below cannot hit SettingWithCopyWarning.
    df = pd.read_csv(file_path, usecols=["timestamp", "close"])

    # Timestamps are interpreted as epoch milliseconds
    df["timestamp"] = pd.to_datetime(df["timestamp"], unit='ms')

    # Index by timestamp for the join and name the price column after the symbol
    df = df.set_index("timestamp").rename(columns={"close": symbol_name})

    # Merge the DataFrame with the main DataFrame on the timestamp index
    if merged_df is None:
        merged_df = df
    else:
        merged_df = merged_df.join(df, how='outer')

# Fail here with a clear message instead of a KeyError on 'timestamp' later on
if merged_df is None:
    raise FileNotFoundError(f"No .csv files found in {data_dir!r}")

# Reset the index to have the timestamp as a column again
merged_df = merged_df.reset_index()


In [None]:
# Sanity checks on the merged frame; every section prints a heading followed
# by the corresponding value.
ts = merged_df['timestamp']

# Per-column count of missing values
print("Missing values in the dataset:", merged_df.isnull().sum(), sep="\n")

# Number of rows whose timestamp repeats an earlier one
print("\nDuplicate timestamps in the dataset:", ts.duplicated().sum(), sep="\n")

# True only when no timestamp occurs more than once
print("\nAre all timestamps unique?", ts.is_unique, sep="\n")

# True when the timestamps never decrease from one row to the next
print("\nAre the timestamps sorted in ascending order?", ts.is_monotonic_increasing, sep="\n")

# Earliest and latest timestamp present
first_ts, last_ts = ts.min(), ts.max()
print("\nDate range in the dataset:")
print(first_ts, "to", last_ts)

# Summary statistics, useful for spotting anomalous values
print("\nBasic statistics for the 'close' columns:", merged_df.describe(), sep="\n")

# Peek at the top of the frame to verify the layout
print("\nFirst few rows of the merged dataset:", merged_df.head(), sep="\n")


In [None]:
# Keep only the rows in which every column has a value
merged_df_clean = merged_df.dropna()
clean_ts = merged_df_clean['timestamp']

# Seconds elapsed between each remaining row and the one before it; anything
# above 60 is reported as a gap (the data is assumed to have 1-minute intervals)
time_diff = clean_ts.diff().dt.total_seconds()
time_gaps = time_diff.loc[time_diff > 60]
print("\nTime gaps detected (greater than 1 minute):", time_gaps, sep="\n")

# Data-quality summary: row counts before/after, remaining NaNs, sample rows
rows_before = len(merged_df)
rows_after = len(merged_df_clean)
print("\nSummary of data quality:")
print(f"Number of rows before dropping NaN: {rows_before}")
print(f"Number of rows after dropping NaN: {rows_after}")
print("Missing values after cleaning:", merged_df_clean.isnull().sum(), sep="\n")
print("First few rows of the cleaned dataset:", merged_df_clean.head(), sep="\n")

# Earliest and latest timestamp that survived the cleaning
print("\nDate range in the cleaned dataset:")
print(clean_ts.min(), "to", clean_ts.max())

# Ordering check on the cleaned timestamps
print("\nAre the timestamps sorted in ascending order after cleaning?", clean_ts.is_monotonic_increasing, sep="\n")

# Summary statistics of the cleaned frame
print("\nBasic statistics for the 'close' columns after cleaning:", merged_df_clean.describe(), sep="\n")


In [None]:
# Label every row with the id of the gap-free run of timestamps it belongs to.

# Seconds between each row and the previous one (NaN for the first row)
time_diff = merged_df_clean['timestamp'].diff().dt.total_seconds()

# Define a gap threshold (e.g., 60 seconds) to identify the breaks between continuous time
gap_threshold = 60  # You can adjust this if your data uses a different time resolution

# A new group starts at every row whose distance to its predecessor exceeds
# the threshold; the running sum of those flags is the group id. The first
# row's NaN compares as False, so numbering starts at 0.
# .assign() builds a new frame instead of writing a column into the result of
# dropna(), which avoids SettingWithCopyWarning and keeps the cell re-runnable.
merged_df_clean = merged_df_clean.assign(
    timegroup=(time_diff > gap_threshold).cumsum()
)

# Print the first few rows to verify the new column
print(merged_df_clean[['timestamp', 'timegroup']].head())


In [None]:
# Distinct timegroup ids, in order of first appearance
merged_df_clean.loc[:, 'timegroup'].unique()

In [None]:
# Row count per timegroup, i.e. how many rows each gap-free run contains
rows_by_group = merged_df_clean.groupby('timegroup')
timegroup_lengths = rows_by_group.size()

# Show the size of every timegroup under a heading
print("Continuous length of each timegroup:", timegroup_lengths, sep="\n")
