In [8]:
import os
import pandas as pd
from datetime import datetime, timedelta

def load_data(file_path):
    """Load the 'Dt' and 'Sta' columns of a CSV file into one DataFrame.

    The file is read in chunks of 10,000 rows so that only the two required
    columns are ever parsed, and each chunk's 'Dt' column is converted to a
    timezone-aware (UTC) datetime before the chunks are concatenated.

    Args:
        file_path: Path of the CSV file to read. It must contain 'Dt' and
            'Sta' columns.

    Returns:
        A DataFrame with the columns 'Dt' (datetime64, UTC) and 'Sta'.
        Values of 'Dt' that cannot be parsed are NaT.
    """
    required_columns = ['Dt', 'Sta']  # Only these columns are parsed
    chunk_list = []

    for chunk in pd.read_csv(file_path, usecols=required_columns, chunksize=10000, low_memory=True):
        # utc=True gives every chunk the same tz-aware dtype, so the chunks
        # concatenate into a single datetime column that can be compared with
        # a UTC cutoff. errors='coerce' turns unparseable values into NaT.
        chunk['Dt'] = pd.to_datetime(chunk['Dt'], errors='coerce', utc=True)
        chunk_list.append(chunk)

    if not chunk_list:
        # pd.concat raises on an empty list; return an empty, typed frame.
        return pd.DataFrame({
            'Dt': pd.to_datetime([], utc=True),
            'Sta': pd.Series([], dtype=object),
        })

    return pd.concat(chunk_list, ignore_index=True)

def filter_data(df, cutoff_date):
    """Return the rows of df whose 'Dt' value is strictly earlier than cutoff_date."""
    # Rows where the comparison is False (including NaT) are dropped.
    is_before_cutoff = df['Dt'] < cutoff_date
    return df.loc[is_before_cutoff]

def sort_data(df):
    """Return a copy of df with its rows ordered by ascending 'Dt'."""
    ordered = df.sort_values(by=['Dt'])
    return ordered

def group_by_station(df):
    """Count the rows of df per 'Sta' value.

    Returns a DataFrame with one row per station and the columns
    'Sta' and 'Count'.
    """
    rows_per_station = df.groupby("Sta").size()
    # Turn the 'Sta' index back into a column and label the sizes 'Count'.
    return rows_per_station.reset_index(name='Count')

def save_station_csvs(station_counts, df_sorted, output_dir=None, save=None):
    """Write one CSV file per station listed in station_counts.

    Each file is named "<station>_entries_<count>.csv" and contains the rows
    of df_sorted whose 'Sta' equals that station.

    Args:
        station_counts: DataFrame with 'Sta' and 'Count' columns.
        df_sorted: DataFrame with a 'Sta' column holding the rows to export.
        output_dir: Directory to write into. Defaults to the original
            hard-coded "station_csvs" folder. Created if it does not exist.
        save: True to write, False to skip. When None (the default) the user
            is asked interactively and only the answer "yes" triggers a save.
    """
    if save is None:
        answer = input("Do you want to save CSV files for each station? (yes/no): ").strip().lower()
        save = answer == 'yes'
    if not save:
        return

    if output_dir is None:
        output_dir = r"/content/drive/MyDrive/N-Beats Final Model (lb = 1200)/station_csvs"
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_dir, exist_ok=True)

    # Group once instead of re-scanning the whole frame for every station.
    rows_by_station = df_sorted.groupby('Sta', sort=False)

    for sta, count in zip(station_counts['Sta'], station_counts['Count']):
        try:
            station_data = rows_by_station.get_group(sta)
        except KeyError:
            # Station listed in station_counts but absent from df_sorted:
            # still write a header-only file, as a plain boolean filter would.
            station_data = df_sorted.iloc[0:0]

        file_name = f"{sta}_entries_{count}.csv"
        file_path = os.path.join(output_dir, file_name)
        station_data.to_csv(file_path, index=False)
        print(f"Saved CSV for station {sta} with {count} entries at {file_path}")

def find_missing_dates(df_sorted):
    """Return the calendar days with no rows between the first and last 'Dt'.

    Args:
        df_sorted: DataFrame with a datetime 'Dt' column (tz-aware or naive).
            NaT values are ignored.

    Returns:
        A tz-naive DatetimeIndex of the days (at midnight) between the
        earliest and latest 'Dt' on which no row exists. Empty when 'Dt' has
        no valid timestamps.
    """
    timestamps = df_sorted['Dt'].dropna()
    if timestamps.empty:
        # min()/max() would be NaT here, which cannot be turned into a date.
        return pd.DatetimeIndex([])

    # Compare like with like: drop the timezone (keeping wall-clock time) and
    # truncate to midnight so the observed days are datetime64 values, the
    # same dtype as the full daily range built below.
    if timestamps.dt.tz is not None:
        timestamps = timestamps.dt.tz_localize(None)
    observed_days = timestamps.dt.normalize()

    full_date_range = pd.date_range(observed_days.min(), observed_days.max(), freq='D')
    present_dates = pd.DatetimeIndex(observed_days.unique())

    missing_dates = full_date_range[~full_date_range.isin(present_dates)]
    return missing_dates

def count_missing_days(missing_dates):
    """Return how many entries missing_dates contains."""
    total_missing = len(missing_dates)
    return total_missing

def find_missing_date_ranges(missing_dates):
    """Collapse missing dates into (start, end) runs of consecutive days.

    missing_dates is walked in order; a date that is exactly one day after
    the previous one extends the current run, anything else starts a new run.
    Returns a list of (start, end) tuples, empty when there are no dates.
    """
    if len(missing_dates) == 0:
        return []

    one_day = timedelta(days=1)
    runs = []
    run_start = run_end = missing_dates[0]

    for current in missing_dates[1:]:
        if current == run_end + one_day:
            # Still consecutive: stretch the current run.
            run_end = current
            continue
        # Gap found: close the current run and open a new one.
        runs.append((run_start, run_end))
        run_start = run_end = current

    # The run still open when the loop ends has not been recorded yet.
    runs.append((run_start, run_end))
    return runs

def process_time_series_data(file_path):
    """Run the full pipeline: load, filter, sort, count, export, find gaps.

    Rows with a 'Dt' value within the last 24 hours (or later) are discarded
    before any statistics are computed. Progress and results are printed, and
    the user may be asked whether per-station CSV files should be written.

    Args:
        file_path: Path of the CSV file to process.

    Returns:
        A tuple (df_sorted, station_counts, missing_dates,
        missing_date_ranges): the filtered rows sorted by 'Dt', the per-station
        row counts, the days without data, and those days grouped into
        (start, end) runs.
    """
    df = load_data(file_path)

    # Take "now" directly in UTC. Localizing a naive local datetime.now() as
    # UTC would shift the cutoff by the machine's UTC offset.
    cutoff_date = pd.Timestamp.now(tz='UTC') - timedelta(days=1)
    if getattr(df['Dt'].dtype, 'tz', None) is None:
        # A tz-naive column cannot be compared with a tz-aware timestamp.
        # NOTE(review): this assumes naive 'Dt' values are UTC wall-clock
        # times — confirm against the data source.
        cutoff_date = cutoff_date.tz_localize(None)

    df_filtered = filter_data(df, cutoff_date)
    df_sorted = sort_data(df_filtered)

    station_counts = group_by_station(df_sorted)
    print("Station Entry Counts:")
    print(station_counts)

    save_station_csvs(station_counts, df_sorted)

    missing_dates = find_missing_dates(df_sorted)
    print("\nMissing Dates:")
    print(missing_dates)

    missing_days_count = count_missing_days(missing_dates)
    print(f"\nCount of Missing Days: {missing_days_count}")

    missing_date_ranges = find_missing_date_ranges(missing_dates)
    print("\nMissing Date Ranges:")
    for start, end in missing_date_ranges:
        # Both ends are inclusive, hence the + 1.
        print(f"From {start} to {end} ({(end - start).days + 1} days)")

    return df_sorted, station_counts, missing_dates, missing_date_ranges

# Path of the aggregated CSV dataset to process (replace with the actual file path).
file_path = r'/content/drive/MyDrive/N-Beats Final Model (lb = 1200)/Aggregated_Data.csv'

# Run the whole pipeline and keep each of its four results under its own name.
(
    processed_df,
    station_entry_counts,
    missing_dates,
    missing_ranges,
) = process_time_series_data(file_path)


ParserError: Error tokenizing data. C error: out of memory

In [None]:
# If you want to ensure the correct date format, you can first load the data with pandas.
# NOTE(review): `pl` is not imported in any visible cell — presumably this needs
# `import polars as pl` to have been run first; confirm before executing.
# NOTE(review): this reads every column of the whole file in one go, unlike the
# chunked, two-column read in load_data.
df_pandas = pd.read_csv(file_path)
df_pandas['Dt'] = pd.to_datetime(df_pandas['Dt'], format='%Y-%m-%dT%H:%M:%S.%fZ', errors='coerce')  # Adjust format as needed
df = pl.from_pandas(df_pandas)  # Convert back to Polars DataFrame if needed