In [1]:
# Load data, modify dates, and split into train/val/test sets
import calendar
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the source CSV into a DataFrame
df = pd.read_csv('./RawBigDS.csv')

# Report the starting dimensions and preview the Date column before any changes
original_shape = df.shape
original_dates_preview = df['Date'].head()
print("Original data shape:", original_shape)
print("Sample of original dates:", original_dates_preview)

# Function to transform dates (keeping the original day where possible,
# changing month to 04 and year to 2025 by default)
def transform_date(date_str, month=4, year=2025):
    """Move a date string to a fixed month and year, keeping its day and format.

    The input is parsed with the first matching format among ``%d-%m-%Y``,
    ``%Y-%m-%d`` and ``%m/%d/%Y``; the result is rendered with that same
    format (so output is zero-padded even if the input was not).

    Args:
        date_str: Date text to rewrite.
        month: Target month number (default 4).
        year: Target year (default 2025).

    Returns:
        The rewritten date string. If ``date_str`` matches none of the
        supported formats (or is not a string), an error line is printed and
        ``date_str`` is returned unchanged.

    Raises:
        ValueError: If ``month``/``year`` are not a valid calendar month/year.
    """
    # Last valid day of the target month; needed because e.g. day 31 does not
    # exist in April and datetime.replace would raise for it.
    last_day = calendar.monthrange(year, month)[1]

    last_error = None
    for format_str in ('%d-%m-%Y', '%Y-%m-%d', '%m/%d/%Y'):
        try:
            original_date = datetime.strptime(date_str, format_str)
        except (ValueError, TypeError) as e:
            # ValueError: text does not match this format.
            # TypeError: value is not a string (e.g. NaN from a missing cell).
            last_error = e
            continue

        # Keep the original day when it exists in the target month; otherwise
        # clamp to the month's last day instead of leaving the row untransformed.
        new_date = original_date.replace(
            year=year,
            month=month,
            day=min(original_date.day, last_day),
        )

        # Return in the format the input was parsed with
        return new_date.strftime(format_str)

    print(f"Error processing date '{date_str}': {last_error}")
    return date_str

# Rewrite every entry of the Date column through transform_date
df['Date'] = df['Date'].apply(transform_date)

# Preview the Date column after the rewrite
print("Sample of transformed dates:", df['Date'].head())

# Persist the date-adjusted frame before splitting
df.to_csv('air_fare_raw.csv', index=False)
print("Saved processed data to air_fare_raw.csv")

# 70/15/15 partition: hold out 30% first, then cut that holdout in half
# for validation and test. Both calls use random_state=42.
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Report each partition's row count and its share of the full frame
total_rows = len(df)
for split_label, split_df in (("Train", train_df), ("Validation", val_df), ("Test", test_df)):
    print(f"{split_label} set: {len(split_df)} rows ({len(split_df)/total_rows:.1%})")

# Write each partition to its own CSV (no index column)
for csv_path, split_df in (
    ('air_fare_train.csv', train_df),
    ('air_fare_val.csv', val_df),
    ('air_fare_test.csv', test_df),
):
    split_df.to_csv(csv_path, index=False)

print("Successfully saved:")
print("- air_fare_train.csv (70% of data)")
print("- air_fare_val.csv (15% of data)")
print("- air_fare_test.csv (15% of data)")

Original data shape: (12781, 10)
Sample of original dates: 0    15-5-2023
1    15-5-2023
2    15-5-2023
3    15-5-2023
4    15-5-2023
Name: Date, dtype: object
Sample of transformed dates: 0    15-04-2025
1    15-04-2025
2    15-04-2025
3    15-04-2025
4    15-04-2025
Name: Date, dtype: object
Saved processed data to air_fare_raw.csv
Train set: 8946 rows (70.0%)
Validation set: 1917 rows (15.0%)
Test set: 1918 rows (15.0%)
Successfully saved:
- air_fare_train.csv (70% of data)
- air_fare_val.csv (15% of data)
- air_fare_test.csv (15% of data)
