# In [1]:
import h5py
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import logging


# In [2]:
# Configure the root logger: records at INFO and above are written to
# preprocessing.log, each prefixed with a timestamp and its level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='preprocessing.log',
)

# Load the raw dataset from a CSV source
def load_data(file_path):
    """Read a CSV into a DataFrame, logging before and after the read.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts as its first
            argument (the script below passes a URL string).

    Returns:
        The DataFrame produced by ``pandas.read_csv`` with default options.
    """
    logging.info(f"Loading data from {file_path}")
    frame = pd.read_csv(file_path)
    logging.info("Data loaded successfully")
    return frame

# Clean the data and normalize it using MinMaxScaler
def clean_and_normalize_data(data):
    """Fill missing values, then rescale every column with ``MinMaxScaler``.

    Missing values are forward-filled first; a backward fill then covers any
    NaNs left at the start of a column, where there is no earlier value to
    carry forward.

    Note:
        The fill step modifies ``data`` in place, so the caller's DataFrame
        is NaN-filled after this call (the scaling is not applied to it).

    Args:
        data: DataFrame to clean. It is passed whole to
            ``MinMaxScaler.fit_transform``, so presumably every column must
            be numeric -- confirm against the dataset.

    Returns:
        A ``(data_scaled, scaler)`` tuple: a new DataFrame holding the scaled
        values under the original column labels (with a fresh default index),
        and the fitted ``MinMaxScaler`` so the transform can be reused or
        inverted later.
    """
    logging.info("Starting data cleaning and normalization")

    # Step 1: Data Cleaning
    # Use the dedicated ffill/bfill methods: the ``method=`` keyword of
    # ``fillna`` is deprecated in recent pandas releases.
    data.ffill(inplace=True)  # Forward fill NaN values
    data.bfill(inplace=True)  # Backward fill whatever forward fill could not reach

    # Step 2: Normalize the data using MinMaxScaler (default settings)
    scaler = MinMaxScaler()
    data_scaled = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)
    logging.info("Data cleaned and normalized successfully")
    return data_scaled, scaler

# Split the data into training and test sets
def split_data(data, train_size=0.8):
    """Cut ``data`` into a leading training part and a trailing test part.

    The split is purely positional (no shuffling), so the original ordering
    of the rows is kept in both parts.

    Args:
        data: Any sliceable sequence with a length (DataFrame, array, list).
        train_size: Fraction of ``data`` assigned to the training part; the
            cut-off index is ``int(len(data) * train_size)``, i.e. truncated.

    Returns:
        A ``(train_data, test_data)`` tuple of slices of ``data``.
    """
    logging.info(f"Splitting data with a train size of {train_size*100}%")
    cutoff = int(len(data) * train_size)
    train_data, test_data = data[:cutoff], data[cutoff:]
    logging.info("Data split into training and test sets successfully")
    return train_data, test_data

# Create sequences for time-series data for LSTM/GRU models
def create_sequences(data, time_steps):
    """Build sliding-window inputs and next-step targets from a 2-D series.

    Window ``i`` consists of rows ``i`` .. ``i + time_steps - 1`` and its
    target is row ``i + time_steps``, giving ``len(data) - time_steps``
    samples in total.

    Args:
        data: 2-D array-like indexed as ``[row, column]``; converted with
            ``np.asarray`` so DataFrames and nested lists are accepted too.
        time_steps: Number of consecutive rows per input window (>= 0).

    Returns:
        A ``(input_sequences, next_step_targets)`` tuple of arrays with
        shapes ``(n, time_steps, n_columns)`` and ``(n, n_columns)``.
        When ``data`` has too few rows to form a single sample, ``n`` is 0
        but the trailing dimensions are still present.

    Raises:
        ValueError: If ``time_steps`` is negative.
    """
    logging.info(f"Creating sequences with time steps = {time_steps}")
    if time_steps < 0:
        raise ValueError(f"time_steps must be non-negative, got {time_steps}")

    data = np.asarray(data)
    n_samples = len(data) - time_steps

    if n_samples <= 0:
        # np.array([]) would collapse to shape (0,); keep the window and
        # feature axes so downstream code sees a consistent rank.
        trailing = data.shape[1:]
        input_sequences = np.empty((0, time_steps) + trailing, dtype=data.dtype)
        next_step_targets = np.empty((0,) + trailing, dtype=data.dtype)
    else:
        input_sequences = np.array(
            [data[start:start + time_steps, :] for start in range(n_samples)]
        )
        next_step_targets = np.array(
            [data[start + time_steps, :] for start in range(n_samples)]
        )

    logging.info("Sequences created successfully")
    return input_sequences, next_step_targets



# Example usage for preprocessing: load -> clean/normalize -> split ->
# build sequences -> save the resulting arrays as .npy files.
if __name__ == "__main__":
    try:
        # Load the data
        df = load_data('https://raw.githubusercontent.com/gmukku/AI_Ops_Project/main/dataset/metr_la_data_with_headers.csv')  # Modify with the correct file path

        # Clean and normalize the data
        data_scaled, scaler = clean_and_normalize_data(df)

        # Split the data into train and test sets
        train_data, test_data = split_data(data_scaled)

        # Set time steps for sequence creation
        time_steps = 10

        # Create sequences for training and testing data
        # (.values hands create_sequences the plain 2-D NumPy array it indexes into)
        input_sequences_train, next_step_targets_train = create_sequences(train_data.values, time_steps)
        input_sequences_test, next_step_targets_test = create_sequences(test_data.values, time_steps)

        # Save the preprocessed data to .npy files
        np.save('input_sequences_train.npy', input_sequences_train)
        np.save('next_step_targets_train.npy', next_step_targets_train)
        np.save('input_sequences_test.npy', input_sequences_test)
        np.save('next_step_targets_test.npy', next_step_targets_test)

        logging.info("Data preprocessing complete and saved to files.")

    except Exception as e:
        # logging.exception records the full traceback alongside the message;
        # re-raise so a failed run is not mistaken for a successful one.
        logging.exception(f"An error occurred: {e}")
        raise