# Data Cleaning

This notebook focuses on cleaning the data collected for time series forecasting. It handles missing values and ensures the data is formatted correctly as a time series with a datetime index.

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

# Read the raw CSV into a DataFrame
data_path = '../data/raw/your_data_file.csv'  # Update with your raw data file
df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the top five rows of the loaded frame
df.head(n=5)

In [None]:
# Count the missing entries per column, then show only the columns that have any
missing_values = df.isna().sum()
missing_values.loc[missing_values.gt(0)]

In [None]:
# Handle missing values
# Forward fill: each gap takes the most recent preceding value in row order.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated since pandas 2.1.
df = df.ffill()

# Alternatively, you can drop rows with missing values
# df = df.dropna()

# Check again for missing values
# A non-zero total here means some column starts with missing values: forward
# fill has no earlier value to copy into those leading rows.
df.isnull().sum().sum()

In [None]:
# Convert the date column to datetime format and set it as the index.
# The guard keeps this cell safe to re-run: after the first run the date column
# has become the index, so looking it up as a column again would raise KeyError.
date_column = 'date'  # Update 'date' with your actual date column name

if date_column in df.columns:
    df[date_column] = pd.to_datetime(df[date_column])
    df = df.set_index(date_column)
elif df.index.name != date_column:
    # Neither a column nor the current index: the configured name is wrong.
    raise KeyError(
        f"Column {date_column!r} not found; available columns: {list(df.columns)}"
    )

# NOTE(review): the index is not sorted here — confirm the raw file is already
# in chronological order, or add df.sort_index() if it is not.

# Display the cleaned dataframe
df.head()

In [None]:
# Save the cleaned data to the processed directory
processed_data_path = '../data/processed/cleaned_data.csv'  # Update with your desired processed data file name

# to_csv does not create missing folders, so make sure the target directory exists first
Path(processed_data_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_data_path)

# Confirm the data has been saved: read it back with the first column (the
# index written by to_csv) restored as a parsed datetime index
pd.read_csv(processed_data_path, index_col=0, parse_dates=True).head()