In [1]:
# %% [1] required libraries
import os
import sys

import numpy as np


In [18]:
# Add the parent directory of 'source' to the Python path
sys.path.append(os.path.abspath("../source"))

In [19]:
# Enable IterativeImputer
#local script import
from source.utils.config_loader import load_config
from source.utils.logger import setup_logger
from source.data_preprocessing import (
    load_data,
    fill_missing_values,
    scale_features,
    visualize_missing_values,
    correlation_analysis,
    detect_outliers,
    basic_info,
)



In [20]:
# Paths
RAW_DIR = "../data/raw"
PROCESSED_DIR = "../data/processed"
PLOTS_DIR = "../plots/"
LOG_DIR = "../logs/"

In [21]:
# Logger setup
logger = setup_logger(name="data_preprocessing", log_file=os.path.join(LOG_DIR, "data_preprocessing.log"),
                      log_level="INFO")

In [22]:
# Config file path
CONFIG_PATH = "../config/settings.yml"
config = load_config(CONFIG_PATH)

#  Load the configuration file
RAW_DIR = config["paths"]["raw_dir"]
PROCESSED_DIR = config["paths"]["processed_dir"]
PLOTS_DIR = config["paths"].get("plots_dir", "../plots")


In [23]:
#  Check if the directories exist and create them if they don't
os.makedirs(PLOTS_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)
logger.info("Notebook initialized.")

2025-01-08 17:20:41,147 - data_preprocessing - INFO - Notebook initialized.
2025-01-08 17:20:41,147 - data_preprocessing - INFO - Notebook initialized.
2025-01-08 17:20:41,147 - data_preprocessing - INFO - Notebook initialized.
2025-01-08 17:20:41,147 - data_preprocessing - INFO - Notebook initialized.
INFO:data_preprocessing:Notebook initialized.


In [25]:
# %% Data Loading
file_path = os.path.join(PROCESSED_DIR, "combined_data.csv")

try:
    df = load_data(file_path)
    logger.info("Data successfully loaded.")
except Exception as e:
    logger.error(f"Error loading data: {e}")
    raise

#  basic information
info = basic_info(df)
print(info)


2025-01-08 17:25:07,956 - data_preprocessing - INFO - Loading data from ../data/processed/combined_data.csv
2025-01-08 17:25:07,956 - data_preprocessing - INFO - Loading data from ../data/processed/combined_data.csv
2025-01-08 17:25:07,956 - data_preprocessing - INFO - Loading data from ../data/processed/combined_data.csv
2025-01-08 17:25:07,956 - data_preprocessing - INFO - Loading data from ../data/processed/combined_data.csv
INFO:data_preprocessing:Loading data from ../data/processed/combined_data.csv
  data = pd.read_csv(file_path)
2025-01-08 17:25:09,684 - data_preprocessing - INFO - Data loaded successfully
2025-01-08 17:25:09,684 - data_preprocessing - INFO - Data loaded successfully
2025-01-08 17:25:09,684 - data_preprocessing - INFO - Data loaded successfully
2025-01-08 17:25:09,684 - data_preprocessing - INFO - Data loaded successfully
INFO:data_preprocessing:Data loaded successfully
2025-01-08 17:25:09,686 - data_preprocessing - INFO - Data successfully loaded.
2025-01-08 17

{'Shape': (1458216, 26), 'Columns': ['Date', 'Source', 'Site ID', 'POC', 'Daily Max 1-hour SO2 Concentration', 'Units', 'Daily AQI Value', 'Local Site Name', 'Daily Obs Count', 'Percent Complete', 'AQS Parameter Code', 'AQS Parameter Description', 'Method Code', 'CBSA Code', 'CBSA Name', 'State FIPS Code', 'State', 'County FIPS Code', 'County', 'Site Latitude', 'Site Longitude', 'Daily Max 8-hour Ozone Concentration', 'Daily Mean PM2.5 Concentration', 'Method Description', 'Daily Max 8-hour CO Concentration', 'Daily Max 1-hour NO2 Concentration'], 'Data Types': {'Date': dtype('O'), 'Source': dtype('O'), 'Site ID': dtype('int64'), 'POC': dtype('int64'), 'Daily Max 1-hour SO2 Concentration': dtype('float64'), 'Units': dtype('O'), 'Daily AQI Value': dtype('int64'), 'Local Site Name': dtype('O'), 'Daily Obs Count': dtype('int64'), 'Percent Complete': dtype('float64'), 'AQS Parameter Code': dtype('int64'), 'AQS Parameter Description': dtype('O'), 'Method Code': dtype('float64'), 'CBSA Code'

In [26]:
# %%  Visualizing Missing Values
logger.info("Visualizing missing values.")
visualize_missing_values(df)


2025-01-08 17:25:15,556 - data_preprocessing - INFO - Visualizing missing values.
2025-01-08 17:25:15,556 - data_preprocessing - INFO - Visualizing missing values.
2025-01-08 17:25:15,556 - data_preprocessing - INFO - Visualizing missing values.
2025-01-08 17:25:15,556 - data_preprocessing - INFO - Visualizing missing values.
INFO:data_preprocessing:Visualizing missing values.
2025-01-08 17:25:15,559 - data_preprocessing - INFO - Visualizing missing values.
2025-01-08 17:25:15,559 - data_preprocessing - INFO - Visualizing missing values.
2025-01-08 17:25:15,559 - data_preprocessing - INFO - Visualizing missing values.
2025-01-08 17:25:15,559 - data_preprocessing - INFO - Visualizing missing values.
INFO:data_preprocessing:Visualizing missing values.
2025-01-08 17:25:18,277 - data_preprocessing - INFO - Plot saved to /Users/dogaaydin/PycharmProjects/Geo_Sentiment_Climate/plots/missing_values_matrix.png
2025-01-08 17:25:18,277 - data_preprocessing - INFO - Plot saved to /Users/dogaaydin/

In [27]:
# %%  Filling Missing Values
logger.info("Filling missing values using mean method.")
df = fill_missing_values(df, method="mean")

#  basic information
info_after_filling = basic_info(df)
print(info_after_filling)


2025-01-08 17:25:23,401 - data_preprocessing - INFO - Filling missing values using mean method.
2025-01-08 17:25:23,401 - data_preprocessing - INFO - Filling missing values using mean method.
2025-01-08 17:25:23,401 - data_preprocessing - INFO - Filling missing values using mean method.
2025-01-08 17:25:23,401 - data_preprocessing - INFO - Filling missing values using mean method.
INFO:data_preprocessing:Filling missing values using mean method.
2025-01-08 17:25:23,404 - data_preprocessing - INFO - Filling missing values using method: mean
2025-01-08 17:25:23,404 - data_preprocessing - INFO - Filling missing values using method: mean
2025-01-08 17:25:23,404 - data_preprocessing - INFO - Filling missing values using method: mean
2025-01-08 17:25:23,404 - data_preprocessing - INFO - Filling missing values using method: mean
INFO:data_preprocessing:Filling missing values using method: mean
2025-01-08 17:25:23,745 - data_preprocessing - INFO - Missing values filled successfully.
2025-01-08

{'Shape': (1458216, 26), 'Columns': ['Date', 'Source', 'Site ID', 'POC', 'Daily Max 1-hour SO2 Concentration', 'Units', 'Daily AQI Value', 'Local Site Name', 'Daily Obs Count', 'Percent Complete', 'AQS Parameter Code', 'AQS Parameter Description', 'Method Code', 'CBSA Code', 'CBSA Name', 'State FIPS Code', 'State', 'County FIPS Code', 'County', 'Site Latitude', 'Site Longitude', 'Daily Max 8-hour Ozone Concentration', 'Daily Mean PM2.5 Concentration', 'Method Description', 'Daily Max 8-hour CO Concentration', 'Daily Max 1-hour NO2 Concentration'], 'Data Types': {'Date': dtype('O'), 'Source': dtype('O'), 'Site ID': dtype('int64'), 'POC': dtype('int64'), 'Daily Max 1-hour SO2 Concentration': dtype('float64'), 'Units': dtype('O'), 'Daily AQI Value': dtype('int64'), 'Local Site Name': dtype('O'), 'Daily Obs Count': dtype('int64'), 'Percent Complete': dtype('float64'), 'AQS Parameter Code': dtype('int64'), 'AQS Parameter Description': dtype('O'), 'Method Code': dtype('float64'), 'CBSA Code'

In [28]:
# Scaling Numeric Columns
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

logger.info("Scaling numeric columns using standard method.")
df = scale_features(df, numeric_cols, method="standard")


2025-01-08 17:25:28,906 - data_preprocessing - INFO - Scaling numeric columns using standard method.
2025-01-08 17:25:28,906 - data_preprocessing - INFO - Scaling numeric columns using standard method.
2025-01-08 17:25:28,906 - data_preprocessing - INFO - Scaling numeric columns using standard method.
2025-01-08 17:25:28,906 - data_preprocessing - INFO - Scaling numeric columns using standard method.
INFO:data_preprocessing:Scaling numeric columns using standard method.
2025-01-08 17:25:28,909 - data_preprocessing - INFO - Scaling features: ['Site ID', 'POC', 'Daily Max 1-hour SO2 Concentration', 'Daily AQI Value', 'Daily Obs Count', 'Percent Complete', 'AQS Parameter Code', 'Method Code', 'CBSA Code', 'State FIPS Code', 'County FIPS Code', 'Site Latitude', 'Site Longitude', 'Daily Max 8-hour Ozone Concentration', 'Daily Mean PM2.5 Concentration', 'Daily Max 8-hour CO Concentration', 'Daily Max 1-hour NO2 Concentration'] using method: standard
2025-01-08 17:25:28,909 - data_preprocessi

In [29]:
#  Correlation Analysis
logger.info("Performing correlation analysis.")
correlation_analysis(df, numeric_cols)


2025-01-08 17:25:32,525 - data_preprocessing - INFO - Performing correlation analysis.
2025-01-08 17:25:32,525 - data_preprocessing - INFO - Performing correlation analysis.
2025-01-08 17:25:32,525 - data_preprocessing - INFO - Performing correlation analysis.
2025-01-08 17:25:32,525 - data_preprocessing - INFO - Performing correlation analysis.
INFO:data_preprocessing:Performing correlation analysis.
2025-01-08 17:25:32,528 - data_preprocessing - INFO - Performing correlation analysis.
2025-01-08 17:25:32,528 - data_preprocessing - INFO - Performing correlation analysis.
2025-01-08 17:25:32,528 - data_preprocessing - INFO - Performing correlation analysis.
2025-01-08 17:25:32,528 - data_preprocessing - INFO - Performing correlation analysis.
INFO:data_preprocessing:Performing correlation analysis.
2025-01-08 17:25:33,573 - data_preprocessing - INFO - Plot saved to /Users/dogaaydin/PycharmProjects/Geo_Sentiment_Climate/plots/correlation_matrix.png
2025-01-08 17:25:33,573 - data_preproc

In [30]:
# Detecting Outliers
logger.info("Detecting outliers in numeric columns.")
detect_outliers(df, numeric_cols)


2025-01-08 17:25:35,444 - data_preprocessing - INFO - Detecting outliers in numeric columns.
2025-01-08 17:25:35,444 - data_preprocessing - INFO - Detecting outliers in numeric columns.
2025-01-08 17:25:35,444 - data_preprocessing - INFO - Detecting outliers in numeric columns.
2025-01-08 17:25:35,444 - data_preprocessing - INFO - Detecting outliers in numeric columns.
INFO:data_preprocessing:Detecting outliers in numeric columns.
2025-01-08 17:25:35,448 - data_preprocessing - INFO - Detecting outliers in numeric columns.
2025-01-08 17:25:35,448 - data_preprocessing - INFO - Detecting outliers in numeric columns.
2025-01-08 17:25:35,448 - data_preprocessing - INFO - Detecting outliers in numeric columns.
2025-01-08 17:25:35,448 - data_preprocessing - INFO - Detecting outliers in numeric columns.
INFO:data_preprocessing:Detecting outliers in numeric columns.
2025-01-08 17:25:35,526 - data_preprocessing - INFO - Plot saved to /Users/dogaaydin/PycharmProjects/Geo_Sentiment_Climate/plots/o

In [31]:
#  Data Preprocessing
output_file = os.path.join(PROCESSED_DIR, "epa_preprocessed.csv")

try:
    df.to_csv(output_file, index=False)
    logger.info(f"Preprocessed data saved successfully at: {output_file}")
except Exception as e:
    logger.error(f"Error saving preprocessed data: {e}")
    raise


2025-01-08 17:25:58,438 - data_preprocessing - INFO - Preprocessed data saved successfully at: ../data/processed/epa_preprocessed.csv
2025-01-08 17:25:58,438 - data_preprocessing - INFO - Preprocessed data saved successfully at: ../data/processed/epa_preprocessed.csv
2025-01-08 17:25:58,438 - data_preprocessing - INFO - Preprocessed data saved successfully at: ../data/processed/epa_preprocessed.csv
2025-01-08 17:25:58,438 - data_preprocessing - INFO - Preprocessed data saved successfully at: ../data/processed/epa_preprocessed.csv
INFO:data_preprocessing:Preprocessed data saved successfully at: ../data/processed/epa_preprocessed.csv
