In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import sys
from utils.logger import setup_logger
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from source.utils.config_loader import load_config


In [2]:
# Project-specific local script imports: expose ../source on the import path.
# The membership check keeps the cell idempotent, so re-running it does not
# append duplicate entries to sys.path.
# NOTE(review): the import cell above already imports from `utils` and `source`
# before this path tweak runs; on a fresh kernel this likely needs to execute
# first -- confirm with Restart Kernel -> Run All.
SOURCE_DIR = os.path.abspath("../source")
if SOURCE_DIR not in sys.path:
    sys.path.append(SOURCE_DIR)

In [4]:
# Location of the project settings file, relative to the notebook directory.
CONFIG_PATH = "../config/settings.yml"

try:
    # Load the config file
    config = load_config(CONFIG_PATH)
    print(config)

    # Read the directories from the config; raw/processed are mandatory keys,
    # plots/logs fall back to defaults when absent.
    RAW_DIR = config["paths"]["raw_dir"]
    PROCESSED_DIR = config["paths"]["processed_dir"]
    PLOTS_DIR = config["paths"].get("plots_dir", "../plots")
    LOG_DIR = config["paths"].get("logs_dir", "../logs")

    # Make sure the output directories exist, creating them if needed
    os.makedirs(PLOTS_DIR, exist_ok=True)
    os.makedirs(LOG_DIR, exist_ok=True)

    # Create the logger
    logger = setup_logger(name="data_preprocessing", log_file=os.path.join(LOG_DIR, "data_preprocessing.log"),
                          log_level="INFO")
    logger.info("Config dosyası ve dizinler başarıyla yüklendi.")
except KeyError as e:
    # Chain explicitly so the traceback marks the KeyError as the direct cause.
    raise ValueError(f"Config dosyasındaki bir anahtar eksik: {e}") from e
except Exception as e:
    # Any other failure in this cell (loading, directory creation, logger setup)
    # is surfaced as a RuntimeError with the original exception as its cause.
    raise RuntimeError(f"Config yüklenirken hata oluştu: {e}") from e



2025-01-10 16:25:36,653 - data_preprocessing - INFO - Config dosyası ve dizinler başarıyla yüklendi.
2025-01-10 16:25:36,653 - data_preprocessing - INFO - Config dosyası ve dizinler başarıyla yüklendi.
INFO:data_preprocessing:Config dosyası ve dizinler başarıyla yüklendi.


{'project_name': 'Geo_Sentiment_Climate', 'paths': {'raw_dir': '../data/raw/', 'interim_dir': '../data/interim/', 'processed_dir': '../data/processed/', 'metadata_dir': '../data/metadata/', 'plots_dir': '../plots/', 'logs_dir': '../logs/', 'archive_dir': '../data/archive/'}, 'rename_map_common': {'Date': 'date', 'Units': 'units', 'Daily AQI Value': 'aqi', 'State FIPS Code': 'state_fips', 'County FIPS Code': 'county_fips'}, 'rename_map_param': {'so2': 'Daily Max 1-hour SO2 Concentration', 'o3': 'Daily Max 8-hour Ozone Concentration', 'co': 'Daily Max 8-hour CO Concentration', 'no2': 'Daily Max 1-hour NO2 Concentration', 'pm25': 'Daily Mean PM2.5 Concentration'}, 'data_check': {'required_columns': ['Date', 'Source', 'Site ID', 'POC', 'Daily Max 1-hour SO2 Concentration', 'Units', 'Daily AQI Value', 'Local Site Name', 'Daily Obs Count', 'Percent Complete', 'AQS Parameter Code', 'AQS Parameter Description', 'Method Code', 'CBSA Code', 'CBSA Name', 'State FIPS Code', 'State', 'County FIPS C

In [5]:
def load_data(file_path):
    """Read a CSV file into a DataFrame, logging the attempt and its outcome.

    Args:
        file_path: Path of the CSV file, passed to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        Exception: Any error raised while loading is logged and re-raised unchanged.
    """
    logger.info(f"Loading data from {file_path}")
    try:
        frame = pd.read_csv(file_path)
        logger.info("Data loaded successfully")
    except Exception as err:
        logger.error(f"Error loading data: {err}")
        raise
    return frame


def save_plot(plot, filename):
    """Save a plot under ``PLOTS_DIR`` and close the figure afterwards.

    Args:
        plot: Any object exposing ``savefig(path)``: a matplotlib ``Figure``
            or the ``matplotlib.pyplot`` module itself (which saves the
            current figure).
        filename: File name, joined onto ``PLOTS_DIR``.
    """
    filepath = os.path.join(PLOTS_DIR, filename)
    plot.savefig(filepath)
    logger.info(f"Plot saved to {filepath}")
    # plt.close() only accepts a Figure, a figure number, a label or None and
    # raises TypeError for anything else (e.g. the pyplot module). For
    # non-Figure inputs close the current figure, which is the one that
    # pyplot-level savefig just wrote.
    if isinstance(plot, plt.Figure):
        plt.close(plot)
    else:
        plt.close()


def basic_info(df):
    """Collect and log a structural summary of a DataFrame.

    Args:
        df: The DataFrame to summarise.

    Returns:
        A dict with the keys ``"Shape"`` (``df.shape``), ``"Columns"`` (list of
        column labels), ``"Data Types"`` (column -> dtype) and
        ``"Missing Values"`` (column -> number of null entries).
    """
    logger.info("Generating basic info of the dataset")
    null_counts = df.isnull().sum()
    summary = {}
    summary["Shape"] = df.shape
    summary["Columns"] = df.columns.tolist()
    summary["Data Types"] = df.dtypes.to_dict()
    summary["Missing Values"] = null_counts.to_dict()
    logger.info(f"Dataset Info: {summary}")
    return summary


def visualize_missing_values(df, save=True):
    """Draw the missingno nullity matrix for ``df`` and optionally save it.

    Args:
        df: DataFrame whose missing-value pattern is visualised.
        save: When true, the figure is written to
            ``PLOTS_DIR/missing_values_matrix.png`` before being shown.

    Raises:
        Exception: Any plotting or saving error is logged and re-raised.
    """
    logger.info("Visualizing missing values.")
    try:
        # msno.matrix opens its own figure, so the size is passed to it
        # directly; a separate plt.figure() call would only leave an empty,
        # never-closed figure behind.
        msno.matrix(df, figsize=(10, 6))
        if save:
            plot_path = os.path.join(PLOTS_DIR, "missing_values_matrix.png")
            # The figure created by msno.matrix is the current figure here.
            plt.savefig(plot_path)
            logger.info(f"Missing values matrix plot saved to {plot_path}")
        plt.show()
    except Exception as e:
        logger.error(f"Error in missing values visualization: {e}")
        raise


def fill_missing_values(df, method="mean"):
    """Return a copy of ``df`` with missing values imputed.

    The input frame is never modified, whichever method is chosen, so the
    calling cell can be re-run safely.

    Args:
        df: DataFrame to impute.
        method: ``"mean"`` or ``"median"`` fill numeric columns with the
            per-column statistic; ``"mode"`` fills every column with its first
            mode row.

    Returns:
        A new DataFrame with the missing values filled. Columns for which no
        fill value exists (e.g. entirely null) are left as they are.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
        Exception: Any other error is logged and re-raised.
    """
    logger.info(f"Filling missing values using method: {method}")
    try:
        if method not in ("mean", "median", "mode"):
            raise ValueError("Invalid method for missing value handling")

        result = df.copy()
        if method == "mode":
            modes = result.mode()
            # mode() has no rows when the frame is empty or all-null;
            # indexing row 0 would raise IndexError in that case.
            if not modes.empty:
                result = result.fillna(modes.iloc[0])
        else:
            numeric_cols = result.select_dtypes(include=[np.number]).columns
            if len(numeric_cols) > 0:
                numeric = result[numeric_cols]
                fill_values = numeric.mean() if method == "mean" else numeric.median()
                result[numeric_cols] = numeric.fillna(fill_values)
        logger.info("Missing values filled successfully.")
        return result
    except Exception as e:
        logger.error(f"Error filling missing values: {e}")
        raise


def remove_null_values(df, subset_cols):
    """Drop the rows of ``df`` that have a null in any of ``subset_cols``.

    Args:
        df: Source DataFrame; it is not modified.
        subset_cols: Column labels passed as ``subset`` to ``DataFrame.dropna``.

    Returns:
        The DataFrame returned by ``dropna``.

    Raises:
        Exception: Any error raised by ``dropna`` is logged and re-raised.
    """
    logger.info("Removing rows with null values")
    try:
        without_nulls = df.dropna(subset=subset_cols)
        logger.info("Null values removed successfully")
    except Exception as err:
        logger.error(f"Error in removing null values: {err}")
        raise
    return without_nulls


def distribution_analysis(df, numeric_cols):
    """Show a histogram with a KDE overlay for each requested column.

    Errors are logged and not re-raised, so a failure stops the remaining
    plots without interrupting the caller.

    Args:
        df: DataFrame holding the data.
        numeric_cols: Iterable of column labels to plot, one figure each.
    """
    logger.info("Performing distribution analysis for numeric columns")
    try:
        for col in numeric_cols:
            # Explicit figure/axes so each plot can be closed deterministically.
            fig, ax = plt.subplots(figsize=(8, 4))
            try:
                sns.histplot(df[col], kde=True, bins=30, color="blue", ax=ax)
                ax.set_title(f"Distribution of {col}")
                plt.show()
            finally:
                # One figure is opened per column; close it so open figures do
                # not accumulate over long column lists.
                plt.close(fig)
    except Exception as e:
        logger.error(f"Error in distribution analysis: {e}")


def scale_features(df, numeric_cols, method="standard"):
    """Scale the given columns of ``df`` in place and return the frame.

    Args:
        df: DataFrame to scale. The selected columns are overwritten on this
            same object.
        numeric_cols: Column labels to scale.
        method: ``"standard"`` selects ``StandardScaler``. Every other value
            selects ``MinMaxScaler``; values other than ``"minmax"``,
            ``"min-max"`` or ``"min_max"`` additionally log a warning so that
            a typo does not switch scalers unnoticed.

    Returns:
        ``df`` itself, with the selected columns replaced by scaled values.

    Raises:
        Exception: Any scaling error is logged and re-raised.
    """
    logger.info(f"Scaling features: {numeric_cols} using method: {method}")
    try:
        if len(numeric_cols) == 0:
            # Nothing to scale; skip the scaler call on an empty column selection.
            logger.warning("No columns given for scaling; returning data unchanged.")
            return df

        if method == "standard":
            scaler = StandardScaler()
        else:
            if method not in ("minmax", "min-max", "min_max"):
                # Fallback kept for backward compatibility, but made visible.
                logger.warning(f"Unrecognized scaling method '{method}'; falling back to MinMaxScaler.")
            scaler = MinMaxScaler()

        df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
        logger.info("Features scaled successfully.")
        return df
    except Exception as e:
        logger.error(f"Error scaling features: {e}")
        raise


def correlation_analysis(df, numeric_cols):
    """Render the correlation heatmap of the given columns and save it to disk.

    The annotated heatmap is written to ``PLOTS_DIR/correlation_matrix.png``
    and the figure is closed afterwards. Errors are logged and not re-raised.

    Args:
        df: DataFrame holding the data.
        numeric_cols: Column labels whose pairwise correlations are plotted.
    """
    logger.info("Performing correlation analysis.")
    try:
        correlations = df[numeric_cols].corr()
        figure, axes = plt.subplots(figsize=(10, 8))
        sns.heatmap(correlations, annot=True, cmap="coolwarm", fmt=".2f", ax=axes)
        output_path = os.path.join(PLOTS_DIR, "correlation_matrix.png")
        axes.set_title("Correlation Matrix")
        figure.savefig(output_path)
        logger.info(f"Correlation matrix plot saved to {output_path}")
        plt.close(figure)
    except Exception as err:
        logger.error(f"Error in correlation analysis: {err}")


def detect_outliers(df, numeric_cols, save=True):
    """Draw a box plot per column to expose outliers, optionally saving each.

    Errors are logged and not re-raised, so a failure stops the remaining
    plots without interrupting the caller.

    Args:
        df: DataFrame holding the data.
        numeric_cols: Iterable of column labels to plot, one figure each.
        save: When true, each figure is written to
            ``PLOTS_DIR/outliers_<col>.png``.
    """
    logger.info("Detecting outliers in numeric columns.")
    try:
        for col in numeric_cols:
            fig, ax = plt.subplots(figsize=(8, 4))
            try:
                sns.boxplot(x=df[col], ax=ax)
                ax.set_title(f"Outliers in {col}")
                if save:
                    # Saved through the Figure rather than save_plot(plt, ...):
                    # save_plot closes what it is given, and plt.close() rejects
                    # the pyplot module with a TypeError, which used to abort
                    # the loop after the first column. Saving here also keeps
                    # the figure open until it has been shown.
                    filepath = os.path.join(PLOTS_DIR, f"outliers_{col}.png")
                    fig.savefig(filepath)
                    logger.info(f"Plot saved to {filepath}")
                plt.show()
            finally:
                # Close each per-column figure so they do not accumulate.
                plt.close(fig)
    except Exception as e:
        logger.error(f"Error in outlier detection: {e}")


In [6]:
# Load the preprocessed EPA dataset from the processed-data directory that the
# configuration cell read from the settings file. The config is not reloaded
# here: PROCESSED_DIR was already derived from it, and a single derived path
# avoids later cells picking up a different hard-coded location.
file_path = os.path.join(PROCESSED_DIR, "epa_long_preprocessed.csv")

try:
    # load_data logs both the attempt and the success itself.
    df = load_data(file_path)
except Exception as e:
    logger.critical(f"Failed to load data: {e}")
    raise

2025-01-10 16:25:39,994 - data_preprocessing - INFO - Loading data from ../data/processed/epa_long_preprocessed.csv
2025-01-10 16:25:39,994 - data_preprocessing - INFO - Loading data from ../data/processed/epa_long_preprocessed.csv
INFO:data_preprocessing:Loading data from ../data/processed/epa_long_preprocessed.csv
2025-01-10 16:25:40,000 - data_preprocessing - ERROR - Error loading data: [Errno 2] No such file or directory: '../data/processed/epa_long_preprocessed.csv'
2025-01-10 16:25:40,000 - data_preprocessing - ERROR - Error loading data: [Errno 2] No such file or directory: '../data/processed/epa_long_preprocessed.csv'
ERROR:data_preprocessing:Error loading data: [Errno 2] No such file or directory: '../data/processed/epa_long_preprocessed.csv'
2025-01-10 16:25:40,002 - data_preprocessing - CRITICAL - Failed to load data: [Errno 2] No such file or directory: '../data/processed/epa_long_preprocessed.csv'
2025-01-10 16:25:40,002 - data_preprocessing - CRITICAL - Failed to load dat

FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/epa_long_preprocessed.csv'