<a href="https://colab.research.google.com/github/deepthi1707/deepthi1707/blob/main/Deepthisree_14.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
import os

# Plot styling shared by every figure below. The seaborn theme is applied
# first and the default figure size is set afterwards, in that order.
sns.set_theme(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

def load_air_quality_data(file_path):
    """Read the Excel workbook at *file_path* and print a short preview.

    On success the frame's shape and its first five rows are printed and the
    DataFrame is returned. If anything raises while reading or previewing,
    the error is reported on stdout and ``None`` is returned instead, so
    callers must check for ``None``.
    """
    try:
        frame = pd.read_excel(file_path)
        for line in (
            "✅ Data loaded successfully!",
            f"Shape: {frame.shape}",
            "\nFirst 5 rows:",
        ):
            print(line)
        print(frame.head())
        return frame
    except Exception as err:
        # Deliberate catch-all: the caller treats None as "could not load".
        print(f"❌ Error loading file: {err}")
        return None

def clean_air_quality_data(df):
    """Return a time-indexed, all-float copy of *df* with gaps median-filled.

    Steps, in order:
      1. Drop every column whose label contains ``"Unnamed"``.
      2. Build a ``DateTime`` column from ``Date`` + ``Time`` (or parse an
         existing ``DateTime`` column) and make it the index. Values that
         cannot be parsed become ``NaT``.
      3. Treat every negative value as missing.
      4. Drop columns that have no valid value at all, then replace the
         remaining missing values with their column's median.

    Missing-value counts are printed before and after the fill.

    Args:
        df: Raw frame holding either ``Date`` and ``Time`` columns or a
            ``DateTime`` column. All other columns must be numeric.

    Returns:
        A new float DataFrame indexed by ``DateTime``. The input frame is
        not modified.

    Raises:
        KeyError: If no ``DateTime`` column exists or could be built.
    """
    # str(col) keeps the filter from crashing on non-string labels (e.g. ints).
    filler = [col for col in df.columns if 'Unnamed' in str(col)]
    df = df.drop(columns=filler, errors='ignore')

    if 'Date' in df.columns and 'Time' in df.columns:
        try:
            stamps = pd.to_datetime(
                df['Date'].astype(str) + ' ' + df['Time'].astype(str),
                errors='coerce',
            )
        except (ValueError, TypeError, OverflowError) as e:
            print(f"⚠️ Date/Time parsing failed: {e}")
        else:
            df = df.drop(columns=['Date', 'Time']).assign(DateTime=stamps)
    elif 'DateTime' in df.columns:
        df = df.assign(DateTime=pd.to_datetime(df['DateTime'], errors='coerce'))

    # Fail with an actionable message instead of the generic set_index error.
    if 'DateTime' not in df.columns:
        raise KeyError(
            "No timestamp available: expected 'Date' and 'Time' columns or a "
            "'DateTime' column."
        )

    df = df.set_index('DateTime')

    # NOTE(review): this blanks *every* negative value. The UCI Air Quality
    # data tags gaps with -200, but genuine sub-zero readings (for example
    # temperature) are discarded as well -- confirm that this is intended.
    df = df.mask(df < 0).astype(float)

    print("\nMissing values before cleaning:")
    print(df.isnull().sum())

    # A column without a single valid value has no median to fill with, so it
    # is removed rather than left to break or pollute the result.
    all_missing = df.isnull().all().to_numpy()
    if all_missing.any():
        print(f"\nDropping columns with no valid values: {list(df.columns[all_missing])}")
        df = df.loc[:, ~all_missing]

    # Per-column median fill done in pandas so labels and index are preserved.
    df_imputed = df.apply(lambda column: column.fillna(column.median()))

    print("\nMissing values after cleaning:")
    print(df_imputed.isnull().sum())

    return df_imputed

def analyze_air_quality(df):
    """Print summary statistics for *df* and draw its diagnostic plots.

    Output, in order: ``describe()`` statistics, the pairwise correlation
    matrix (also rendered as an annotated heatmap), and one line plot per
    column that holds more than one distinct value. Nothing is returned.
    """
    print("\n📊 Descriptive Statistics:")
    summary = df.describe()
    print(summary)

    print("\n🔗 Correlation Matrix:")
    correlations = df.corr()
    print(correlations)

    plt.figure(figsize=(12, 8))
    sns.heatmap(
        correlations,
        annot=True,
        cmap='coolwarm',
        center=0,
        fmt=".2f",
    )
    plt.title('Air Quality Parameters Correlation')
    plt.tight_layout()
    plt.show()

    for name in df.columns:
        series = df[name]
        # Columns with at most one distinct value are not plotted.
        if not series.nunique() > 1:
            continue
        plt.figure(figsize=(14, 6))
        series.plot(title=f'{name} Time Series', color='royalblue')
        plt.ylabel('Concentration')
        plt.xlabel('Date')
        plt.tight_layout()
        plt.show()

if __name__ == "__main__":
    # The workbook location can be overridden without editing the code by
    # setting the AIR_QUALITY_FILE environment variable; when it is unset the
    # original hard-coded path is used.
    file_path = os.environ.get(
        "AIR_QUALITY_FILE",
        r"C:\Users\Deepu Laptop\Desktop\AirQualityUCI.xlsx",
    )

    if not os.path.exists(file_path):
        print("❌ The specified file path does not exist. Please check and try again.")
    else:
        air_quality_df = load_air_quality_data(file_path)

        # The loader reports its own error and returns None on failure.
        if air_quality_df is None:
            print("\n❗Could not load data file. Please check the path.")
        else:
            cleaned_df = clean_air_quality_data(air_quality_df)
            analyze_air_quality(cleaned_df)
