In [160]:
import pandas as pd
import numpy as np

In [161]:
# Path of the raw COVID CSV that the loading step reads.
# NOTE(review): absolute, machine-specific Windows path — the notebook will not
# run on another machine without editing this; consider a configurable data dir.
file = "C:/Users/chand/OneDrive/Desktop/Python Project/Raw Data/owid-covid-data.csv"

In [162]:
def load_data(file_path):
    """Load a CSV file into a DataFrame and print a short overview.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to read. This argument is what gets read;
        the function does not depend on any notebook-level variable.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` if reading failed for any reason
        (the error is printed rather than raised so callers can test for
        ``None``).
    """
    try:
        # Read from the argument, not from a global, so the function works
        # on a fresh kernel and for any path the caller supplies.
        df = pd.read_csv(file_path)
        print("Data loaded successfully.")
        print("Data Overview:")
        df.info()
        print(f"\nShape: {df.shape}\n")
        return df
    except Exception as e:
        # Best-effort by design: report the problem and signal failure with None.
        print(f"Error loading data: {e}")
        return None

In [163]:
def trim_whitespace(df):
    """Trim and normalise whitespace in the string cells of object columns.

    For every object-dtype column, each cell that is a ``str`` has leading and
    trailing whitespace removed and any run of two or more spaces collapsed to
    a single space. Cells that are not strings (NaN, None, numbers, lists, ...)
    are left untouched, so missing values stay missing instead of being turned
    into the literal text ``'nan'``.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The same frame object, with cleaned string cells.
    """
    str_cols = df.select_dtypes(include='object').columns
    for col in str_cols:
        # Positional boolean mask of the cells that really are strings.
        is_text = df[col].map(lambda value: isinstance(value, str)).to_numpy(dtype=bool)
        if not is_text.any():
            continue
        cleaned = (
            df.loc[is_text, col]
            .str.strip()
            # Collapse whole runs of spaces; a plain "  " -> " " replacement
            # would leave two spaces behind for a run of four.
            .str.replace(r" {2,}", " ", regex=True)
        )
        # Assign by position so a non-unique index cannot break alignment.
        df.loc[is_text, col] = cleaned.to_numpy()
    print("Trimmed whitespace in string columns.")
    return df

In [164]:
def drop_empty_rows(df, columns):
    """Drop rows that are empty in any of the given columns.

    A cell counts as empty when it is NaN/None, when its string form is blank
    after stripping whitespace, or when it is an empty list. Column names that
    are not present in the frame are reported and skipped.

    Parameters
    ----------
    df : pandas.DataFrame
    columns : iterable of str
        Columns that must be populated for a row to be kept.

    Returns
    -------
    pandas.DataFrame
        The filtered frame. All columns are always preserved, even when every
        row is removed. If an unexpected error occurs it is printed and the
        frame as filtered so far is returned.
    """
    try:
        for col in columns:
            if col not in df.columns:
                print(f"Column '{col}' not found.")
                continue
            values = df[col]
            is_empty_list = values.map(
                lambda value: isinstance(value, list) and len(value) == 0
            ).astype(bool)
            # Build one explicitly boolean mask. Indexing with a non-boolean
            # empty Series (what apply() yields on an empty frame) would be
            # interpreted as a column selection and drop every column.
            keep = (
                values.notna()
                & (values.astype(str).str.strip() != '')
                & ~is_empty_list
            )
            df = df[keep]
        print("Rows with missing values removed.")
        return df
    except Exception as e:
        # Best-effort by design: report and hand back whatever was filtered.
        print(f"Error: {e}")
        return df

In [165]:
def handle_missing_values(df):
    """Fill gaps in place: numeric columns get 0, all other columns get their mode.

    When a non-numeric column has no mode (for example, it is entirely
    missing) the placeholder 'Unknown' is used instead. A failure on one
    column is printed and does not stop the remaining columns from being
    processed. The same frame object is returned.
    """
    try:
        numeric_names = df.select_dtypes(include=np.number).columns
        other_names = df.select_dtypes(exclude=np.number).columns

        for name in numeric_names:
            try:
                if not df[name].isnull().any():
                    continue
                df[name] = df[name].fillna(0)
            except Exception as e:
                print(f"Error filling numeric column '{name}': {e}")

        for name in other_names:
            try:
                if not df[name].isnull().any():
                    continue
                modes = df[name].mode()
                # mode() is empty when the column holds no non-missing values.
                if modes.empty:
                    replacement = 'Unknown'
                else:
                    replacement = modes[0]
                df[name] = df[name].fillna(replacement)
            except Exception as e:
                print(f"Error filling non-numeric column '{name}': {e}")

        print("Filled missing values: 0 for numeric, mode for others.")

    except Exception as e:
        print(f"Unexpected error: {e}")

    return df

In [166]:
def remove_duplicates(df):
    """Drop fully duplicated rows from the frame in place and report how many went.

    The first occurrence of each row is kept. The same frame object is
    returned after modification.
    """
    rows_before = len(df)
    df.drop_duplicates(inplace=True)
    removed = rows_before - len(df)
    print(f"Removed {removed} duplicate rows.")
    return df

In [167]:
def save_step(df, filename,
              output_dir="C:/Users/chand/OneDrive/Desktop/Python Project/Cleaned Data/"):
    """Write the frame to a CSV file (without the index) and print the path.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write.
    filename : str
        Name of the CSV file to create inside ``output_dir``.
    output_dir : str or path-like, optional
        Directory to write into. Defaults to the original hard-coded location,
        so existing two-argument calls behave exactly as before. A trailing
        path separator is optional.

    Returns
    -------
    None
    """
    base = str(output_dir)
    # Add a separator only when the directory does not already end with one,
    # so the default value produces the same path string as before.
    if base and not base.endswith(("/", "\\")):
        base += "/"
    output_file = base + filename
    df.to_csv(output_file, index=False)
    print(f"Saved: {output_file}")

In [168]:
def summarize(df):
    """Print the frame's ``info()`` report followed by its final shape."""
    final_shape = df.shape
    print("\nFinal Data Info:")
    df.info()
    print(f"\nFinal Shape: {final_shape}")

In [169]:
def clean_pipeline(file_path, strategy='median'):
    """Run every cleaning stage on the CSV at ``file_path`` and save the result.

    Stages, in order: load, trim whitespace, turn blank/whitespace-only cells
    into NaN, drop rows lacking an iso_code/continent/location, fill remaining
    gaps, remove duplicate rows, write "final_cleaned_data.csv", and print a
    summary. Stops early (returning None) when loading fails.

    ``strategy`` is accepted but not used by this function.
    """
    required_columns = ['iso_code', 'continent', 'location']

    loaded = load_data(file_path)
    if loaded is None:
        return None

    trimmed = trim_whitespace(loaded)
    # Cells that are empty or whitespace-only become NaN so later stages
    # treat them as missing.
    blanks_as_nan = trimmed.replace(r'^\s*$', np.nan, regex=True)
    populated = drop_empty_rows(blanks_as_nan, required_columns)
    filled = handle_missing_values(populated)
    deduplicated = remove_duplicates(filled)

    save_step(deduplicated, "final_cleaned_data.csv")
    summarize(deduplicated)

In [170]:
if __name__ == "__main__":
    # Pass `file`, the path defined in the configuration cell. The previous
    # argument name was never defined in this notebook, so the cell raised
    # NameError on a fresh kernel (Restart & Run All).
    clean_pipeline(file)

Data loaded successfully.
Data Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166326 entries, 0 to 166325
Data columns (total 67 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   iso_code                                    166326 non-null  object 
 1   continent                                   156370 non-null  object 
 2   location                                    166326 non-null  object 
 3   date                                        166326 non-null  object 
 4   total_cases                                 163293 non-null  float64
 5   new_cases                                   163133 non-null  float64
 6   new_cases_smoothed                          161150 non-null  float64
 7   total_deaths                                145451 non-null  float64
 8   new_deaths                                  145487 non-null  float64
 9   new_deaths_smoothed          