<a href="https://colab.research.google.com/github/gousebashask/dataannalysis/blob/main/data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from scipy.stats import zscore

# Load the dataset from a CSV file
def load_dataset(file_path):
    """Read the CSV file at ``file_path`` into a pandas DataFrame.

    Returns the parsed DataFrame. If no file exists at ``file_path``, a
    notice is printed and ``None`` is returned instead; any other read
    error propagates to the caller.
    """
    try:
        dataset = pd.read_csv(file_path)
    except FileNotFoundError:
        # Report the missing file and signal failure with None.
        print(f"File not found at '{file_path}'.")
        dataset = None
    return dataset

# Step 1: Identify and Handle Missing Data
def handle_missing_data(df):
    """Drop every row that contains at least one missing value.

    Prints the first rows of the frame before and after the drop so the
    effect can be inspected. The input ``df`` is left untouched; the
    filtered frame is returned.
    """
    print("Step 1: Handling missing data")
    print("Before handling missing data:", df.head(), sep="\n")

    complete_rows = df.dropna()

    print("\nAfter handling missing data:", complete_rows.head(), sep="\n")
    return complete_rows

# Step 2: Remove Duplicates
def remove_duplicates(df):
    """Remove rows that are exact duplicates of an earlier row.

    Prints the first rows of the frame before and after de-duplication.
    The input ``df`` is left untouched; the de-duplicated frame is
    returned.
    """
    print("\nStep 2: Removing duplicates")
    print("Before removing duplicates:", df.head(), sep="\n")

    unique_rows = df.drop_duplicates()

    print("\nAfter removing duplicates:", unique_rows.head(), sep="\n")
    return unique_rows

# Step 3: Detect and Address Outliers
def remove_outliers(df, threshold=3):
    """Drop rows in which any numerical feature is a Z-score outlier.

    A row is kept only if, for every numerical column, the absolute
    Z-score of its value is strictly below ``threshold``. Non-numerical
    columns are ignored when scoring but are preserved in the result.

    Undefined Z-scores are treated as "not an outlier". This covers
    zero-variance columns (where the Z-score is 0/0) and missing values;
    without it, one constant column would make every comparison False and
    wipe out the whole dataset.

    Args:
        df: Input DataFrame; it is not modified.
        threshold: Absolute Z-score at or above which a value counts as an
            outlier.

    Returns:
        A new DataFrame (an independent copy) without the outlier rows.
    """
    print("\nStep 3: Detecting and addressing outliers")
    numerical_cols = df.select_dtypes(include=[np.number]).columns

    if numerical_cols.empty or df.empty:
        # Nothing to score: no numerical features, or no rows at all.
        df_no_outliers = df.copy()
    else:
        values = df[numerical_cols].to_numpy(dtype=float, na_value=np.nan)
        # Constant columns produce 0/0; silence the floating-point warning
        # because those NaN scores are handled explicitly below.
        with np.errstate(invalid="ignore", divide="ignore"):
            z_scores = np.abs(zscore(values, nan_policy="omit"))

        # NaN means the score is undefined, so it must not count against a row.
        within_bounds = (z_scores < threshold) | np.isnan(z_scores)
        # Copy so later column assignments by callers cannot touch `df`.
        df_no_outliers = df[within_bounds.all(axis=1)].copy()

    print("\nAfter outlier detection and removal:")
    print(df_no_outliers.head())

    return df_no_outliers

# Step 4: Run the full cleaning pipeline (Steps 1-3), then scale numerical columns and save the result
def preprocess_data(input_file, output_file):
    """Clean the CSV at ``input_file`` and write the result to ``output_file``.

    The pipeline loads the dataset, drops rows with missing values, removes
    duplicate rows, removes Z-score outliers, min-max scales every
    numerical column, and saves the frame as CSV without the index.

    If the input file cannot be found, nothing is written and the function
    returns early.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write.
    """
    df = load_dataset(input_file)
    if df is None:
        # load_dataset already printed the "file not found" notice.
        return

    # Step 1: Handle missing data
    df = handle_missing_data(df)

    # Step 2: Remove duplicates
    df = remove_duplicates(df)

    # Step 3: Detect and address outliers
    df = remove_outliers(df)

    # The cleaning steps return filtered frames; take an explicit copy so the
    # column assignment below never writes into a slice of an earlier frame
    # (which would trigger pandas' chained-assignment warning).
    df = df.copy()

    # Step 4a: Scale numerical variables
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    # MinMaxScaler rejects input with zero samples, so skip scaling when the
    # cleaning steps removed every row; the (empty) result is still saved.
    if not numerical_cols.empty and not df.empty:
        scaler = MinMaxScaler()
        df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    # Save the cleaned and preprocessed dataset
    df.to_csv(output_file, index=False)
    print(f"\nData cleaning and preprocessing completed. Cleaned dataset saved as '{output_file}'.")

# Entry point: run the pipeline only when this module is executed directly
# (not when it is imported).
if __name__ == "__main__":
    input_file = "data.csv"  # Path of the CSV to clean; change to your input file
    output_file = "cleaned_data.csv"  # Path the cleaned CSV is written to; change as desired

    # Load, clean, scale and save the dataset.
    preprocess_data(input_file, output_file)


Step 1: Handling missing data
Before handling missing data:
  Observation  Y-Kappa  ChipRate  BF-CMratio  BlowFlow  ChipLevel4   \
0    31-00:00    23.10    16.520     121.717  1177.607      169.805   
1    31-01:00    27.60    16.810      79.022  1328.360      341.327   
2    31-02:00    23.19    16.709      79.562  1329.407      239.161   
3    31-03:00    23.60    16.478      81.011  1334.877      213.527   
4    31-04:00    22.90    15.618      93.244  1334.168      243.131   

   T-upperExt-2   T-lowerExt-2    UCZAA  WhiteFlow-4   ...  SteamFlow-4   \
0        358.282         329.545  1.443       599.253  ...        67.122   
1        351.050         329.067  1.549       537.201  ...        60.012   
2        350.022         329.260  1.600       549.611  ...        61.304   
3        350.938         331.142  1.604       623.362  ...        68.496   
4        351.640         332.709    NaN       638.672  ...        70.022   

   Lower-HeatT-3  Upper-HeatT-3   ChipMass-4   WeakLiquo