In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set plotting style. `set_theme` is seaborn's preferred entry point; `sns.set`
# is only kept as a legacy alias for it.
sns.set_theme(style="whitegrid")

# Load the dataset and show its size

In [None]:
# Read the raw CSV into `df`. The path is relative, so it is resolved against
# the notebook's current working directory.
df = pd.read_csv('initial_data.csv')
# Last expression of the cell: displays (rows, columns) of the full dataset.
df.shape

In [None]:
# Draw a random 20% sample of the rows; the fixed random_state makes the
# sample reproducible for the same input frame.
df_small = df.sample(frac = 0.2, random_state=123)
# Last expression of the cell: displays (rows, columns) of the sample.
df_small.shape

In [None]:
# Rebind `df` to the 20% sample: every later cell works on the sample, not on
# the full file.
# NOTE(review): once this has run, re-running the sampling cell above draws
# 20% of the sample again; re-run the load cell first to start from the full data.
df = df_small

In [None]:
# Report the number of rows in the working frame and preview its first rows.
print(f'Sample size: {len(df)} rows')
print(df.head())

# Check the size of the DataFrame: rows, columns and total cell count
# (rows * columns). `total_elements` is reassigned later for the cleaned data.
num_rows, num_cols = df.shape
total_elements = num_rows * num_cols
print(f"Initial dataset size: {num_rows} rows, {num_cols} columns, {total_elements} total elements.")

# Checking for noise in the data using EDA

In [None]:
# Display the working (sampled) DataFrame as the cell output.
df

In [None]:
def perform_eda(df):
    """
    Performs Exploratory Data Analysis to identify and visualize noise in the dataset.

    The report is produced in this order: ``df.info()``, summary statistics,
    missing-value counts plus a heatmap, then for every numeric column a
    histogram, a boxplot and an IQR-based outlier count, then a correlation
    heatmap (only when there are at least two numeric columns), the number of
    duplicate rows, and finally the cardinality of every text column.

    Parameters:
    - df (pd.DataFrame): The DataFrame to analyze. It is only read, never modified.

    Returns:
    - None. Everything is printed or shown as a matplotlib figure.
    """
    # Display basic information about the dataset.
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print(): that would add a stray "None" line.
    print("Dataset Information:")
    df.info()
    print("\n")

    # Display summary statistics
    print("Summary Statistics:")
    print(df.describe(include='all'))
    print("\n")

    # Check for missing values (the mask is reused for the heatmap below)
    missing_mask = df.isnull()
    print("Missing Values per Column:")
    print(missing_mask.sum())
    print("\n")

    # Visualize missing data
    fig = plt.figure(figsize=(12, 6))
    sns.heatmap(missing_mask, cbar=False, cmap='viridis')
    plt.title('Heatmap of Missing Values')
    plt.show()
    # Release the figure explicitly: two figures are created per numeric
    # column, and on backends where show() does not close them they pile up.
    plt.close(fig)
    print("The heatmap above shows the distribution of missing values across the dataset. Columns with more missing values will have more yellow lines.")

    # Visualize the distribution of numerical columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        values = df[col]

        fig = plt.figure(figsize=(12, 6))
        sns.histplot(values, kde=True, bins=30)
        plt.title(f'Distribution of {col}')
        plt.xlabel(col)
        plt.ylabel('Frequency')
        plt.show()
        plt.close(fig)
        print(f"The histogram above shows the distribution of '{col}'. Skewness or irregularities may indicate noise.")

        # Boxplot to detect outliers
        fig = plt.figure(figsize=(12, 6))
        sns.boxplot(x=values)
        plt.title(f'Boxplot of {col}')
        plt.xlabel(col)
        plt.show()
        plt.close(fig)
        print(f"The boxplot above for '{col}' helps identify outliers. Points outside the whiskers are potential outliers.")

        # Outlier Detection using IQR (values beyond 1.5 * IQR from the quartiles)
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        if IQR == 0:
            # A zero IQR would make both bounds equal to the quartile and flag
            # every differing value, so the column is skipped.
            print(f"No variation in '{col}'; skipping outlier detection.\n")
            continue
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        # Count directly from the boolean mask instead of materialising a
        # filtered copy of the whole frame.
        num_outliers = int(((values < lower_bound) | (values > upper_bound)).sum())
        print(f"Number of outliers in '{col}': {num_outliers}\n")

    # Visualize correlations between numerical variables
    if len(numeric_cols) >= 2:
        fig = plt.figure(figsize=(12, 6))
        corr_matrix = df[numeric_cols].corr()
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
        plt.title('Correlation Matrix of Numerical Variables')
        plt.show()
        plt.close(fig)
        print("The correlation matrix above shows the correlation coefficients between numerical variables. High correlations may indicate multicollinearity.")

    # Check for duplicates
    num_duplicates = df.duplicated().sum()
    print(f"Number of duplicate rows: {num_duplicates}")
    if num_duplicates > 0:
        print("There are duplicate rows in the dataset, indicating data redundancy or duplication.\n")
    else:
        print("There are no duplicate rows in the dataset.\n")

    # Analyze categorical (text) variables. Both plain object columns and
    # columns using pandas' dedicated string dtype are treated as text.
    categorical_cols = [
        col
        for col, dtype in df.dtypes.items()
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
    ]
    for col in categorical_cols:
        num_unique_values = df[col].nunique()
        print(f"Categorical Variable '{col}' has {num_unique_values} unique values.")
        if num_unique_values < 20:
            # Few enough distinct values to list them all for eyeballing.
            unique_values = df[col].unique()
            print(f"Unique values in '{col}': {unique_values}\n")
        else:
            print(f"'{col}' has many unique values; we should consider checking for inconsistencies or encoding if necessary.\n")

In [None]:
# Run the EDA report on the working (sampled) data before any cleaning.
perform_eda(df)

# Handling the Noise

In [None]:
def _standardize_text(series):
    """Lower-case and strip the string entries of ``series``; leave other values untouched."""
    if pd.api.types.is_object_dtype(series.dtype):
        # Object columns may mix strings with other values. The .str accessor
        # would turn every non-string entry into NaN (or raise when the column
        # holds no strings at all), so only real ``str`` values are rewritten.
        # dtype=object is forced so the column keeps its original dtype.
        cleaned = [
            value.lower().strip() if isinstance(value, str) else value
            for value in series
        ]
        return pd.Series(cleaned, index=series.index, dtype=object, name=series.name)
    # Dedicated string dtypes only hold text, so the vectorised accessor is safe.
    return series.str.lower().str.strip()


def clean_data(df):
    """
    Cleans the dataset by removing rows with missing values and handling duplicates and outliers.

    Steps, in order:
    1. Drop every row that contains a missing value.
    2. Standardize text columns (lower-case, surrounding whitespace stripped).
       Non-string values inside object columns are left as they are.
    3. Drop duplicate rows. This runs after step 2 so that rows differing only
       in letter case or surrounding whitespace are recognised as duplicates.
    4. For each numeric column in turn, drop rows outside
       [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]. The quartiles are computed on the rows
       that survived the previous columns; columns with IQR == 0 are skipped.

    Progress and a per-column unique-value summary are printed along the way.

    Parameters:
    - df (pd.DataFrame): The DataFrame to clean. It is not modified.

    Returns:
    - pd.DataFrame: The cleaned DataFrame. Surviving rows keep their original
      index labels (the index is not reset).
    """
    # Remove rows with any missing values.
    # dropna() already returns a new frame; copy() additionally detaches it so
    # the column assignments below never touch (or warn about) the caller's data.
    print("Removing rows with missing values...")
    df_cleaned = df.dropna().copy()
    rows_removed = len(df) - len(df_cleaned)
    print(f"Removed {rows_removed} rows due to missing values.")

    # Handle inconsistencies in categorical variables
    print("\nHandling inconsistencies in categorical variables...")
    text_cols = [
        col
        for col, dtype in df_cleaned.dtypes.items()
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
    ]
    for col in text_cols:
        df_cleaned[col] = _standardize_text(df_cleaned[col])
        print(f"Standardized text in '{col}' by converting to lowercase and stripping whitespaces.")

    # Remove duplicates (after text standardization, see docstring step 3)
    print("\nRemoving duplicate rows...")
    rows_before = len(df_cleaned)
    df_cleaned = df_cleaned.drop_duplicates()
    duplicates_removed = rows_before - len(df_cleaned)
    print(f"Removed {duplicates_removed} duplicate rows.")

    # Handle outliers in numerical columns by removing them
    print("\nHandling outliers...")
    numeric_cols = df_cleaned.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        Q1 = df_cleaned[col].quantile(0.25)
        Q3 = df_cleaned[col].quantile(0.75)
        IQR = Q3 - Q1
        if IQR == 0:
            # With a zero IQR both bounds collapse onto the quartile and every
            # differing value would be removed, so the column is left alone.
            print(f"No variation in '{col}'; skipping outlier handling.")
            continue
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        # Missing values were dropped above, so "not within bounds" is exactly
        # the set of outliers; one mask serves both the count and the filter.
        within_bounds = (df_cleaned[col] >= lower_bound) & (df_cleaned[col] <= upper_bound)
        num_outliers = int((~within_bounds).sum())
        df_cleaned = df_cleaned[within_bounds]
        print(f"Removed {num_outliers} outliers from '{col}'.")

    print("\nData cleaning completed.")

    # Print unique values of each column to verify
    print("\nUnique values in each column after cleaning:")
    for col in df_cleaned.columns:
        num_unique_values = df_cleaned[col].nunique()
        print(f"Column '{col}' has {num_unique_values} unique values.")

    return df_cleaned

In [None]:
# Clean the working data. `df` itself stays untouched because clean_data
# operates on a copy and returns the cleaned frame.
df_cleaned = clean_data(df)

In [None]:
# Total number of cells (rows * columns) in the cleaned frame, for comparison
# with the figure printed before cleaning. This overwrites the earlier
# `total_elements` value.
x, y = df_cleaned.shape
total_elements = x * y
# Last expression of the cell: displays the cell count.
total_elements

In [None]:
# Saving the cleaned dataset to a new CSV file (relative path, so it is
# written to the current working directory). index=False leaves the row index
# out of the file.
output_file_path = 'cleaned_data.csv'
df_cleaned.to_csv(output_file_path, index=False)

In [None]:
# Re-run the same EDA report on the cleaned data to compare with the earlier run.
perform_eda(df_cleaned)

In [None]:
# Print the rows in which at least one cell is exactly the string 'nan'.
# Note: isin() compares whole cell values, so this does not find 'nan' as a
# substring of a longer string.
print(df_cleaned[df_cleaned.isin(['nan']).any(axis=1)])
# Author's note from the original run: the result was empty.

# Data Cleaned!!

In [None]:
# Show the data type of every column in the cleaned DataFrame.
df_cleaned.dtypes