<a href="https://colab.research.google.com/github/dhanangwinarnochb/UPH-Visualisasi-Analitik-Dhanang/blob/main/Practice_2_Data_Pre_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install ipywidgets pandas scikit-learn

Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2


In [None]:
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler, StandardScaler


# Sample data: IoT sensors in various zones of a smart city
zones = ['Downtown', 'Residential', 'Industrial', 'Suburban', 'Park']

# Number of simulated daily readings (one row per day)
N_DAYS = 15

# Seed the global NumPy RNG so that "Restart & Run All" rebuilds the same dataset
SEED = 42
np.random.seed(SEED)

# Generating one timestamp per day for the last N_DAYS days, ending today
dates = pd.date_range(end=pd.Timestamp.today(), periods=N_DAYS, freq='D')

# Creating the dataset
data = {
    'Zone': np.random.choice(zones, size=N_DAYS),
    'Date': dates,
    # AQI: integers in [50, 300) -- randint's upper bound is exclusive
    'Air_Quality_Index': np.random.randint(50, 300, size=N_DAYS),
    # Noise level in dB: integers in [40, 100)
    'Noise_Level_dB': np.random.randint(40, 100, size=N_DAYS),
    # Number of vehicles per square km: integers in [10, 200)
    'Traffic_Density': np.random.randint(10, 200, size=N_DAYS),
    # kWh consumed per zone, rounded to 2 decimals
    'Energy_Consumption_kWh': np.round(np.random.uniform(1000, 5000, size=N_DAYS), 2),
    # Temperature in Celsius, rounded to 2 decimals
    'Temperature_C': np.round(np.random.uniform(15, 35, size=N_DAYS), 2),
    # Humidity percentage, rounded to 2 decimals
    'Humidity_%': np.round(np.random.uniform(30, 90, size=N_DAYS), 2),
}

df = pd.DataFrame(data)

# Introduce missing data randomly in the dataset
def introduce_missing_data(df, missing_percentage=0.1, random_state=None):
    """Return a copy of ``df`` with a random fraction of its cells set to missing.

    Cells are chosen uniformly at random (without replacement) across the whole
    frame. The input frame is not modified. Column dtypes and the index are
    preserved, except that pandas upcasts a column when it has to hold a
    missing value (e.g. an integer column that receives NaN becomes float).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to copy and punch holes into.
    missing_percentage : float, default 0.1
        Fraction of all cells (between 0 and 1) to replace; the number of
        cells is ``int(missing_percentage * df.size)``.
    random_state : int or None, default None
        Seed for a dedicated random generator. When ``None`` the global
        ``np.random`` state is used, as before.

    Returns
    -------
    pandas.DataFrame
        New frame of the same shape with the selected cells missing.

    Raises
    ------
    ValueError
        If ``missing_percentage`` is outside the range [0, 1].
    """
    if not 0 <= missing_percentage <= 1:
        raise ValueError(
            f"missing_percentage must be between 0 and 1, got {missing_percentage!r}"
        )

    total_values = df.size
    num_missing = int(missing_percentage * total_values)

    # Use an isolated generator only when a seed is supplied; otherwise keep
    # drawing from the global NumPy state so existing callers are unaffected.
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    # Get random flat (row-major) positions for the cells to blank out
    missing_indices = rng.choice(total_values, num_missing, replace=False)

    # Build a boolean mask instead of flattening the values: flattening forces
    # every column to one common dtype, which fails for all-integer frames and
    # turns mixed frames into generic object columns.
    missing_mask = np.zeros(total_values, dtype=bool)
    missing_mask[missing_indices] = True

    # DataFrame.mask replaces the cells where the mask is True with NaN/NaT
    return df.mask(missing_mask.reshape(df.shape))

# Introduce 10% missing data
df = introduce_missing_data(df, missing_percentage=0.1)

# Ensure the measurement columns are numeric; 'Zone' and 'Date' are left as they are
for col in df.columns:
    if col in ('Date', 'Zone'):
        continue
    try:
        # errors='raise' makes to_numeric fail loudly on a non-numeric value ...
        df[col] = pd.to_numeric(df[col], errors='raise')
    except ValueError as e:
        # ... and the failure is reported instead of being silently discarded,
        # while the remaining columns are still processed.
        print(f"Column {col!r} could not be converted to numeric: {e}")

# Display the DataFrame with missing values
df


Unnamed: 0,Zone,Date,Air_Quality_Index,Noise_Level_dB,Traffic_Density,Energy_Consumption_kWh,Temperature_C,Humidity_%
0,Suburban,2026-01-22 10:54:44.625754,179,80,,1278.06,33.9,70.27
1,Downtown,NaT,199,77,,1804.22,24.92,89.26
2,Park,2026-01-24 10:54:44.625754,94,72,43.0,1340.6,25.65,53.48
3,,2026-01-25 10:54:44.625754,165,87,115.0,3082.36,21.49,40.85
4,Residential,2026-01-26 10:54:44.625754,132,50,35.0,4839.22,27.03,73.16
5,Suburban,2026-01-27 10:54:44.625754,297,87,151.0,4087.2,16.89,
6,Downtown,NaT,282,80,180.0,3617.52,31.81,81.77
7,Park,2026-01-29 10:54:44.625754,276,40,148.0,1940.18,,38.08
8,Industrial,2026-01-30 10:54:44.625754,284,50,21.0,2184.2,30.3,49.94
9,,2026-01-31 10:54:44.625754,68,49,173.0,,26.95,52.83


In [None]:
# Widget for handling missing values
handle_missing = widgets.Dropdown(
    options=['Drop rows', 'Fill with mean', 'Fill with median', 'Fill with mode'],
    value='Drop rows',
    description='Handle Missing:',
)

# Function to clean data based on the widget's input
def clean_data(option):
    """Display the notebook-level ``df`` after handling its missing values.

    ``df`` itself is never modified; every strategy produces a new frame.

    Parameters
    ----------
    option : str
        One of 'Drop rows', 'Fill with mean', 'Fill with median',
        'Fill with mode'.

    Raises
    ------
    ValueError
        If ``option`` is not one of the supported strategies.
    """
    if option == 'Drop rows':
        # Remove every row that has at least one missing cell
        df_cleaned = df.dropna()
    elif option == 'Fill with mean':
        # Only numeric columns have a mean; other columns keep their gaps
        df_cleaned = df.fillna(df.mean(numeric_only=True))
    elif option == 'Fill with median':
        # Only numeric columns have a median; other columns keep their gaps
        df_cleaned = df.fillna(df.median(numeric_only=True))
    elif option == 'Fill with mode':
        # mode() can return several rows (ties); the first row is used.
        # It returns no rows at all when there is nothing to count, so guard
        # the positional lookup instead of failing with an IndexError.
        modes = df.mode()
        df_cleaned = df.fillna(modes.iloc[0]) if len(modes) else df.copy()
    else:
        raise ValueError(f"Unknown missing-value option: {option!r}")

    display(df_cleaned)

# Interactive display (trailing ';' hides the returned function's repr)
widgets.interact(clean_data, option=handle_missing);


interactive(children=(Dropdown(description='Handle Missing:', options=('Drop rows', 'Fill with mean', 'Fill wi…

In [None]:
# Widget for choosing transformation
transformation_type = widgets.Dropdown(
    options=['One-Hot Encode Zone', 'Scale AQI and Noise'],
    value='Scale AQI and Noise',
    description='Transform:',
)

# Function to perform transformations
def transform_data(option):
    """Display the notebook-level ``df`` after the selected transformation.

    ``df`` itself is never modified; each branch works on a new frame.

    Parameters
    ----------
    option : str
        'One-Hot Encode Zone' replaces the 'Zone' column with indicator
        columns (the first category is dropped via ``drop_first=True``).
        'Scale AQI and Noise' rescales 'Air_Quality_Index' and
        'Noise_Level_dB' with ``MinMaxScaler``.

    Raises
    ------
    ValueError
        If ``option`` is not one of the supported transformations.
    """
    if option == 'One-Hot Encode Zone':
        df_transformed = pd.get_dummies(df, columns=['Zone'], drop_first=True)
    elif option == 'Scale AQI and Noise':
        scale_columns = ['Air_Quality_Index', 'Noise_Level_dB']
        df_transformed = df.copy()
        # Fit on the original columns and write the scaled values into the copy
        df_transformed[scale_columns] = MinMaxScaler().fit_transform(df[scale_columns])
    else:
        raise ValueError(f"Unknown transformation option: {option!r}")

    display(df_transformed)

# Interactive display (trailing ';' hides the returned function's repr)
widgets.interact(transform_data, option=transformation_type);


interactive(children=(Dropdown(description='Transform:', index=1, options=('One-Hot Encode Zone', 'Scale AQI a…

In [None]:
# Widget for normalization
normalization_type = widgets.Dropdown(
    options=['Min-Max Normalization', 'Standardization (Z-Score)'],
    value='Min-Max Normalization',
    description='Normalization:',
)

# Function to normalize data
def normalize_data(option):
    """Display 'Air_Quality_Index' and 'Noise_Level_dB' after rescaling.

    Only these two columns of the notebook-level ``df`` are shown; ``df``
    itself is not modified.

    Parameters
    ----------
    option : str
        'Min-Max Normalization' (uses ``MinMaxScaler``) or
        'Standardization (Z-Score)' (uses ``StandardScaler``).

    Raises
    ------
    ValueError
        If ``option`` is not one of the supported methods.
    """
    # Map each dropdown label to the scaler class that implements it
    scaler_classes = {
        'Min-Max Normalization': MinMaxScaler,
        'Standardization (Z-Score)': StandardScaler,
    }
    if option not in scaler_classes:
        raise ValueError(f"Unknown normalization option: {option!r}")

    columns = ['Air_Quality_Index', 'Noise_Level_dB']
    scaler = scaler_classes[option]()

    df_normalized = df[columns].copy()
    df_normalized[columns] = scaler.fit_transform(df[columns])

    display(df_normalized)

# Interactive display (trailing ';' hides the returned function's repr)
widgets.interact(normalize_data, option=normalization_type);


interactive(children=(Dropdown(description='Normalization:', options=('Min-Max Normalization', 'Standardizatio…

In [None]:
# Toggle between the two ways of encoding the categorical 'Zone' column
encoding_type = widgets.ToggleButtons(
    options=['Label Encoding', 'One-Hot Encoding'],
    description='Encoding:',
)

def show_encoding(method):
    """Display the first rows of ``df`` with 'Zone' encoded by ``method``.

    'Label Encoding' keeps 'Zone' and adds a 'Zone_Encoded' column holding the
    integer category codes (pandas uses -1 as the code for a missing value).
    Any other value one-hot encodes 'Zone', dropping the first category.
    The notebook-level ``df`` is not modified.
    """
    if method == 'Label Encoding':
        zone_codes = df['Zone'].astype('category').cat.codes
        encoded = df.assign(Zone_Encoded=zone_codes)
    else:
        encoded = pd.get_dummies(df, columns=['Zone'], drop_first=True)

    preview = encoded.head()
    display(preview)

widgets.interact(show_encoding, method=encoding_type)

interactive(children=(ToggleButtons(description='Encoding:', options=('Label Encoding', 'One-Hot Encoding'), v…

In [None]:
# Answer the following questions:
# 1. The dataset contains missing values. If you were a data analyst working with this data, which method (drop, mean, median, or mode) would you choose to handle missing values? Justify your choice with potential benefits and drawbacks.
# 2. The dataset includes both Air Quality Index (AQI) and Noise Level (dB). How does scaling (Min-Max vs. Standardization Z-Score) affect the interpretation of these values? Which scaling method would be more appropriate for this dataset and why?
# 3. Describe the difference between One-Hot Encoding and Label Encoding (assigning each zone a number 1-5).
# 4. In the context of this specific dataset (Downtown, Residential, etc.), why might One-Hot Encoding be safer than Label Encoding?

# Write your answers in a document and submit the PDF on Moodle.