# IMPORT

In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

In [45]:
file_path = 'Housing.csv'

# SECTION 1: LOAD DATASET

In [46]:
# ============================
# SECTION 1: LOAD DATASET
# ============================
def load_data(file_path):
    """
    Load dataset from a CSV file and return a pandas DataFrame.
    """
    print("\n[INFO] Loading dataset...")
    df = pd.read_csv(file_path)  # Read the CSV file into a DataFrame
    print("[SUCCESS] Dataset loaded successfully! Shape:", df.shape, "\n")  # Print dataset shape
    print(df.head())  # Display first 5 rows for preview
    print("[INFO] Dataset Info:")
    print(df.info())  # Show dataset info including column types and missing values
    return df

df = load_data(file_path)



[INFO] Loading dataset...
[SUCCESS] Dataset loaded successfully! Shape: (545, 13) 

      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  
[INFO] Dataset

# SECTION 2: HANDLE MISSING VALUES

In [47]:
# SECTION 2: HANDLE MISSING VALUES
# ============================
def handle_missing_values(df):
    """
    Fill missing values in the dataset:
    - Numeric columns: fill with mean
    - Categorical columns: fill with most frequent value (mode)
    """
    print("\n[INFO] Handling missing values...")
    print("[BEFORE] Missing values per column:\n", df.isnull().sum(), "\n")  # Show missing values before processing

    numeric_cols = df.select_dtypes(include=[np.number]).columns  # Get numeric columns
    categorical_cols = df.select_dtypes(exclude=[np.number]).columns  # Get categorical columns

    # Fill missing values in numeric columns with mean
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

    # Fill missing values in categorical columns with most frequent value
    df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

    print("[AFTER] Missing values handled! Numeric cols:", len(numeric_cols), "Categorical cols:", len(categorical_cols), "\n")
    print(df.head())  # Display changes in data after handling missing values
    return df

df = handle_missing_values(df)


[INFO] Handling missing values...
[BEFORE] Missing values per column:
 price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64 

[AFTER] Missing values handled! Numeric cols: 6 Categorical cols: 7 

      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0          

# SECTION 3: ENCODE CATEGORICAL FEATURES

In [48]:
# ============================
# SECTION 3: ENCODE CATEGORICAL FEATURES
# ============================
def encode_categorical_features(df):
    """
    Convert categorical columns into numerical format using One-Hot Encoding.
    """
    print("\n[INFO] Encoding categorical features...")
    print("[BEFORE] Columns:", df.columns.tolist(), "\n")  # Show column names before encoding

    df = pd.get_dummies(df, drop_first=True)  # Apply One-Hot Encoding

    print("[AFTER] Categorical features encoded! New shape:", df.shape, "\n")
    print(df.head())  # Display first rows after encoding
    return df
df = encode_categorical_features(df)


[INFO] Encoding categorical features...
[BEFORE] Columns: ['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea', 'furnishingstatus'] 

[AFTER] Categorical features encoded! New shape: (545, 14) 

      price  area  bedrooms  bathrooms  stories  parking  mainroad_yes  \
0  13300000  7420         4          2        3        2          True   
1  12250000  8960         4          4        4        3          True   
2  12250000  9960         3          2        2        2          True   
3  12215000  7500         4          2        2        3          True   
4  11410000  7420         4          1        2        2          True   

   guestroom_yes  basement_yes  hotwaterheating_yes  airconditioning_yes  \
0          False         False                False                 True   
1          False         False                False                 True   
2          False          True   

# SECTION 4: SCALE NUMERIC FEATURES

In [49]:
# ============================
# SECTION 4: SCALE NUMERIC FEATURES
# ============================
def scale_features(df, method="standard"):
    """
    Scale numerical features using either StandardScaler or MinMaxScaler.
    """
    print("\n[INFO] Scaling numeric features using", method, "scaler...")

    numeric_cols = df.select_dtypes(include=[np.number]).columns  # Get numeric columns
    scaler = StandardScaler() if method == "standard" else MinMaxScaler()  # Choose scaler
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])  # Apply scaling

    print("[SUCCESS] Features scaled using", method, "\n")
    print("[INFO] Dataset statistics after scaling:")
    print(df.describe())  # Show statistics after scaling
    return df
df = scale_features(df, method="minmax")  # method="minmax" method="standard"


[INFO] Scaling numeric features using minmax scaler...
[SUCCESS] Features scaled using minmax 

[INFO] Dataset statistics after scaling:
            price        area    bedrooms   bathrooms     stories     parking
count  545.000000  545.000000  545.000000  545.000000  545.000000  545.000000
mean     0.261189    0.240587    0.393028    0.095413    0.268502    0.231193
std      0.161943    0.149151    0.147613    0.167490    0.289164    0.287195
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000
25%      0.145455    0.134021    0.200000    0.000000    0.000000    0.000000
50%      0.224242    0.202749    0.400000    0.000000    0.333333    0.000000
75%      0.345455    0.323711    0.400000    0.333333    0.333333    0.333333
max      1.000000    1.000000    1.000000    1.000000    1.000000    1.000000


# SECTION 5 Handle outliers

In [50]:

def handle_outliers(df, method="iqr"):
    """
    Detect and handle outliers in numeric columns.
    - method="iqr": Remove outliers beyond 1.5*IQR.
    - method="winsorize": Replace extreme values with 5th/95th percentiles.
    """
    print("\n[INFO] Handling outliers...")

    numeric_cols = df.select_dtypes(include=[np.number]).columns  # Get numeric columns

    for col in numeric_cols:
        Q1 = df[col].quantile(0.25)  # 25th percentile
        Q3 = df[col].quantile(0.75)  # 75th percentile
        IQR = Q3 - Q1  # Interquartile range

        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        if method == "iqr":
            # Remove outliers
            df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
        elif method == "winsorize":
            # Replace extreme values
            df[col] = np.where(df[col] < lower_bound, Q1, df[col])
            df[col] = np.where(df[col] > upper_bound, Q3, df[col])

    print("[SUCCESS] Outliers handled using method:", method)
    print(df.head())  # Display the first rows after handling outliers
    return df

# Example usage
df = handle_outliers(df, method="iqr")


[INFO] Handling outliers...
[SUCCESS] Outliers handled using method: iqr
       price      area  bedrooms  bathrooms   stories   parking  mainroad_yes  \
15  0.636364  0.298969       0.6        0.0  0.333333  0.666667          True   
20  0.606061  0.183505       0.4        0.0  0.333333  0.666667          True   
22  0.596970  0.439863       0.4        0.0  0.000000  0.333333          True   
27  0.575758  0.496564       0.4        0.0  0.000000  0.333333          True   
40  0.530303  0.336770       0.4        0.0  0.333333  0.000000          True   

    guestroom_yes  basement_yes  hotwaterheating_yes  airconditioning_yes  \
15          False          True                False                False   
20          False          True                 True                False   
22           True          True                False                 True   
27          False         False                False                False   
40          False          True                False  

# Section 6 Apply log transformation

In [51]:
def log_transform(df, columns):
    """
    Apply log transformation to specified columns to reduce skewness.
    - columns: list of column names to transform.
    """
    print("\n[INFO] Applying log transformation...")

    for col in columns:
        if col in df.columns:
            df[col] = np.log1p(df[col])  # log1p avoids issues with zero values

    print("[SUCCESS] Log transformation applied to:", columns)
    print(df.head())
    return df

# Example usage
log_cols = ["income", "loan_amount", "monthly_payment"]  # Adjust based on skewed data
df = log_transform(df, log_cols)


[INFO] Applying log transformation...
[SUCCESS] Log transformation applied to: ['income', 'loan_amount', 'monthly_payment']
       price      area  bedrooms  bathrooms   stories   parking  mainroad_yes  \
15  0.636364  0.298969       0.6        0.0  0.333333  0.666667          True   
20  0.606061  0.183505       0.4        0.0  0.333333  0.666667          True   
22  0.596970  0.439863       0.4        0.0  0.000000  0.333333          True   
27  0.575758  0.496564       0.4        0.0  0.000000  0.333333          True   
40  0.530303  0.336770       0.4        0.0  0.333333  0.000000          True   

    guestroom_yes  basement_yes  hotwaterheating_yes  airconditioning_yes  \
15          False          True                False                False   
20          False          True                 True                False   
22           True          True                False                 True   
27          False         False                False                False   
40 

# Section 7 Scale features using RobustScaler

In [52]:
from sklearn.preprocessing import RobustScaler

def scale_features_robust(df):
    """
    Scale numeric features using RobustScaler to handle outliers.
    """
    print("\n[INFO] Scaling numeric features using RobustScaler...")

    numeric_cols = df.select_dtypes(include=[np.number]).columns  # Get numeric columns
    scaler = RobustScaler()

    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])  # Apply scaling

    print("[SUCCESS] Features scaled using RobustScaler")
    print(df.head())
    return df

# Example usage
df = scale_features_robust(df)


[INFO] Scaling numeric features using RobustScaler...
[SUCCESS] Features scaled using RobustScaler
       price      area  bedrooms  bathrooms  stories  parking  mainroad_yes  \
15  3.409091  0.820940       1.0        0.0      1.0      2.0          True   
20  3.181818  0.102991       0.0        0.0      1.0      2.0          True   
22  3.113636  1.697009       0.0        0.0      0.0      1.0          True   
27  2.954545  2.049573       0.0        0.0      0.0      1.0          True   
40  2.613636  1.055983       0.0        0.0      1.0      0.0          True   

    guestroom_yes  basement_yes  hotwaterheating_yes  airconditioning_yes  \
15          False          True                False                False   
20          False          True                 True                False   
22           True          True                False                 True   
27          False         False                False                False   
40          False          True         

#  Section 8  Select important features based on variance threshold and VIF

In [53]:
from sklearn.feature_selection import VarianceThreshold
from statsmodels.stats.outliers_influence import variance_inflation_factor

def feature_selection(df, variance_threshold=0.01, vif_threshold=10):
    """
    Select features based on variance threshold and VIF.
    - variance_threshold: Remove features with very low variance.
    - vif_threshold: Remove features with high collinearity (VIF > threshold).
    """
    print("\n[INFO] Performing feature selection...")

    # Remove low-variance features
    selector = VarianceThreshold(threshold=variance_threshold)
    df = df.loc[:, selector.fit(df).get_support()]

    # Calculate Variance Inflation Factor (VIF)
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    vif_data = pd.DataFrame()
    vif_data["feature"] = numeric_cols
    vif_data["VIF"] = [variance_inflation_factor(df[numeric_cols].values, i) for i in range(len(numeric_cols))]

    # Drop features with high VIF
    high_vif_features = vif_data[vif_data["VIF"] > vif_threshold]["feature"].tolist()
    df = df.drop(columns=high_vif_features)

    print("[SUCCESS] Feature selection complete. Removed:", high_vif_features)
    print(df.head())
    return df

# Example usage
df = feature_selection(df)


[INFO] Performing feature selection...
[SUCCESS] Feature selection complete. Removed: []
       price      area  bedrooms  stories  parking  mainroad_yes  \
15  3.409091  0.820940       1.0      1.0      2.0          True   
20  3.181818  0.102991       0.0      1.0      2.0          True   
22  3.113636  1.697009       0.0      0.0      1.0          True   
27  2.954545  2.049573       0.0      0.0      1.0          True   
40  2.613636  1.055983       0.0      1.0      0.0          True   

    guestroom_yes  basement_yes  hotwaterheating_yes  airconditioning_yes  \
15          False          True                False                False   
20          False          True                 True                False   
22           True          True                False                 True   
27          False         False                False                False   
40          False          True                False                 True   

    prefarea_yes  furnishingstatus_sem

#  Section  9 Final check for NaN/Inf

In [63]:
import numpy as np
import pandas as pd

# ============================
# SECTION 5: FULL DATA PREPROCESSING PIPELINE
# ============================
def preprocess_data(file_path):
    """
    Full data preprocessing pipeline:
    1. Load data
    2. Handle missing values
    3. Encode categorical features
    4. Scale numeric features
    5. Handle outliers
    6. Apply log transformation
    7. Scale features using RobustScaler
    8. Select important features
    9. Final check for NaN/Inf
    """
    print("\n====================")
    print("[START] Data Preprocessing Pipeline")
    print("====================\n")

    df = load_data(file_path)  # Load dataset
    df = handle_missing_values(df)  # Handle missing values
    check_nan_inf(df, "After handling missing values")

    df = encode_categorical_features(df)  # Encode categorical features
    check_nan_inf(df, "After encoding categorical features")

    df = scale_features(df, method="standard")  # Initial scaling
    check_nan_inf(df, "After initial scaling")

    df = handle_outliers(df, method="iqr")  # Handle outliers
    check_nan_inf(df, "After handling outliers")

# Select only numerical columns that contain strictly positive values
log_cols = [col for col in df.select_dtypes(include=[np.number]).columns if (df[col] > 0).all()]

if log_cols:  # Check if there are valid columns for log transformation
    df[log_cols] = df[log_cols].apply(lambda x: np.log1p(x.clip(lower=0.00001)))  # Apply log transformation with lower bound to prevent log(0)
    check_nan_inf(df, "After log transformation")  # Verify NaNs and Infs after transformation
else:
    print("[INFO] No valid columns for log transformation.")  # Log message if no suitable columns found

    df = scale_features_robust(df)  # RobustScaler
    check_nan_inf(df, "After RobustScaler scaling")

    df = feature_selection(df)  # Feature selection
    check_nan_inf(df, "After feature selection")

    # Final NaN and Inf check
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(df.median(numeric_only=True), inplace=True)
    check_nan_inf(df, "Final check before output")

    print("====================")
    print("[COMPLETE] Data Preprocessing Finished!")
    print("====================\n")

# ============================
# Helper function: Check for NaN and Inf
# ============================
def check_nan_inf(df, step):
    num_df = df.select_dtypes(include=[np.number])  # Только числовые данные
    nan_count = num_df.isna().sum().sum()
    inf_count = np.isinf(num_df.values).sum()

    print(f"[DEBUG] {step}: NaNs = {nan_count} | Infs = {inf_count}")

    if nan_count > 0 or inf_count > 0:
        print("[WARNING] Issues found! Handling NaNs and Infs...")
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        df.fillna(df.median(numeric_only=True), inplace=True)
        print("[INFO] NaN and Inf values replaced successfully.")

    return df


[INFO] No valid columns for log transformation.

[INFO] Scaling numeric features using RobustScaler...
[SUCCESS] Features scaled using RobustScaler
       price      area  bedrooms  stories  parking  mainroad_yes  \
15  3.409091  0.820940       1.0      1.0      2.0          True   
20  3.181818  0.102991       0.0      1.0      2.0          True   
22  3.113636  1.697009       0.0      0.0      1.0          True   
27  2.954545  2.049573       0.0      0.0      1.0          True   
40  2.613636  1.055983       0.0      1.0      0.0          True   

    guestroom_yes  basement_yes  hotwaterheating_yes  airconditioning_yes  \
15          False          True                False                False   
20          False          True                 True                False   
22           True          True                False                 True   
27          False         False                False                False   
40          False          True                False      

# Section 10  FULL DATA PREPROCESSING PIPELINE

In [64]:
# ============================
# SECTION 7: EXAMPLE USAGE
# ============================

# Uncomment to run:
processed_df = preprocess_data("Housing.csv")


[START] Data Preprocessing Pipeline


[INFO] Loading dataset...
[SUCCESS] Dataset loaded successfully! Shape: (545, 13) 

      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2     