####    Data Cleaning

In [10]:
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

def preprocess_data(data_path, save_scaler=True):
    """
    Preprocesses real estate data for machine learning without loops.

    Args:
        data_path: Path to the CSV file containing the data.
        save_scaler: Boolean flag indicating whether to save the scaler object.

    Returns:
        A pandas DataFrame containing the preprocessed data.
    """

    # Read data
    data = pd.read_csv(data_path)

    # Define categorical and numerical columns directly
    categorical_feature = "subproperty_type"
    numerical_features = [
    'price',
    'total_area_sqm',
    'nbr_bedrooms',
    'latitude',
    'longitude',
    'fl_garden',
    'fl_terrace',
    'primary_energy_consumption_sqm'
    ]

    # One-hot encode categorical features with pd.get_dummies
    encoded_data = pd.get_dummies(data[[categorical_feature]], drop_first=True)

    # Create and fit StandardScaler
    scaler = StandardScaler()
    scaler.fit(data[numerical_features])
    scaled_numerical_data = scaler.transform(data[numerical_features])

    # Detect and remove outliers
    outlier_threshold = 3  # Adjust this threshold as needed
    filtered_data = pd.DataFrame(scaled_numerical_data, columns=numerical_features)
    for col in numerical_features:
        z_scores = (data[col] - np.mean(data[col])) / np.std(data[col]) # Calculate z-scores using NumPy functions
        filtered_data = filtered_data[np.abs(z_scores) <= outlier_threshold]
        print(f"Removed {len(filtered_data) - len(filtered_data)} outliers from column {col}.")

    # Combine scaled numerical data and encoded categorical features
    preprocessed_data = pd.concat([filtered_data, encoded_data], axis=1)

    # Handling missing values (consider imputation techniques)
    preprocessed_data = preprocessed_data.dropna()

    # Save scaler if needed
    if save_scaler:
        joblib.dump(scaler, "scaler.pkl")

    return preprocessed_data, scaler


# Example usage
preprocessed_data,scaler = preprocess_data("data\\properties.csv")
print(preprocessed_data.info)


Removed 0 outliers from column price.
Removed 0 outliers from column total_area_sqm.
Removed 0 outliers from column nbr_bedrooms.
Removed 0 outliers from column latitude.
Removed 0 outliers from column longitude.
Removed 0 outliers from column fl_garden.
Removed 0 outliers from column fl_terrace.
Removed 0 outliers from column primary_energy_consumption_sqm.
<bound method DataFrame.info of           price  total_area_sqm  nbr_bedrooms  latitude  longitude  fl_garden  \
0     -0.451165       -0.153111     -0.415630  0.945167   0.053972  -0.528432   
3      0.178460        0.056100      0.112304  1.005614   0.513761  -0.528432   
13    -0.172852       -0.148301     -0.415630  0.393018   0.224829  -0.528432   
14    -0.713509       -0.032874     -0.415630 -1.410269   0.147776  -0.528432   
16    -0.375884       -0.193991     -0.943564 -0.775921   1.423053  -0.528432   
...         ...             ...           ...       ...        ...        ...   
75500 -0.166009       -0.201205     -0.4

  filtered_data = filtered_data[np.abs(z_scores) <= outlier_threshold]
  filtered_data = filtered_data[np.abs(z_scores) <= outlier_threshold]
  filtered_data = filtered_data[np.abs(z_scores) <= outlier_threshold]
  filtered_data = filtered_data[np.abs(z_scores) <= outlier_threshold]
  filtered_data = filtered_data[np.abs(z_scores) <= outlier_threshold]
  filtered_data = filtered_data[np.abs(z_scores) <= outlier_threshold]
  filtered_data = filtered_data[np.abs(z_scores) <= outlier_threshold]


#### Overall, this script trains a linear regression model to predict property prices based on various features like the number of exterior walls, presence of a terrace, and kitchen equipment.

1. Imports:

The script starts by importing various libraries essential for its operation:

joblib: Used for saving and loading the trained model and other artifacts.
pandas: Used for data manipulation and analysis.
sklearn.impute: Provides tools for handling missing values.
sklearn.linear_model: Contains the LinearRegression model being used.
sklearn.metrics: Used to evaluate the model's performance.
sklearn.model_selection: Provides tools for splitting data into training and testing sets.
sklearn.preprocessing: Includes the OneHotEncoder used for categorical variables.

2. train() function:

This function contains the core logic for training the model:

Loads the data: Reads the "properties.csv" file using pandas.
Defines features: Separates features into three categories: numerical, binary, and categorical.
Splits data: Divides data into training (80%) and testing (20%) sets.
Imputes missing values: Replaces missing values in numerical features using the mean.
One-hot encodes categorical features: Transforms categorical features into numerical binary columns.
Combines features: Merges numerical, binary, and one-hot encoded features into a single matrix.
Prints feature names: Lists the names of all features used in the model.
Trains the model: Creates and trains a LinearRegression model using the training data.
Evaluates the model: Calculates the R² score on both training and testing data to assess performance.
Saves the model: Saves the trained model, imputer, encoder, and feature information as a single file.

3. __main__ block:

This block ensures the train() function runs only when the script is executed directly, not when imported as a module.