####    Data Cleaning

In [None]:
import joblib
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def preprocess_data(data_path, save_scaler=True, scaler_path="scaler.pkl"):
    """
    Preprocesses real estate data for machine learning without loops.

    The numerical columns are standardized with a StandardScaler, the
    categorical column is one-hot encoded (first level dropped), both parts
    are joined column-wise, and every row that still contains a missing
    value is dropped.

    Args:
        data_path: Path to the CSV file containing the data.
        save_scaler: Boolean flag indicating whether to save the scaler object.
        scaler_path: File the fitted scaler is written to when save_scaler
            is true. Defaults to "scaler.pkl" in the current working
            directory.

    Returns:
        A tuple ``(preprocessed_data, scaler)``: the preprocessed pandas
        DataFrame and the fitted StandardScaler, so the same scaling can be
        re-applied to other data later.

    Raises:
        KeyError: If the CSV lacks one of the expected columns.
    """

    # Read data
    data = pd.read_csv(data_path)

    # Define categorical and numerical columns directly
    # NOTE(review): 'price' is scaled together with the other columns; if it
    # is the prediction target, confirm that scaling it here is intended.
    categorical_feature = "subproperty_type"
    numerical_features = [
        'price',
        'longitude',
        'fl_garden',
        'fl_terrace',
        'nbr_bedrooms',
        'total_area_sqm',
        'latitude',
        'primary_energy_consumption_sqm',
    ]

    # One-hot encode the categorical feature. A row whose category is missing
    # gets all-zero dummy columns rather than NaN, so it is not removed by the
    # dropna() below.
    encoded_data = pd.get_dummies(data[[categorical_feature]], drop_first=True)

    # Fit the scaler and scale in a single pass. StandardScaler disregards NaN
    # while fitting and keeps it in the output, which is why missing values
    # can be handled after scaling.
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(data[numerical_features])

    # Reuse the source index so the column-wise concat aligns row for row
    # with the encoded frame instead of relying on both having the same
    # default index.
    scaled_data = pd.DataFrame(
        scaled_values,
        columns=numerical_features,
        index=data.index,
    )
    preprocessed_data = pd.concat([scaled_data, encoded_data], axis=1)

    # Handling missing values (consider imputation techniques)
    preprocessed_data = preprocessed_data.dropna()

    # Save scaler if needed
    if save_scaler:
        joblib.dump(scaler, scaler_path)

    return preprocessed_data, scaler


# Example usage
# A forward slash keeps the relative path valid on both Windows and POSIX.
preprocessed_data, scaler = preprocess_data("data/properties.csv")
# DataFrame.info is a method; it must be called, and it prints the summary
# itself (wrapping the bare attribute in print() only shows the method repr).
preprocessed_data.info()


#### Overall, this script trains a linear regression model to predict property prices based on various features like the number of exterior walls, presence of a terrace, and kitchen equipment.

1. Imports:

The script starts by importing various libraries essential for its operation:

- `joblib`: Used for saving and loading the trained model and other artifacts.
- `pandas`: Used for data manipulation and analysis.
- `sklearn.impute`: Provides tools for handling missing values.
- `sklearn.linear_model`: Contains the `LinearRegression` model being used.
- `sklearn.metrics`: Used to evaluate the model's performance.
- `sklearn.model_selection`: Provides tools for splitting data into training and testing sets.
- `sklearn.preprocessing`: Includes the `OneHotEncoder` used for categorical variables.

2. train() function:

This function contains the core logic for training the model:

- Loads the data: Reads the "properties.csv" file using pandas.
- Defines features: Separates features into three categories: numerical, binary, and categorical.
- Splits data: Divides the data into training (80%) and testing (20%) sets.
- Imputes missing values: Replaces missing values in numerical features with the mean.
- One-hot encodes categorical features: Transforms categorical features into binary indicator columns.
- Combines features: Merges numerical, binary, and one-hot encoded features into a single matrix.
- Prints feature names: Lists the names of all features used in the model.
- Trains the model: Creates and trains a `LinearRegression` model using the training data.
- Evaluates the model: Calculates the R² score on both training and testing data to assess performance.
- Saves the model: Saves the trained model, imputer, encoder, and feature information as a single file.

3. `__main__` block:

This block ensures the train() function runs only when the script is executed directly, not when imported as a module.