## Pre-processing data
### 1. Split your data into categorical and numerical columns
### 2. One-Hot Encode Categorical Features: 
# Fit a OneHotEncoder on the categorical columns and encode them in one call.
# NOTE(review): recent scikit-learn releases rename the `sparse` argument to
# `sparse_output` — confirm against the installed version.
# NOTE(review): `data` and `categorical_features` are expected to exist
# already (step 1 above); they are not defined in this snippet.
onehot_encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
data_onehot = onehot_encoder.fit_transform(data[categorical_features])

### 3. Rescale Numerical Features (Optional):
# Fit a StandardScaler on the numerical columns of the imputed data and
# transform them in one call.
# NOTE(review): `data_imputed` is not produced by any of the steps above;
# presumably it comes from the imputation step described further down — confirm.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_imputed[numerical_features])

### 4. Combine Preprocessed Data:
#### Create a new DataFrame combining the one-hot encoded categories, imputed numerical features, and optionally scaled numerical features:
# Concatenate column-wise: the one-hot columns, the imputed numerical columns
# and, when a `data_scaled` variable exists, the scaled values.
# NOTE(review): the one-hot and scaled frames are built without an explicit
# index, so this presumably assumes `data_imputed` carries a default 0..n-1
# index — confirm, otherwise rows will not line up.
# NOTE(review): recent scikit-learn releases expose the encoder's column
# names as `get_feature_names_out` — confirm against the installed version.
data_preprocessed = pd.concat([pd.DataFrame(data_onehot, columns=onehot_encoder.get_feature_names(categorical_features)),
                              pd.DataFrame(data_imputed[numerical_features], columns=numerical_features),
                              pd.DataFrame(data_scaled) if "data_scaled" in locals() else pd.DataFrame()], axis=1)

### 5. Create Reusable Pipeline:
#### Wrap the steps into a function like the previous example, ensuring the correct order:
def preprocess_data(data):
  """
  Preprocesses data for machine learning, considering MICE for missing values.

  The steps run in this order:
    1. one-hot encode every column whose name starts with ``fl_``;
    2. impute the one-hot columns together with the numerical columns;
    3. standardise the imputed numerical columns;
    4. concatenate the one-hot, imputed and scaled columns side by side.

  Columns are classified purely by name: ``fl_*`` columns are categorical,
  and every other column that does not end in ``_sqm`` is numerical.

  Args:
    data: A pandas DataFrame containing the data to preprocess.

  Returns:
    A tuple ``(data_preprocessed, onehot_encoder, imputer)``: a pandas
    DataFrame containing the preprocessed data, the fitted OneHotEncoder
    object, and the mice object used for imputation. The scaled copies of
    the numerical columns carry a ``_scaled`` suffix.
  """

  categorical_features = [col for col in data.columns if col.startswith("fl_")]
  numerical_features = [
      col for col in data.columns
      if not col.startswith("fl_") and not col.endswith("_sqm")
  ]

  # One-hot encode categorical features.
  # The dense-output argument is named `sparse_output` in recent scikit-learn
  # releases and `sparse` in older ones; try the new name first.
  try:
    onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
  except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
  encoded_values = onehot_encoder.fit_transform(data[categorical_features])

  # Same story for the feature-name accessor: prefer the newer spelling.
  feature_names = getattr(onehot_encoder, "get_feature_names_out", None)
  if feature_names is None:
    feature_names = onehot_encoder.get_feature_names

  # The encoder returns a plain array, so wrap it in a DataFrame that shares
  # the input's index; this gives it column names and keeps rows aligned.
  data_onehot = pd.DataFrame(
      encoded_values,
      columns=feature_names(categorical_features),
      index=data.index,
  )

  # Impute missing values with MICE.
  # The one-hot column names do not exist in `data`, so the frame handed to
  # the imputer is assembled from the encoded frame plus the numerical columns.
  # NOTE(review): `mice(...)` / `.complete()` come from an imputation library
  # that is not visible here; it is assumed to return a DataFrame that keeps
  # the input's columns and index — confirm.
  to_impute = pd.concat([data_onehot, data[numerical_features]], axis=1)
  imputer = mice(to_impute, printflag=False)
  data_imputed = imputer.complete()

  # Scale numerical features; name the result so it cannot be confused with
  # the unscaled columns once everything is concatenated.
  scaler = StandardScaler()
  data_scaled = pd.DataFrame(
      scaler.fit_transform(data_imputed[numerical_features]),
      columns=[f"{col}_scaled" for col in numerical_features],
      index=data_imputed.index,
  )

  # Combine preprocessed data
  data_preprocessed = pd.concat(
      [data_onehot, data_imputed[numerical_features], data_scaled],
      axis=1,
  )

  return data_preprocessed, onehot_encoder, imputer


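MICE (Multiple Imputation by Chained Equations) is available in several Python libraries, for example scikit-learn's `IterativeImputer`, `statsmodels.imputation.mice` and `miceforest`; missForest is a related, random-forest-based imputation method rather than MICE itself. 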
1. Import the library and initialize MICE
2. Impute missing values
3. Combine with remaining processing steps:
You can integrate the MICE imputation step into your existing pipeline by replacing the SimpleImputer step with MICE. Remember to adapt the missing_vars list based on the actual variables containing missing data in your dataset.

Important considerations:
MICE requires categorical variables to be one-hot encoded before imputation. Ensure you one-hot encode the relevant categorical features before applying MICE.
MICE generates multiple imputed datasets. Remember to combine them using appropriate techniques like pooling or Rubin's rules when interpreting your model results.

In [None]:
import pandas as pd
import numpy as np

# Load the raw property listings.
data = pd.read_csv('data\\properties.csv')

# Take the text-typed columns now, i.e. before 'combined_type' is added below.
object_cols = data.select_dtypes(include='object')

# Quick look at the sub-property labels and at the missing values per column.
print(data['subproperty_type'].unique())
missing_counts = data.isna().sum()
print(missing_counts)

# Merge property and sub-property type into a single label, then collapse
# the two special cases into short names.
combined_type = data['property_type'] + '_' + data['subproperty_type']
is_other = combined_type.str.endswith('OTHER_PROPERTY')
is_exceptional = combined_type.str.endswith('EXCEPTIONAL_PROPERTY')
data['combined_type'] = np.where(
    is_other,
    'OTHER',
    np.where(is_exceptional, 'EXCEPTIONAL', combined_type),
)
print(data['combined_type'].unique())

# Numerical side of the split.
numeric_cols = data.select_dtypes(include=['int64', 'float64'])

# Dummy-encode the text columns (first level of each dropped) and put them
# next to the numerical columns.
encoded_object_cols = pd.get_dummies(data=object_cols, drop_first=True)
combined_df = pd.concat([encoded_object_cols, numeric_cols], axis='columns')


Which features to keep?
- mostly numerical data
- categorical data must add insight

Hence, we will retain the following columns in the properties dataset:
- 'total_area_sqm' is the most relevant feature and, together with 'nbr_bedrooms', is a key variable according to the preliminary analysis 
- 'latitude' and 'longitude', on the other hand, are a perfect example of numerical data that carries more information than 'region', 'province' and 'locality' combined
- similarly, 'primary_energy_consumption_sqm' looks more practical than 'EPC'

What happens to the initial dataset when we retain such features?

In [88]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Features retained for the analysis, target ('price') included.
selected_columns = [
    'price',
    'subproperty_type',
    'latitude',
    'longitude',
    'total_area_sqm',
    'nbr_bedrooms',
    'fl_terrace',
    'fl_garden',
    'primary_energy_consumption_sqm',
]

# Load the listings, keep only those features and discard every row that
# has a missing value in any of them.
df = pd.read_csv('data\\properties.csv')[selected_columns].dropna()

# Dummy-encode the sub-property type, dropping its first level.
encoded_df = pd.get_dummies(data=df, columns=['subproperty_type'], drop_first=True)

import pandas as pd

import pandas as pd

def remove_outliers_zscore(df: pd.DataFrame, threshold: float = 4) -> pd.DataFrame:
  """
  Removes outliers from all numerical columns in a DataFrame using the Z-score method.

  Each ``int64``/``float64`` column is handled in turn: rows whose absolute
  Z-score in that column exceeds ``threshold`` are dropped. The mean and
  standard deviation of a column are computed on the rows that survived the
  columns processed before it.

  Columns whose standard deviation is zero or undefined (constant column,
  fewer than two rows, all values missing) are skipped, because no Z-score
  can be computed for them. Rows with a missing value in a scored column are
  dropped, since their Z-score is NaN and fails the comparison.

  Args:
    df: The DataFrame to filter. It is not modified.
    threshold: Largest absolute Z-score a row may have and still be kept.

  Returns:
    A new DataFrame containing only the rows that were kept.
  """

  df_filtered = df.copy()  # Create a copy to avoid modifying the original

  for col in df_filtered.select_dtypes(include=['int64', 'float64']):
    std = df_filtered[col].std()

    # `not std > 0` is true for both 0 and NaN. Dividing by such a value
    # yields NaN Z-scores, which would wrongly discard every row.
    if not std > 0:
      continue

    mean = df_filtered[col].mean()
    z_scores = ((df_filtered[col] - mean) / std).abs()

    df_filtered = df_filtered[z_scores <= threshold]  # Apply threshold

  return df_filtered

# Example usage: reload the complete listings file into `df`.
df = pd.read_csv('data\\properties.csv')

# Strip outlier rows from the encoded feature table and show what is left.
df_filtered = remove_outliers_zscore(encoded_df)
print(df_filtered)

# Split the freshly loaded data by dtype: text columns vs. numerical columns.
object_cols = df.select_dtypes(include='object')
numerical_cols = df.select_dtypes(include=['int64', 'float64'])


          price   latitude  longitude  total_area_sqm  nbr_bedrooms  \
0      225000.0  51.217172   4.379982           100.0           2.0   
3      501000.0  51.238312   4.817192           187.0           3.0   
13     347000.0  51.024073   4.542449           102.0           2.0   
14     110000.0  50.393419   4.469180           150.0           2.0   
16     258000.0  50.615266   5.681833            83.0           1.0   
...         ...        ...        ...             ...           ...   
75500  350000.0  51.208940   4.411511            80.0           2.0   
75501  285000.0  51.079377   5.226001           185.0           4.0   
75503  249000.0  51.321260   4.947304           192.0           4.0   
75505  199000.0  50.583152   5.506517            89.0           2.0   
75507  780000.0  50.840183   4.435570           141.0           3.0   

       fl_terrace  fl_garden  primary_energy_consumption_sqm  \
0               1          0                           231.0   
3               0  