# Machine Learning Project : DATASET GENERATION

## 1. Multivariate Linear Regression

In [None]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so that every run regenerates exactly the same dataset
np.random.seed(42)

# ============================
# 1. DATASET for Multivariate Linear Regression: RENTAL PRICES
# ============================

n1 = 1000  # sample count (one row per apartment)


# Features :

# --------------------------------------------
# Base apartment surface (in square meters):
# Gaussian centered on 60 sqm (std 20), bounded to the range [15, 200]
# --------------------------------------------
size_sqm = np.clip(np.random.normal(60, 20, n1), 15, 200)


# --------------------------------------------
# Neighborhood descriptors, both valued in [0, 1]
# --------------------------------------------
# Beta(2, 2) is symmetric around 0.5 → socioeconomic level of the area
neighborhood_income_index = np.random.beta(2, 2, n1)
# Beta(2, 5) puts most of its mass near 0 → noise exposure, mostly low
noise_level_index = np.random.beta(2, 5, n1)

# --------------------------------------------
# Tie apartment size to neighborhood income:
# up to +15 sqm in the richest areas, then the [15, 200] bounds are re-applied
# --------------------------------------------
size_sqm = np.clip(size_sqm + 15 * neighborhood_income_index, 15, 200)

# --------------------------------------------
# Draw num_bedrooms conditionally on apartment size
# The larger the surface, the more the distribution shifts toward many bedrooms
# --------------------------------------------
bedroom_counts = [1, 2, 3, 4, 5]

# Exclusive upper bounds (in sqm) of the first four size classes;
# surfaces at or above the last bound belong to the fifth ("very large") class
size_class_bounds = [30, 50, 80, 110]

# One probability row per size class, aligned with bedroom_counts
bedroom_profiles = [
    [0.85, 0.13, 0.02, 0.0, 0.0],    # < 30 sqm: very small units → mostly 1 bedroom
    [0.45, 0.40, 0.12, 0.03, 0.0],   # < 50 sqm: small but livable → mostly 1 or 2
    [0.15, 0.45, 0.30, 0.08, 0.02],  # < 80 sqm: standard size → mostly 2 or 3
    [0.05, 0.20, 0.40, 0.25, 0.10],  # < 110 sqm: large units → often 3 or 4
    [0.02, 0.08, 0.25, 0.35, 0.30],  # >= 110 sqm: very large → 4 or 5 dominate
]

# Size class of an apartment = number of bounds that are <= its surface
size_class = np.searchsorted(size_class_bounds, size_sqm, side="right")

# One categorical draw per apartment, in row order, from its class profile
num_bedrooms = np.array([
    np.random.choice(bedroom_counts, p=bedroom_profiles[k])
    for k in size_class
])


# --------------------------------------------
# Remaining features, each drawn independently
# --------------------------------------------

# Distance to city center (km): exponential with scale 3, so most apartments
# sit close to the center and few are far away; bounded to [0.1, 20]
distance_to_center_km = np.clip(np.random.exponential(3, n1), 0.1, 20)

# Building age (years): Gamma(shape=4, scale=10) gives positive ages with a
# long tail for very old buildings; bounded to [1, 120]
building_age_years = np.clip(np.random.gamma(4, 10, n1), 1, 120)


# Floor number: integers from 0 (ground floor) to 14 inclusive
floor = np.random.randint(0, 15, n1)

# Elevator (0 = no, 1 = yes): by default present from the 4th floor upward
has_elevator = (floor >= 4).astype(int)
# Toggle the flag on a random 10% of rows (sampled without replacement) so that
# some low floors get an elevator and some high floors lose theirs
flip_idx = np.random.choice(np.arange(n1), size=int(0.1 * n1), replace=False)
has_elevator[flip_idx] ^= 1  # XOR with 1 swaps 0 <-> 1

# Furnished flag (0 = not furnished, 1 = furnished): Bernoulli with p = 0.4
is_furnished = np.random.binomial(n=1, p=0.4, size=n1)

# --------------------------------------------
# Balcony presence (0 = no balcony, 1 = balcony)
# The probability of a balcony increases with the floor band
# --------------------------------------------
# Inclusive upper floor of the first three bands; higher floors form a fourth band
floor_band_tops = [2, 5, 9]
# Balcony probability per band: floors 0-2, 3-5, 6-9, 10 and above
balcony_prob_by_band = [0.45, 0.55, 0.70, 0.85]

# Band of each apartment = number of band tops strictly below its floor
floor_band = np.searchsorted(floor_band_tops, floor, side="left")

# One Bernoulli draw per apartment, in row order, with its band's probability
has_balcony = np.array(
    [np.random.binomial(1, balcony_prob_by_band[band]) for band in floor_band],
    dtype=int,
)


# --------------------------------------------
# Ground-truth linear model for monthly_rent (in euros)
# A multivariate linear regression fitted on this dataset is expected to
# approximate the coefficients below
# --------------------------------------------
base_price_per_sqm = 18  # base price per square meter (€/sqm)
rent_from_size = base_price_per_sqm * size_sqm

# (feature, euros added per unit of that feature); a negative value is a discount.
# Terms are accumulated one at a time, in the listed order.
linear_effects = [
    (num_bedrooms, 50),                # each extra bedroom raises the rent
    (distance_to_center_km, -40),      # each km away from the center lowers it
    (building_age_years, -2),          # older buildings are valued slightly less
    (floor, 5),                        # small premium per floor
    (has_elevator, 80),                # elevator adds convenience
    (has_balcony, 100),                # balcony adds outdoor space
    (is_furnished, 150),               # furnished units rent for more
    (neighborhood_income_index, 400),  # wealthier neighborhoods are pricier
    (noise_level_index, -200),         # noisy areas are less desirable
]

rent = rent_from_size
for feature_values, euros_per_unit in linear_effects:
    # Build a new array each time so rent_from_size itself is never modified
    rent = rent + feature_values * euros_per_unit

# Gaussian noise (mean 0, std 120 €) simulates real-world variability;
# the result is floored at 300 € to avoid unrealistically low rents
noise = np.random.normal(0, 120, n1)
monthly_rent = np.clip(rent + noise, 300, None)


# --------------------------------------------
# Assemble the final DataFrame
# Feature columns come first; "monthly_rent_eur" (the regression target,
# i.e. the value to predict) is appended as the last column
# --------------------------------------------
feature_columns = {
    "size_sqm": size_sqm.round(1),                                    # 0.1 sqm precision
    "num_bedrooms": num_bedrooms,
    "distance_to_center_km": distance_to_center_km.round(2),          # 10 m precision
    "building_age_years": building_age_years.round(0),                # whole years, kept as float
    "floor": floor,
    "has_elevator": has_elevator,
    "has_balcony": has_balcony,
    "is_furnished": is_furnished,
    "neighborhood_income_index": neighborhood_income_index.round(3),
    "noise_level_index": noise_level_index.round(3),
}

df_rent = pd.DataFrame(feature_columns)
# Target: rent rounded to the nearest euro and stored as an integer
df_rent["monthly_rent_eur"] = monthly_rent.round(0).astype(int)


# --------------------------------------------
# Write the dataset to disk and display a quick summary
# --------------------------------------------
output_path = "rental_prices.csv"
df_rent.to_csv(output_path, index=False)  # index=False: do not write the row index
print(f'The DataSet has a shape of : {df_rent.shape}')
# Preview of the first rows (rendered when this is the last expression of a notebook cell)
df_rent.head()