# In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's global random state so every run reproduces the same dataset.
np.random.seed(42)

# Number of rows to draw before invalid samples are filtered out.
max_rows = 1_000_000

# --- Helper function to generate plausible apartment area ---
def get_area_based_on_rooms(total_rooms):
    """Draw a random apartment area (m²) for a given number of rooms.

    The area is sampled from a normal distribution whose mean and standard
    deviation depend on ``total_rooms``. Room counts 1-5 use tabulated values;
    any other count extrapolates linearly from the 5-room entry.

    The sample is rounded to one decimal and never falls below
    ``max(10, mean - 3 * std)``.

    Consumes one draw from NumPy's global random state.
    """
    # (mean, std) of the area in m² per tabulated room count.
    tabulated = {
        1: (30, 5),
        2: (43, 8),
        3: (57, 10),
        4: (69, 12),
        5: (80, 15),
    }

    stats = tabulated.get(total_rooms)
    if stats is None:
        # Outside the table: grow mean and spread per room beyond five.
        rooms_beyond_five = total_rooms - 5
        stats = (80 + rooms_beyond_five * 13, 15 + rooms_beyond_five * 3)
    mu, sigma = stats

    sampled = round(np.random.normal(mu, sigma), 1)
    # Lower bound: three sigmas below the mean, but at least 10 m².
    floor = max(10, mu - 3 * sigma)
    return max(sampled, floor)

# --- Data generation ---

# Household demographics. Every call below advances NumPy's global random
# state, so the order of the draws is part of the dataset definition.
ages = np.random.choice(np.arange(18, 75), max_rows)
adults = np.random.choice([1, 2], max_rows, p=[0.5, 0.5])
children = np.random.poisson(lam=1, size=max_rows)
is_student = np.random.choice([True, False], size=max_rows, p=[0.25, 0.75])

# Rooms: one per adult, one per two children, plus one or two extra rooms.
extra_rooms = np.random.choice([1, 2], max_rows)
total_rooms = adults + children // 2 + extra_rooms

# One numeric area (m²) per sample, drawn from the room-count distribution.
areas = np.array(list(map(get_area_based_on_rooms, total_rooms)))

# Rent is normally distributed around a linear function of area and rooms,
# with a standard deviation of 50, then rounded to whole units.
expected_rent = 7 * areas + (total_rooms - 2) * 75
rents = np.round(np.random.normal(expected_rent, 50)).astype(int)

# Keep only plausible samples: positive rent, at least one adult, a minimum
# area of 10 m², and at least one room per household member.
valid_idx = (
    (rents > 0)
    & (adults >= 1)
    & (areas >= 10)
    & (total_rooms >= adults + children)
)

# Apply the validity filter to every per-sample array.
ages, adults, children, is_student, total_rooms, areas, rents = (
    column[valid_idx]
    for column in (ages, adults, children, is_student, total_rooms, areas, rents)
)

# People aged 21 or younger get no children. This runs after rooms, rents
# and the validity filter were derived from the original child counts.
children = np.where(ages <= 21, 0, children)

# Final row count: the shortest array, capped at max_rows.
sample_size = min(
    max_rows,
    *map(len, (ages, adults, total_rooms, areas, rents, is_student)),
)

# Distance to the new tenancy, floored at 0.5.
tenancy_distance = np.maximum(np.random.normal(5, 4, sample_size), 0.5)

# University distance exists only for students; everyone else stays NaN.
student_mask = is_student[:sample_size]
distance_to_university = np.full(sample_size, np.nan)
distance_to_university[student_mask] = np.maximum(
    np.random.normal(2, 1, np.count_nonzero(student_mask)), 0.1
)

# Amenity distances are drawn independently of the apartment area; each is
# floored at its own minimum.
hospital_distance = np.maximum(np.random.normal(6, 4, sample_size), 1.5)
gym_distance = np.maximum(np.random.normal(2, 4, sample_size), 0.5)
school_distance = np.maximum(np.random.normal(6, 4, sample_size), 2)
supermarket_distance = np.maximum(np.random.normal(2, 2, sample_size), 0.5)

# Assemble the dataset; distances are stored with two decimals.
df = pd.DataFrame({
    'Age': ages[:sample_size],
    'Adults': adults[:sample_size],
    'Children': children[:sample_size],
    'Rent': rents[:sample_size],
    'IsStudent': is_student[:sample_size],
    'Distance_to_New_Tenancy': np.round(tenancy_distance, 2),
    'Total_Rooms': total_rooms[:sample_size],
    'Area_m2': areas[:sample_size],  # area in square meters
    'Hospital_distance': np.round(hospital_distance, 2),
    'Gym_distance': np.round(gym_distance, 2),
    'School_distance': np.round(school_distance, 2),
    'Supermarket_distance': np.round(supermarket_distance, 2),
    'Distance_to_University': distance_to_university.round(2),
})

# Adjust label logic if you want area to influence desirability
# Masks
# Row selectors: students form one group; non-students are split into three
# age bands (under 25, 25-45 inclusive, over 45).
is_student_mask = df['IsStudent'].astype(bool)
not_student = ~is_student_mask

mask_age_over_45 = not_student & (df['Age'] > 45)
mask_age_mid = not_student & df['Age'].between(25, 45)
mask_age_under_25 = not_student & (df['Age'] < 25)

# One score per row, filled in group by group.
label = np.zeros(df.shape[0])

# ------------------------------
# 1. Students - university distance carries the largest weight and school
#    distance is not considered.
students = df.loc[is_student_mask]
label[is_student_mask] = (
    1
    - 0.08 * students['Distance_to_University'].fillna(0)  # highest weight
    - 0.05 * students['Hospital_distance']
    - 0.04 * students['Supermarket_distance']
    - 0.03 * students['Gym_distance']
    - 0.03 * students['Rent'] / 1000
    + 0.02 * students['Total_Rooms']
    + 0.003 * students['Area_m2']
)

# ------------------------------
# Helper function for non-students with children logic
def calc_non_student_label(mask, hospital_w, supermarket_w, tenancy_w, gym_w,
                           school_w_children, data=None):
    """Compute the unclipped desirability score for a group of non-students.

    The score starts at 1, subtracts weighted distances and a rent term, and
    adds small bonuses for room count and area. School distance is penalized
    only for rows with at least one child.

    Args:
        mask: Boolean row selector, applied with ``.loc``, for the group.
        hospital_w: Weight of ``Hospital_distance``.
        supermarket_w: Weight of ``Supermarket_distance``.
        tenancy_w: Weight of ``Distance_to_New_Tenancy``.
        gym_w: Weight of ``Gym_distance``.
        school_w_children: Weight of ``School_distance`` for rows where
            ``Children > 0``; rows without children get no school penalty.
        data: DataFrame to score. Defaults to the module-level ``df`` so
            existing calls keep working.

    Returns:
        A pandas Series of scores indexed like the selected rows.
    """
    frame = df if data is None else data
    # Select the group's rows once instead of once per column.
    rows = frame.loc[mask]

    score = (
        1
        - hospital_w * rows['Hospital_distance']
        - supermarket_w * rows['Supermarket_distance']
    )

    # School distance only counts for households with children.
    school_penalty = (school_w_children * rows['School_distance']).where(
        rows['Children'] > 0, 0.0
    )
    score = score - school_penalty

    # Tenancy distance applies to every row in the group.
    score = score - tenancy_w * rows['Distance_to_New_Tenancy']

    # Remaining factors use the same fixed weights as the student formula.
    score = score - gym_w * rows['Gym_distance']
    score = score - 0.03 * rows['Rent'] / 1000
    score = score + 0.02 * rows['Total_Rooms']
    score = score + 0.003 * rows['Area_m2']

    return score

# ------------------------------
# 2.-4. Non-student age bands. Each entry pairs a row mask with the weights
# passed to calc_non_student_label for that band.
age_band_weights = (
    # 2. Age > 45
    (mask_age_over_45, dict(
        hospital_w=0.07,
        supermarket_w=0.05,
        tenancy_w=0.04,
        gym_w=0.03,
        school_w_children=0.04,
    )),
    # 3. Age between 25 and 45
    (mask_age_mid, dict(
        hospital_w=0.07,
        supermarket_w=0.07,
        tenancy_w=0.05,
        gym_w=0.05,
        school_w_children=0.06,
    )),
    # 4. Age < 25
    (mask_age_under_25, dict(
        hospital_w=0.05,
        supermarket_w=0.07,
        tenancy_w=0.04,
        gym_w=0.05,
        school_w_children=0.04,
    )),
)

for band_mask, weights in age_band_weights:
    label[band_mask] = calc_non_student_label(band_mask, **weights)

# ------------------------------
# Finalize: restrict scores to [0, 1], keep two decimals, and export.
df['Label'] = np.round(np.clip(label, 0, 1), 2)

df.to_excel('synthetic_housing_data.xlsx', index=False)
