In [1]:
# Manipulating dataframes
import pandas as pd
import numpy as np

# For reproducibility: seed every random source this notebook has in scope.
# `random` was imported for this purpose but never seeded, so any use of the
# stdlib generator would have differed from run to run.
import random

SEED = 493
random.seed(SEED)     # Python stdlib generator
np.random.seed(SEED)  # NumPy legacy global generator used by the np.random.* calls

In [2]:
# Number of synthetic apartments to generate
n_samples = 537

# Generate features.
# NOTE: every draw below consumes NumPy's global RNG, so the order of the
# np.random.* calls is part of the result -- reordering them changes the data.

# Living area: normal(850, 200) clamped to the range [400, 2000]
square_feet = np.clip(np.random.normal(850, 200, n_samples), 400, 2000)

# Bedroom count in {1, 2, 3, 4} with fixed probabilities
bedrooms = np.random.choice([1, 2, 3, 4], p=[0.3, 0.4, 0.2, 0.1], size=n_samples)

# Bathrooms: equal to the bedroom count, or one fewer with probability 0.3,
# but never fewer than one
bathroom_deficit = np.random.choice([0, 1], p=[0.7, 0.3], size=n_samples)
bathrooms = np.maximum(bedrooms - bathroom_deficit, 1)

# Building age in [0, 50) and floor in [1, 10) -- the upper bound is exclusive
age = np.random.randint(low=0, high=50, size=n_samples)
floor = np.random.randint(low=1, high=10, size=n_samples)

# Distance to downtown: normal(5, 2) folded onto the non-negative axis
downtown_distance = np.absolute(np.random.normal(5, 2, n_samples))

# Parking indicator: 1 with probability 0.7
has_parking = np.random.choice([0, 1], p=[0.3, 0.7], size=n_samples)

# Rent depends on the logarithm of the area rather than on the raw area
log_sqft = np.log(square_feet)

# Deterministic part of the rent: a linear combination of the features.
# The terms are summed in this exact order to keep floating-point results stable.
base_rent = (
    900
    + bedrooms * 300
    + bathrooms * 200
    + log_sqft * 250
    - age * 8
    - downtown_distance * 120
    + floor * 25
    + has_parking * 150
)

# Constant-variance noise (standard deviation 150), identical spread for every row
constant_noise = np.random.normal(loc=0, scale=150, size=n_samples)
rent = base_rent + constant_noise

# Add heteroskedastic noise: its standard deviation grows with the area
# (0.001 * square_feet ** 1.5), one draw per row
noise_scale = 0.001 * square_feet ** 1.5
heteroskedastic_noise = np.random.normal(loc=0, scale=noise_scale)
rent = rent + heteroskedastic_noise

# Assemble DataFrame: one row per generated apartment, one column per feature,
# with the target (Rent) last. Dict insertion order fixes the column order.
fakeville_columns = {
    'SquareFeet': square_feet.astype(int),  # whole square feet (fraction dropped)
    'Bedrooms': bedrooms,
    'Bathrooms': bathrooms,
    'Age': age,
    'Floor': floor,
    'DistanceToDowntown': np.round(downtown_distance, 2),  # two decimals
    'HasParking': has_parking,
    'Rent': np.round(rent, 2),  # two decimals
}
df_fakeville = pd.DataFrame(data=fakeville_columns)

# Create a copy to introduce nulls and duplicates; df_fakeville itself stays clean.
# NOTE(review): df_dirty is not used again in the visible part of the notebook --
# confirm whether it is meant to be saved or consumed further down.
df_dirty = df_fakeville.copy()

# Columns that will receive missing values
nan_columns = ['SquareFeet', 'Bedrooms', 'Bathrooms', 'Age', 'DistanceToDowntown']

# NaN cannot be stored in an integer column. Writing it with .loc relied on an
# implicit int -> float upcast, which recent pandas versions deprecate (and later
# reject), so the affected columns are cast to float explicitly up front.
df_dirty = df_dirty.astype({col: float for col in nan_columns})

# Introduce random NaNs into about 5% of the rows of each selected column.
# Every column gets its own independent draw of row labels (without repetition).
n_missing = int(0.05 * len(df_dirty))  # loop-invariant: the row count is fixed here
for col in nan_columns:
    missing_indices = np.random.choice(df_dirty.index, size=n_missing, replace=False)
    df_dirty.loc[missing_indices, col] = np.nan

# Introduce random duplicate rows (about 3% of dataset). Sampling is done with
# replacement, so the same row may be appended more than once; the appended
# copies carry whatever NaNs were injected above.
n_duplicates = int(0.03 * len(df_dirty))
duplicate_rows = df_dirty.sample(n=n_duplicates, replace=True)
df_dirty = pd.concat([df_dirty, duplicate_rows], ignore_index=True)

# NOTE(review): only the last expression of a cell is rendered, so the head()
# result below is discarded and just the shape of the clean frame is shown.
df_fakeville.head()
df_fakeville.shape

(537, 8)

In [3]:
# Save the clean synthetic dataset to CSV, without writing the row index.
# NOTE(review): this writes df_fakeville, not df_dirty -- confirm that the
# version with injected NaNs and duplicates is not the one meant to be exported.
output_csv = '../data/fakeville_rent.csv'
df_fakeville.to_csv(output_csv, index=False)