In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so each run regenerates the identical dataset.
# NOTE: the order of the random draws below determines the output values.
np.random.seed(42)

# Number of synthetic rows to produce
n_records = 1000

# --- Study hours -----------------------------------------------------------
# Normal draw (mean 5, sd 1.5), clipped to [2, 10] and rounded to 1 decimal.
study_hours = (
    np.random.normal(loc=5, scale=1.5, size=n_records)
    .clip(2, 10)
    .round(1)
)

# --- Exam scores -----------------------------------------------------------
# Linear in study hours (50 + 5 per hour) plus N(0, 8) noise, clipped to
# [50, 100]; the integer cast truncates the fractional part.
exam_noise = np.random.normal(0, 8, n_records)
exam_scores = (50 + study_hours * 5 + exam_noise).clip(50, 100).astype(int)

# --- House area (sq ft) ----------------------------------------------------
# Gamma(shape=2, scale=500) draw cast to int, then clipped to [800, 2500].
house_area = (
    np.random.gamma(shape=2, scale=500, size=n_records)
    .astype(int)
    .clip(800, 2500)
)

# --- House prices (USD) ----------------------------------------------------
# Linear in area (100,000 base + 200 per sq ft) plus N(0, 30,000) noise,
# clipped to [150,000, 600,000] and cast to int.
price_noise = np.random.normal(0, 30_000, n_records)
house_prices = (
    (100_000 + house_area * 200 + price_noise)
    .clip(150_000, 600_000)
    .astype(int)
)

# Assemble the four columns into a single table
columns = {
    'Study_Hours': study_hours,
    'Exam_Score': exam_scores,
    'House_Area_sqft': house_area,
    'House_Price_USD': house_prices,
}
data = pd.DataFrame(columns)

# Write the table to disk without the index column and report completion
data.to_csv('exam_house_dataset.csv', index=False)
print("Dataset generated and saved as 'exam_house_dataset.csv'")

Dataset generated and saved as 'exam_house_dataset.csv'
