In [4]:
import pandas as pd
import numpy as np

# 1. Load data
df = pd.read_csv("house_sales.csv")

# Calculate the number of missing values in the 'city' column and save to missing_city
missing_city = (df['city'].isna() | (df['city'] == '--')).sum()

# Output the result
print(missing_city)


# 2. Clean data
df['house_id'] = df['house_id'].astype('category')

# City: Replace missing/invalid with 'Unknown'
df['city'] = df['city'].str.replace('--', 'Unknown')
df['city'] = df['city'].astype('category')

# Sale Price: Remove missing entries
df = df.dropna(subset=['sale_price'])
df['sale_price'] = df['sale_price'].astype('int')

# Sale Date: Replace missing with 2023-01-01
df['sale_date'] = df['sale_date'].fillna('2023-01-01')

# Months Listed: Replace missing with mean, rounded to 1 decimal place
mean_months = round(df['months_listed'].mean(), 1)
df['months_listed'] = df['months_listed'].fillna(mean_months)
df['months_listed'] = df['months_listed'].astype('float')

# Bedrooms: Replace missing with mean, rounded to nearest integer
mean_bedrooms = round(df['bedrooms'].mean())
df['bedrooms'] = df['bedrooms'].fillna(mean_bedrooms).astype(int)

# House Type: Clean variations and replace missing with most common
the_map = {'Det.': 'Detached', 'Semi':'Semi-detached', 'Terr.':'Terraced' }
df['house_type'] = df['house_type'].replace(the_map)

# Filter for only allowed types or replace missing/invalid with mode
most_common_type = df['house_type'].mode()[0]
allowed_types = ["Terraced", "Semi-detached", "Detached"]
df.loc[~df['house_type'].isin(allowed_types), 'house_type'] = most_common_type
df['house_type'] = df['house_type'].fillna(most_common_type)

# Area: Remove ' sq.m.' string, convert to float, fill missing with mean
if df['area'].dtype == object:
    df['area'] = df['area'].str.replace(' sq.m.', '', regex=False).astype(float)
df['area'] = df['area'].round(1)
mean_area = round(df['area'].mean(), 1)
df['area'] = df['area'].fillna(mean_area)

clean_data = df

# Preview cleaned data
print(clean_data.head(5))


# 3. Compute Sale price by number of bedrooms
# Group by bedrooms and calculate average price and variance
home = df = pd.read_csv('house_sales.csv')
price_by_rooms = home.groupby('bedrooms').agg(avg_price=('sale_price','mean'), var_price = ('sale_price','var')).round(1).reset_index(drop=False)
print(price_by_rooms)


# 4. Fit a baseline model to predict the sale price of a house.
# Load training and validation data
training_data = pd.read_csv('train.csv')
test_data = pd.read_csv('validation.csv')


from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse

# Preprocessing: Drop non-predictive columns
# house_id and sale_date are identifiers/metadata, not features
training_data = training_data.drop(['sale_date', 'house_id'], axis=1)
test_data = test_data.drop(['sale_date'], axis=1)

# Encoding: Convert categorical variables into dummy/indicator variables
training_data = pd.get_dummies(training_data, drop_first=True, dtype=int)
test_data = pd.get_dummies(test_data, drop_first=True, dtype=int)

# Feature Selection: Separate features (X) from the target (y)
X = training_data.drop(['sale_price'], axis=1).values
y = training_data['sale_price'].values

# Data Splitting: Create training and internal testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)

# Feature Scaling: Normalize data so all features are on the same scale
scale = StandardScaler()

# Fit only on training data to prevent data leakage
trans_train_data = scale.fit_transform(X_train)
trans_test_data = scale.transform(X_test)

# Model Training: Initialize and fit the Linear Regression model
linreg = LinearRegression()

# Evaluation: Predict on the internal test set and calculate RMSE
linreg.fit(trans_train_data, y_train)
lr_ypred = linreg.predict(trans_test_data)
rmse = np.sqrt(mse(y_test, lr_ypred))

# Final Prediction: Prepare the validation data for submission
ref = test_data.drop(['house_id'], axis=1).values
test_data_mod = scale.fit_transform(ref)

# Results: Map predictions back to the original House IDs
base_result = test_data[['house_id']].assign(price = linreg.predict(test_data_mod))

# Display final results
print(f"Model RMSE: {rmse}")
print(base_result)



# 5. Fit a comparison model to predict the sale price of a house.
# Model Initialization
# Initialize the Random Forest Regressor with a fixed random state for reproducibility
rfr = RandomForestRegressor(random_state = 9)

# Hyperparameter Grid Definition
# Define the search space for the model to find the best combination of settings
param_rfs = {
    'max_depth':[6, 7, 8],
    'n_estimators': [100, 200],
    'min_samples_leaf':[5, 7, 10],
    'min_samples_split': [2, 4, 6, 8]
}



# Grid Search Configuration
# Set up GridSearchCV to cross-validate different combinations
# scoring='neg_root_mean_squared_error' uses RMSE (negative because higher is better in sklearn)
# n_jobs=-1 uses all available CPU cores for faster training
grid_rfr = GridSearchCV(estimator=rfr, param_grid=param_rfs, scoring='neg_root_mean_squared_error', n_jobs=-1)


# Training
# Run the grid search on the scaled training data
grid_rfr.fit(trans_train_data, y_train)

# Model Selection
# Extract the best-performing model found during the search
best_rfr = grid_rfr.best_estimator_

# Evaluation
# Use the optimized model to predict on the internal test set
rfr_y_pred = best_rfr.predict(trans_test_data)

# Calculate the Root Mean Squared Error for the Random Forest model
rfr_rmse = np.sqrt(mse(y_test, rfr_y_pred))

# Final Output Generation
# Predict prices for the validation set using the best Random Forest model
# Note: Ensure test_data_mod was scaled using scale.transform() from earlier
compare_result = test_data[['house_id']].assign(price = best_rfr.predict(test_data_mod))

print(compare_result.head(10))

# Display the performance metric
print(f"Random Forest Best RMSE: {rfr_rmse}")
print(f"Best Parameters: {grid_rfr.best_params_}")

73
  house_id        city  sale_price  ... bedrooms     house_type   area
0  1217792  Silvertown       55943  ...        2  Semi-detached  107.8
1  1900913  Silvertown      384677  ...        5       Detached  498.8
2  1174927   Riverford      281707  ...        6       Detached  542.5
3  1773666  Silvertown      373251  ...        6       Detached  528.4
4  1258487  Silvertown      328885  ...        5       Detached  477.1

[5 rows x 8 columns]
   bedrooms  avg_price     var_price
0         2    67076.4  5.652896e+08
1         3   154665.1  2.378289e+09
2         4   234704.6  1.725211e+09
3         5   301515.9  2.484328e+09
4         6   375741.3  3.924432e+09
Model RMSE: 21835.347652860648
     house_id          price
0     1331375  107393.996770
1     1630115  300982.868433
2     1645745  380724.071038
3     1336775  116321.195539
4     1888274  266780.641522
..        ...            ...
295   1986255  347532.713021
296   1896276  365763.964228
297   1758223  254136.806503
298   