# Task 2 – Predictive Analysis Using Machine Learning (Colab Ready)
This notebook builds a regression model to predict house prices using synthetic data.
- Feature selection
- Train/test split
- Linear Regression & Random Forest
- Model evaluation (RMSE, R²)


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Seed NumPy's global generator so every draw below is repeatable.
np.random.seed(42)
n_samples = 1000

# Synthetic features. The draws must stay in this order: they all consume the
# same seeded global stream, so reordering them would change the data.
area = np.random.randint(low=500, high=4000, size=n_samples)
bedrooms = np.random.randint(low=1, high=6, size=n_samples)
age = np.random.randint(low=0, high=30, size=n_samples)
distance_city_center = np.random.uniform(low=1, high=25, size=n_samples)

# Target: a linear combination of the features plus zero-mean Gaussian noise.
base_price = (
    2000 * area
    + 500000 * bedrooms
    - 10000 * age
    - 30000 * distance_city_center
)
noise = np.random.normal(loc=0, scale=200000, size=n_samples)
price = base_price + noise

# Collect features and target into one frame; the keys become the column names.
column_values = {
    'area_sqft': area,
    'bedrooms': bedrooms,
    'age_years': age,
    'distance_city_center_km': distance_city_center,
    'price': price,
}
data = pd.DataFrame(column_values)
data.head()

Unnamed: 0,area_sqft,bedrooms,age_years,distance_city_center_km,price
0,3674,2,4,21.159136,7543262.0
1,1360,1,5,20.984331,2567262.0
2,1794,2,12,1.264759,4703425.0
3,1630,1,8,18.989092,2941471.0
4,1595,4,29,19.208233,4207331.0


In [2]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column of `data`.
data.describe()

Unnamed: 0,area_sqft,bedrooms,age_years,distance_city_center_km,price
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,2303.765,2.979,14.189,13.199527,5565893.0
std,1020.993533,1.414411,8.661657,7.016923,2242042.0
min,501.0,1.0,0.0,1.004522,828958.1
25%,1428.0,2.0,7.0,7.082623,3743904.0
50%,2258.5,3.0,14.0,13.61497,5452582.0
75%,3274.75,4.0,21.25,19.261908,7390137.0
max,3999.0,5.0,29.0,24.993131,10368740.0


In [3]:
# Predictors and target are selected from the synthetic frame by column name.
feature_cols = ['area_sqft', 'bedrooms', 'age_years', 'distance_city_center_km']
X, y = data[feature_cols], data['price']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
(X_train.shape, X_test.shape)

((800, 4), (200, 4))

In [5]:
# Baseline model: linear regression fitted on the training split and scored
# on the held-out test split.
# LinearRegression, mean_squared_error, r2_score and np are all provided by the
# notebook's import cell, so they are not re-imported here.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred_lr = lin_reg.predict(X_test)

# RMSE is the square root of the mean squared error, so it is expressed in the
# same units as `price`.
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
r2_lr = r2_score(y_test, y_pred_lr)

print('Linear Regression RMSE:', rmse_lr)
print('Linear Regression R²:', r2_lr)


Linear Regression RMSE: 187080.65551447016
Linear Regression R²: 0.993419196212908


In [7]:
# Second model: a 200-tree random forest fitted on the same training split and
# scored on the same test split.
# RandomForestRegressor, mean_squared_error, r2_score and np are all provided by
# the notebook's import cell, so they are not re-imported here.
rf = RandomForestRegressor(n_estimators=200, random_state=42)  # fixed seed for repeatable trees
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# RMSE is the square root of the mean squared error, so it is expressed in the
# same units as `price`.
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

print('Random Forest RMSE:', rmse_rf)
print('Random Forest R²:', r2_rf)

Random Forest RMSE: 257222.57619777767
Random Forest R²: 0.9875594568109195


In [8]:
# Label the fitted forest's feature_importances_ with the feature names,
# then order them from largest to smallest.
importances = pd.Series(data=rf.feature_importances_, index=feature_cols)
importances = importances.sort_values(ascending=False)
importances

Unnamed: 0,0
area_sqft,0.894446
bedrooms,0.08971
distance_city_center_km,0.011485
age_years,0.004359


## Summary
- ML model built successfully
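- Linear Regression performs better on the test set (RMSE ≈ 187k vs ≈ 257k; R² ≈ 0.993 vs ≈ 0.988), which is expected because the synthetic price is a linear function of the features plus noise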
- According to the Random Forest feature importances, area (≈ 0.89) and bedrooms (≈ 0.09) are the most important features

This completes Task 2.