In [1]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# ===============================
# TASK 3 â€“ Housing Price Prediction
# ===============================
""" Housing Price Prediction
 Name : MUHAMMAD DAIL CHOWHAN
 Email : dailchowhan@gmail.com
 Date: 2 January'2026
 Intern ID : ARCH-2610-0765"""

print(8*'-',"Housing Price Prediction",8*'-')
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# 2. Load dataset from CSV
data = pd.read_csv('california_housing.csv')

# Display column names to verify
print("Column names:", data.columns.tolist())
print("\nFirst 5 Rows:")
print(data.head())

# 3. Check missing values
print("\nMissing Values:")
print(data.isnull().sum())

# 4. Select features & target
# The target is typically 'MedHouseVal' or 'Price' depending on the dataset
# Based on the CSV, the last column is the target (median house value)
X = data.iloc[:, :-1]  # All columns except last
y = data.iloc[:, -1]   # Last column is target (price)

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")

# 5. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 6. Train Linear Regression model
print("\n" + "="*50)
print("LINEAR REGRESSION")
print("="*50)

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# 7. Predictions
y_pred_lr = lr_model.predict(X_test)

# 8. Evaluation
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print("\nModel Evaluation:")
print(f"Mean Absolute Error: {mae_lr:.4f}")
print(f"R2 Score: {r2_lr:.4f}")

# Feature importance for Linear Regression (coefficients)
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'coefficient': lr_model.coef_
})
print("\nFeature Coefficients:")
print(feature_importance.sort_values('coefficient', ascending=False))

# 9. Train Random Forest model
print("\n" + "="*50)
print("RANDOM FOREST")
print("="*50)

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

rf_pred = rf_model.predict(X_test)

mae_rf = mean_absolute_error(y_test, rf_pred)
r2_rf = r2_score(y_test, rf_pred)

print("\nRandom Forest Results:")
print(f"MAE: {mae_rf:.4f}")
print(f"R2: {r2_rf:.4f}")

# Feature importance for Random Forest
rf_feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
})
print("\nFeature Importance (Random Forest):")
print(rf_feature_importance.sort_values('importance', ascending=False))

# 10. Compare models
print("\n" + "="*50)
print("MODEL COMPARISON")
print("="*50)
comparison = pd.DataFrame({
    'Model': ['Linear Regression', 'Random Forest'],
    'MAE': [mae_lr, mae_rf],
    'R2 Score': [r2_lr, r2_rf]
})
print(comparison)

# 11. Optional: Sample predictions vs actual
print("\n" + "="*50)
print("SAMPLE PREDICTIONS (first 10 test samples)")
print("="*50)
sample_results = pd.DataFrame({
    'Actual': y_test[:10].values,
    'LR_Predicted': y_pred_lr[:10],
    'RF_Predicted': rf_pred[:10]
})
sample_results['LR_Error'] = abs(sample_results['Actual'] - sample_results['LR_Predicted'])
sample_results['RF_Error'] = abs(sample_results['Actual'] - sample_results['RF_Predicted'])
print(sample_results.round(4))

-------- Housing Price Prediction --------
Column names: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']

First 5 Rows:
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  
0    -122.23  
1    -122.22  
2    -122.24  
3    -122.25  
4    -122.25  

Missing Values:
MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

Features shape: (20640, 7)
Target shape: (20640,)

LINEAR REGRESSION

Model Evaluation:
Mean Absolute Error: 0.4815
R2 Scor