In [12]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/train.csv")


In [17]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [18]:
allowed = [
    "OverallQual", "GrLivArea", "TotalBsmtSF", "GarageCars",
    "BedroomAbvGr", "FullBath", "YearBuilt", "Neighborhood", "SalePrice"
]

corr = df[allowed].copy()

corr["Neighborhood"] = corr["Neighborhood"].astype("category").cat.codes

corr_matrix = corr.corr()["SalePrice"].sort_values(ascending=False)
print(corr_matrix)


SalePrice       1.000000
OverallQual     0.790982
GrLivArea       0.708624
GarageCars      0.640409
TotalBsmtSF     0.613581
FullBath        0.560664
YearBuilt       0.522897
Neighborhood    0.210851
BedroomAbvGr    0.168213
Name: SalePrice, dtype: float64


In [19]:
features = [
    "OverallQual",
    "GrLivArea",
    "GarageCars",
    "TotalBsmtSF",
    "FullBath",
    "YearBuilt"
]

X = df[features].copy()
y = df["SalePrice"]


In [20]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   OverallQual  1460 non-null   int64
 1   GrLivArea    1460 non-null   int64
 2   GarageCars   1460 non-null   int64
 3   TotalBsmtSF  1460 non-null   int64
 4   FullBath     1460 non-null   int64
 5   YearBuilt    1460 non-null   int64
dtypes: int64(6)
memory usage: 68.6 KB


In [21]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [23]:
def evaluate_model(name, y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    print(name)
    print(f"MAE : {mae:.2f}")
    print(f"MSE : {mse:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R²  : {r2:.4f}")
    print("-" * 30)


In [24]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

evaluate_model("Linear Regression", y_test, y_pred_lr)


Linear Regression
MAE : 25319.86
MSE : 1576962754.88
RMSE: 39710.99
R²  : 0.7944
------------------------------


In [25]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

svr = SVR(kernel='rbf')
svr.fit(X_train_scaled, y_train)

y_pred_svr = svr.predict(X_test_scaled)

evaluate_model("SVR", y_test, y_pred_svr)


SVR
MAE : 59432.25
MSE : 7845528043.75
RMSE: 88574.99
R²  : -0.0228
------------------------------


In [26]:
rf = RandomForestRegressor(
    n_estimators=200,
    random_state=42
)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

evaluate_model("Random Forest", y_test, y_pred_rf)


Random Forest
MAE : 19102.90
MSE : 852145444.03
RMSE: 29191.53
R²  : 0.8889
------------------------------


In [28]:
import joblib

# Save the trained Random Forest model
joblib.dump(rf, "model_random_forest.pkl")


['model_random_forest.pkl']

In [29]:
# Load the saved Random Forest model
rf_loaded = joblib.load("model_random_forest.pkl")

# Example: test on the first 5 rows of X_test
sample_X = X_test[:5]  # or X_test_scaled if you scaled (not needed for RF)
predicted_prices = rf_loaded.predict(sample_X)

print("Predicted House Prices:", predicted_prices)

# You can also generate some random sample features to test
# Random example: [OverallQual, GrLivArea, GarageCars, TotalBsmtSF, FullBath, YearBuilt]
random_sample = np.array([[7, 2000, 2, 1000, 2, 2005],
                          [8, 2500, 3, 1200, 3, 2010],
                          [6, 1500, 1, 800, 2, 1995]])

predicted_random = rf_loaded.predict(random_sample)
print("Predicted Prices for Random Samples:", predicted_random)

Predicted House Prices: [140833.75 321682.98 113216.   171102.75 304759.58]
Predicted Prices for Random Samples: [250206.425 332248.3   157610.75 ]




In [30]:
import joblib

# Load the saved model
rf_loaded = joblib.load("model_random_forest.pkl")

