In [None]:
import pandas as pd

# Read the listings CSV from the working directory and preview the first rows.
csv_path = "real_estate_data.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,price,sqft,num_bedrooms,num_bathrooms,zipcode,year_built,has_pool
0,250000,1400,3,2,21044,1990,no
1,340000,1600,4,2,21045,2001,yes
2,500000,2100,4,3,21046,2015,yes
3,420000,1800,3,2,21045,1999,no
4,620000,2500,5,4,21044,2020,yes


In [None]:
# Mount Google Drive at /content/drive (google.colab is only available in the Colab runtime).
# NOTE(review): the CSV is read from a relative path in the cell above, before
# this mount runs — confirm whether the mount is still needed or should move up.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Summary statistics (count, mean, spread, quartiles) of the frame's columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One 30-bin histogram per column that df.hist can plot, on a 12x8 figure;
# tight_layout keeps the subplot titles from overlapping.
df.hist(
    figsize=(12, 8),
    bins=30,
)
plt.tight_layout()
plt.show()

In [None]:
# Pairwise Pearson correlations between the numeric columns only.
corr_matrix = df.corr(method="pearson", numeric_only=True)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Annotated heatmap of the correlation matrix, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Feature Correlations")
plt.show()

In [None]:
# Derive the age of each house relative to the fixed reference year 2025.
df["house_age"] = df["year_built"].rsub(2025)

In [None]:
# Remove the raw construction year in place.
df.drop(columns="year_built", inplace=True)

NameError: name 'df' is not defined

In [None]:
# Encode the yes/no pool flag as 1/0; any other value becomes NaN under .map().
pool_codes = {"yes": 1, "no": 0}
df["has_pool"] = df["has_pool"].map(pool_codes)

In [None]:
# One-hot encode zipcode, dropping the first level.
df = pd.get_dummies(
    data=df,
    columns=["zipcode"],
    drop_first=True,
)

In [None]:
# Re-inspect the frame after feature engineering: dtypes/non-null counts, then a preview.
df.info()
df.head()

In [None]:
# Target is the sale price (copied so it is independent of df); every other column is a feature.
y = df["price"].copy()
X = df.drop(columns="price")

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Preprocessing: fill missing values with each column's median, then standardise.
preprocessing_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
pipeline = Pipeline(steps=preprocessing_steps)

# Fit the preprocessing on the full feature frame and transform it.
X_prepared = pipeline.fit_transform(X)

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline fitted on the prepared features.
lin_reg = LinearRegression()
lin_reg.fit(X=X_prepared, y=y)

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Predictions are made on the same rows the model was fitted on, so this is
# the training error, not an estimate of out-of-sample error.
y_pred = lin_reg.predict(X_prepared)
lin_mse = mean_squared_error(y_true=y, y_pred=y_pred)
lin_rmse = np.sqrt(lin_mse)

print("Linear Regression RMSE:", lin_rmse)

In [None]:
import matplotlib.pyplot as plt

# Scatter of actual vs. predicted prices for the fitted linear model.
plt.scatter(y, y_pred)
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
# The call must stay on one line: a bare `plt.title` followed by a parenthesised
# string on the next line are two no-op expressions and the title is never set.
plt.title("Actual vs. Predicted Housing Prices")
plt.grid(True)
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score
import numpy as np

# 3-fold cross-validation; the scorer returns negated MSE, so flip the sign
# before taking the square root to get per-fold RMSE.
scores = cross_val_score(
    estimator=lin_reg,
    X=X_prepared,
    y=y,
    cv=3,
    scoring="neg_mean_squared_error",
)

rmse_scores = np.sqrt(-scores)

In [None]:
import pandas as pd
import numpy as np

# Augment the dataset with 25 synthetic listings (seeded so the draw is repeatable).
np.random.seed(42)

n_new = 25
new_data = {
    "price": np.random.randint(200000, 800000, size=n_new),
    "sqft": np.random.randint(1000, 3000, size=n_new),
    "num_bedrooms": np.random.randint(2, 6, size=n_new),
    "num_bathrooms": np.random.randint(1, 4, size=n_new),
    # Integer zip codes: the head() preview shows zipcode as numeric (presumably
    # parsed as integers by read_csv — confirm). String labels would one-hot
    # encode into different columns than the numeric ones.
    "zipcode": np.random.choice([21044, 21045, 21046], size=n_new),
    "year_built": np.random.randint(1980, 2025, size=n_new),
    # The raw data has a yes/no has_pool column; without it the synthetic rows
    # would be all-NaN there. Drawn last so the columns above keep the same
    # seeded values as before.
    "has_pool": np.random.choice(["yes", "no"], size=n_new),
}

df_fake = pd.DataFrame(new_data)

# The synthetic rows use the raw schema (year_built, zipcode, yes/no has_pool),
# so concatenate onto a fresh read of the raw CSV rather than onto the
# already-engineered `df`, whose columns no longer match.
df_raw = pd.read_csv("real_estate_data.csv")
df = pd.concat([df_raw, df_fake], ignore_index=True)

print("New dataset shape:", df.shape)
df.head()

NameError: name 'df' is not defined

In [None]:
# Apply the feature engineering to the augmented frame: house age from the
# build year, yes/no pool flag to 1/0, and one-hot encoded zipcode.
df["house_age"] = 2025 - df["year_built"]
df.drop("year_built", axis=1, inplace=True)
df["has_pool"] = df["has_pool"].map({"yes": 1, "no":0})
df = pd.get_dummies(df, columns=["zipcode"], drop_first=True)

# Rebuild features and target from the updated frame. X and y were last
# assigned before the new rows were added, so without this the cells below
# would keep modelling the old data.
X = df.drop("price", axis=1)
y = df["price"].copy()

In [None]:
# Only the last expression of a cell is displayed, so print the first
# diagnostic explicitly instead of silently discarding it.
print(X.nunique())

# Missing values per feature column (shown as the cell output).
X.isnull().sum()

In [None]:
# Keep only columns with more than one distinct non-null value.
has_variation = X.nunique() > 1
X = X.loc[:, has_variation]

In [None]:
# Re-fit the imputer/scaler pipeline on the current X and transform it.
X_prepared = pipeline.fit_transform(X)

In [None]:
# First five rows of the transformed feature matrix.
X_prepared[:5]

In [None]:
# (rows, columns) of the transformed feature matrix.
X_prepared.shape

In [None]:
# 5-fold cross-validation of the linear model; negate the scorer's output to
# recover MSE, then take the square root for per-fold RMSE.
scores = cross_val_score(
    estimator=lin_reg,
    X=X_prepared,
    y=y,
    cv=5,
    scoring="neg_mean_squared_error",
)
rmse_scores = np.sqrt(-scores)

print("Cross-Validation RMSE Scores:", rmse_scores)
print("Mean:", rmse_scores.mean())
print("Standard Deviation:", rmse_scores.std())

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
import numpy as np

# Decision tree with a fixed seed, scored by 5-fold CV.
tree_reg = DecisionTreeRegressor(random_state=42)

tree_scores = cross_val_score(
    estimator=tree_reg,
    X=X_prepared,
    y=y,
    cv=5,
    scoring="neg_mean_squared_error",
)

# Scores are negated MSE; convert to per-fold RMSE.
tree_rmse = np.sqrt(-tree_scores)

print("Decision Tree RMSE Scores:", tree_rmse)
print("Mean:", tree_rmse.mean())
print("Standard Deviation:", tree_rmse.std())

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import numpy as np

# Random forest with a fixed seed, scored by 5-fold CV.
forest_reg = RandomForestRegressor(random_state=42)

forest_scores = cross_val_score(
    estimator=forest_reg,
    X=X_prepared,
    y=y,
    cv=5,
    scoring="neg_mean_squared_error",
)

# Scores are negated MSE; convert to per-fold RMSE.
forest_rmse = np.sqrt(-forest_scores)

print("Random Forest RMSE Scores:", forest_rmse)
print("Mean:", forest_rmse.mean())
print("Standard Deviation:", forest_rmse.std())

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameter grid: 3 x 3 = 9 candidate forests.
# NOTE(review): max_features goes up to 8 — confirm X_prepared has at least
# that many columns.
param_grid = {
    "n_estimators": [30, 50, 100],
    "max_features": [4, 6, 8],
}

# Define the estimator under the name GridSearchCV is given below, so this
# cell does not depend on a forest_reg left over from an earlier cell.
forest_reg = RandomForestRegressor(random_state=42)

# 5-fold CV per candidate, scored by negated MSE; training scores are kept in
# cv_results_ as well.
grid_search = GridSearchCV(
    forest_reg, param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
    return_train_score=True,
)
grid_search.fit(X_prepared, y)

In [None]:
# best_score_ is negated MSE; flip the sign and take the root to report RMSE.
best_rmse = np.sqrt(-grid_search.best_score_)

print("Best parameters:", grid_search.best_params_)
print("Best RMSE:", best_rmse)