In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Collecting pandas
  Downloading pandas-2.3.3-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.3.5-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.3-cp313-cp313-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   --- ------------------------------------ 1.0/11.0 MB 5.4 MB/s eta 0:00:02
   ----- ---------------------------------- 1.6/11.0 MB 3.5 MB/s eta 0:00:03
   -------- ------------------------------- 2.4/11.0 MB 3.9 MB/s eta 0:00:03
   ---------- ----------------------------- 2.9/11.0 MB 3.4 MB/s eta 0:00:03
   ------------- -------------------------- 3.7/11.0 MB 3.5 MB/s eta 0:00:03
   ---------------- ----------------------- 4.5/11.0 MB 3.5 MB/s eta 0:00:02
  

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
from sklearn.datasets import fetch_california_housing

# Load the dataset; the returned object exposes the entries used in later
# cells (data, target, feature_names, DESCR).
housing = fetch_california_housing()

# List the entries available on the loaded object.
housing.keys()

In [None]:
# Print the dataset's DESCR entry (its description text).
print(housing.DESCR)

In [None]:
# Print the target values; they become the "price" column in the next cell.
print(housing.target)

In [None]:
# Prepare the dataset: one column per feature, plus the target as "price".
dataset = pd.DataFrame(
    data=housing.data,
    columns=housing.feature_names,
).assign(price=housing.target)
dataset.head()

In [None]:
# Show column dtypes and non-null counts for the frame.
dataset.info()

In [None]:
# Summary statistics for each column.
dataset.describe()

In [None]:
## Count the missing values in each column.
dataset.isna().sum()

In [None]:
## EDA (exploratory data analysis)

## Pairwise correlation between all columns, including the "price" target.
dataset.corr()

In [None]:
import seaborn as sns

# Scatter of house age against price with a fitted regression line.
# (For an all-pairs overview, sns.pairplot(dataset) can be used instead.)
sns.regplot(data=dataset, x="HouseAge", y="price")


In [None]:
# Independent and dependent features: the last column ("price") is the
# dependent variable, every column before it is an independent feature.
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [None]:
# Preview the first rows of the feature frame.
X.head()

In [None]:
# Train/test split: hold out 20% of the rows, with a fixed seed so the split
# is the same on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)


In [None]:
## Standardize the dataset.
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that
# same transform to both splits.
# NOTE: X_train / X_test are overwritten here, so re-run the split cell before
# re-running this one; otherwise the scaler is refit on already-scaled data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Model training: fit a linear regression on the training split.
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so the chained form binds the fitted model.
regressor = LinearRegression().fit(X_train, y_train)
regressor

In [None]:
# Fitted coefficients, followed by the intercept.
print(regressor.coef_)
print(regressor.intercept_)

In [None]:
# Predict on the held-out test features.
pred = regressor.predict(X_test)

In [None]:
# Predicted vs. actual prices on the test set, with a least-squares line
# fitted through the points.
fig, ax = plt.subplots()
ax.scatter(pred, y_test)
ax.set_xlabel("Predicted Price")
ax.set_ylabel("Actual Price")

# A degree-1 polyfit returns the coefficients highest power first: slope, intercept.
m, b = np.polyfit(pred, y_test, deg=1)
print("Slope:", m)
print("Intercept:", b)

ax.plot(pred, m * pred + b, color="red", label="Regression Line")
ax.legend()
plt.show()

In [None]:
# Residual = actual value minus predicted value for each test row.
residuals = y_test - pred

In [None]:
# Plot the kernel density estimate of the residuals.
sns.displot(residuals,kind="kde")


In [None]:
# Scatter plot of residuals against predictions.
plt.scatter(pred, residuals)
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
# Render explicitly, as the predicted-vs-actual cell does, so the cell shows only
# the figure instead of also echoing the Text object returned by plt.ylabel.
plt.show()


In [None]:
# Performance on the test set: MAE, MSE, R^2 and adjusted R^2, in that order.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred)
print(mae)
print(mse)
# RMSE, if needed: np.sqrt(mse)

# Adjusted R^2 corrects R^2 for the number of predictors p given n test rows.
n, p = X_test.shape
r2 = r2_score(y_test, pred)
print(r2)
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print(adj_r2)

In [None]:
# New-data prediction: reshape the first sample into a single-row 2-D array
# and check the resulting shape.
housing.data[0].reshape(1,-1).shape

In [None]:
# Scale the new sample with the already-fitted scaler. The scaler was fitted on
# a DataFrame, so it remembers the column names; passing a bare ndarray makes
# scikit-learn warn that X has no valid feature names. Wrapping the row in a
# DataFrame with the same columns keeps the call warning-free and lets
# scikit-learn verify that the columns match.
new_sample = pd.DataFrame(housing.data[0].reshape(1, -1), columns=housing.feature_names)
n_data = scaler.transform(new_sample)

In [None]:
# Predict for the scaled sample.
regressor.predict(n_data)

In [None]:
# Pickle the trained model for deployment.
# NOTE: only the regressor is saved; new inputs still need the same scaling
# that was applied to the training data before they are passed to the model.
import pickle

# The context manager closes the file deterministically, so the pickle is fully
# written to disk before any later cell reads it.
with open("regmodel.pkl", "wb") as model_file:
    pickle.dump(regressor, model_file)

In [None]:
# Load the pretrained model back from disk and predict on the same scaled sample.
# The context manager closes the file handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code from the file; only load pickles
# from a trusted source.
with open("regmodel.pkl", "rb") as model_file:
    pickled_model = pickle.load(model_file)
pickled_model.predict(n_data)