# INSTALLING THE REQUIRED LIBRARIES
Here we install the libraries that we need for this project.

In [None]:
!pip install pandas scikit-learn matplotlib seaborn



# IMPORTING ALL THE LIBRARIES
Here we import the libraries that we just installed above.

In [None]:
#IMPORTING THE LIBRARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pickle

# LOADING THE DATASET
Here we load the California housing dataset into our notebook.
Then we show its first five rows, with all of the columns.

In [None]:
# Fetch the California housing dataset through scikit-learn.
california = fetch_california_housing()

# Unpack the feature matrix (x) and the target vector (y) in one step.
x, y = california.data, california.target

In [None]:
# Build a labelled frame: one column per feature, plus the target column.
df = pd.DataFrame(x, columns=california.feature_names).assign(
    **{"MedHouseVal(PRICE)": y}
)

# Preview the first five rows.
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal(PRICE)
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


# INFORMATION ABOUT OUR DATASET

In [None]:
# Show the column names, non-null counts, dtypes and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   MedInc              20640 non-null  float64
 1   HouseAge            20640 non-null  float64
 2   AveRooms            20640 non-null  float64
 3   AveBedrms           20640 non-null  float64
 4   Population          20640 non-null  float64
 5   AveOccup            20640 non-null  float64
 6   Latitude            20640 non-null  float64
 7   Longitude           20640 non-null  float64
 8   MedHouseVal(PRICE)  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal(PRICE)
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [None]:
# Count the missing entries in each column (isna is the same check as isnull).
df.isna().sum()

Unnamed: 0,0
MedInc,0
HouseAge,0
AveRooms,0
AveBedrms,0
Population,0
AveOccup,0
Latitude,0
Longitude,0
MedHouseVal(PRICE),0


HERE
- WE GOT INFORMATION ABOUT OUR DATASET USING df.info()
- THEN WE DESCRIBED OUR DATASET USING df.describe(), WHICH GAVE US THE SUMMARY STATISTICS OF OUR DATA LIKE THE MEAN, MEDIAN (THE 50% ROW), MIN, MAX AND STANDARD DEVIATION
- WE ALSO CHECKED FOR MISSING VALUES USING df.isnull().sum() AND FOUND NONE

# DEFINING OUR INPUTS AND OUTPUT



In [None]:
# List the column names so the feature and target columns can be selected by name.
df.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude', 'MedHouseVal(PRICE)'],
      dtype='object')

In [None]:
# The target is the price column; every remaining column is a predictor.
y = df["MedHouseVal(PRICE)"]
x = df.drop(columns="MedHouseVal(PRICE)")


HERE WE DEFINED OUR INPUTS AND OUTPUT
- INPUTS(x) - EVERYTHING USED TO MAKE THE PREDICTION
- OUTPUT(y) - THE VALUE WE WANT TO PREDICT

FOR OUR INPUTS, WE HAD
- MedInc: Median income of people living in that area
- HouseAge: Median age of the houses in that area
- AveRooms: Average number of rooms per household in that area
- AveBedrms: Average number of bedrooms per household in that area
- Population: Number of people living in that area
- AveOccup: Average number of people living in each household
- Latitude and Longitude: The location of the area

FOR THE OUTPUT, WE HAD
- MedHouseVal(PRICE): Median house value of the area (in units of $100,000). It is the price that we want to predict



# TRAINING AND TESTING DATA: SPLITTING THE DATA INTO TRAINING AND TESTING SETS

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(x, y, random_state=42, test_size=0.2)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# (rows, columns) of the training and testing feature sets.
x_train.shape, x_test.shape

((16512, 8), (4128, 8))

In [None]:
# Print the training features followed by the training targets.
print(x_train, y_train, sep="\n")

       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
14196  3.2596      33.0  5.017657   1.006421      2300.0  3.691814     32.71   
8267   3.8125      49.0  4.473545   1.041005      1314.0  1.738095     33.77   
17445  4.1563       4.0  5.645833   0.985119       915.0  2.723214     34.66   
14265  1.9425      36.0  4.002817   1.033803      1418.0  3.994366     32.69   
2271   3.5542      43.0  6.268421   1.134211       874.0  2.300000     36.78   
...       ...       ...       ...        ...         ...       ...       ...   
11284  6.3700      35.0  6.129032   0.926267       658.0  3.032258     33.78   
11964  3.0500      33.0  6.868597   1.269488      1753.0  3.904232     34.02   
5390   2.9344      36.0  3.986717   1.079696      1756.0  3.332068     34.03   
860    5.7192      15.0  6.395349   1.067979      1777.0  3.178891     37.58   
15795  2.5755      52.0  3.402576   1.058776      2619.0  2.108696     37.77   

       Longitude  
14196    -117.03  
8

## HERE WE SPLIT THE DATA INTO A TRAINING SET AND A TESTING SET
### 1. TRAINING SET(x_train, y_train):
we pick 80% of the data and we use it to teach our model
### 2. TESTING SET(x_test, y_test):
we keep the remaining 20% of the data and we use it to see how the model performs on data it has not seen

## x_train.shape: it shows the number of rows and columns, which confirms that the split worked (16512 training rows and 4128 testing rows)

# SCALING THE FEATURES OF OUR DATASET


In [None]:
# Learn each feature's mean and standard deviation from the training rows only.
scaler = StandardScaler().fit(x_train)

# Apply those training statistics to both splits; the test data is never fitted.
x_train_scaled, x_test_scaled = scaler.transform(x_train), scaler.transform(x_test)

## HERE WE SCALE THE FEATURES OF OUR DATASET
That means that we want to put all the features on the same scale, i.e.
mean = 0, standard deviation = 1

### HERE WE NOTICE THAT FIT WAS USED ON THE TRAINING DATA ONLY AND NOT ON THE TEST DATA
THIS IS BECAUSE THE SCALER MUST LEARN ITS MEAN AND STANDARD DEVIATION FROM THE TRAINING DATA AND THEN APPLY THAT SAME SCALE TO BOTH SETS. WE NEVER FIT ON TEST DATA BECAUSE IT WOULD LEAK INFORMATION, WHICH MEANS
- THAT THE MODEL WILL GET INFORMATION ABOUT THE TEST DATA BEFORE EVALUATION
- IT MAKES THE PERFORMANCE LOOK BETTER THAN IT ACTUALLY IS

TEST DATA MUST REPRESENT NEW, UNSEEN DATA

# TRAINING THE MODELS

In [None]:
# fit() returns the estimator itself, so construction and training can be chained.
linear_model = LinearRegression().fit(x_train_scaled, y_train)
ridge_model = Ridge(alpha=1.0).fit(x_train_scaled, y_train)

# Lasso is fitted as the cell's last expression so the fitted estimator is
# still displayed as the cell output.
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(x_train_scaled, y_train)

### HERE WE TRAIN THREE DIFFERENT MODELS ON THE SAME TRAINING DATA
- LINEAR REGRESSION
- RIDGE
- LASSO
#### - LINEAR REGRESSION
THE BASIC MODEL, WITH NO REGULARIZATION. IT JUST FITS A STRAIGHT-LINE (LINEAR) RELATIONSHIP BETWEEN THE FEATURES AND THE PRICE

#### - RIDGE
IT ADDS L2 REGULARIZATION TO PREVENT OVERFITTING AND IT ALSO ENHANCES STABILITY

#### - LASSO
IT ADDS L1 REGULARIZATION, WHICH CAN SHRINK THE COEFFICIENTS OF UNNECESSARY FEATURES TO ZERO AND SO REMOVE THEM

## WE USED .fit()
THIS IS WHERE THE MODEL LEARNS FROM THE TRAINING DATA
- IT STUDIES THE PATTERNS IN x_train_scaled
- IT LEARNS HOW X RELATES TO Y
- IT BUILDS AN INTERNAL FORMULA TO PREDICT THE PRICES

# EVALUATING THE THREE MODELS (LINEAR, RIDGE, LASSO) WITH RMSE, MAE AND R2

In [None]:
# Map a display name to each fitted estimator so all three are scored the same way.
models = {}
models["Linear Regression"] = linear_model
models["Ridge Regression"] = ridge_model
models["Lasso Regression"] = lasso_model

for name in models:
    model = models[name]

    # Predict on the held-out test features, scaled with the training statistics.
    y_pred = model.predict(x_test_scaled)

    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5  # RMSE is the square root of the mean squared error

    # Report every metric rounded to four decimal places, then a blank separator line.
    print("{}".format(name))
    for label, value in (("RMSE: ", rmse), ("MAE: ", mae), ("R2: ", r2)):
        print(label, round(value, 4))
    print()

Linear Regression
RMSE:  0.7456
MAE:  0.5332
R2:  0.5758

Ridge Regression
RMSE:  0.7456
MAE:  0.5332
R2:  0.5758

Lasso Regression
RMSE:  0.8244
MAE:  0.6222
R2:  0.4814



### HERE WE EVALUATED OUR THREE MODELS TO KNOW THE ONE THAT IS BEST FOR OUR PREDICTION APP
##### THE CODE ABOVE LOOPS THROUGH THE THREE MODELS AND CALCULATES
- RMSE - ROOT MEAN SQUARE ERROR: HOW FAR THE PREDICTIONS ARE FROM THE TRUE PRICE, WITH LARGE ERRORS PENALISED MORE
- MAE - MEAN ABSOLUTE ERROR: AVERAGE ERROR PER PREDICTION
- R2 SCORE: HOW WELL THE MODEL EXPLAINS VARIANCE IN THE DATA:
1.0 - PERFECT, 0 - NO BETTER THAN ALWAYS PREDICTING THE MEAN PRICE, NEGATIVE - WORSE THAN THAT

#### AFTER THAT WE PICKED OUR BEST MODEL
THE BEST MODEL IS THE MODEL WITH THE HIGHEST R2 AND THE LOWEST RMSE


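# SAVING THE BEST MODEL
LINEAR REGRESSION AND RIDGE REGRESSION ACHIEVE THE SAME SCORES (TO FOUR DECIMAL PLACES) AND BOTH BEAT LASSO. WE CHOOSE THE RIDGE REGRESSION BECAUSE ITS REGULARIZATION MAKES IT MORE STABLE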

In [None]:
# Persist the chosen estimator together with the scaler that was fitted on the
# training data, so both can be loaded again outside this notebook.
best_model = ridge_model

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

with open("model.pkl", "wb") as f:
    pickle.dump(best_model, f)

# Report success only after both files have been written and closed.
print("MODEL AND SCALER HAS BEEN SAVED SUCCESSFULLY")

MODEL AND SCALER HAS BEEN SAVED SUCCESSFULLY


### FROM THE PREVIOUS STEP, WE CHOSE THE RIDGE MODEL AS OUR BEST MODEL, SO THAT IS WHAT WE SAVE
#### HERE WE CREATE ONE FILE FOR OUR SCALER (scaler.pkl) AND ONE FILE FOR OUR BEST MODEL (model.pkl). THEY ARE SAVED IN THE WORKING FOLDER OF OUR COLAB NOTEBOOK.
THESE ARE THE FILES WE WOULD USE TO CREATE OUR WEB APP