In [0]:
# One-time table setup, kept commented out for reference (meant to be run as a %sql cell).
# It makes `unique_id` non-nullable and then declares it as the table's primary key.
# NOTE(review): presumably a prerequisite for reading the table through
# FeatureEngineeringClient later in this notebook — confirm before deleting this cell.
# %sql
# ALTER TABLE sdd_dev.sohag_test.housing 
# ALTER COLUMN unique_id SET NOT NULL;

# ALTER TABLE sdd_dev.sohag_test.housing 
# ADD CONSTRAINT housing_pk PRIMARY KEY (unique_id);

In [0]:
from math import sqrt
import mlflow.sklearn
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import *

In [0]:
from pyspark.sql import functions as F
from databricks.feature_engineering import FeatureEngineeringClient

# Client used to read the housing table.
fe = FeatureEngineeringClient()

# Name of the table that holds the housing data.
table_name = "sdd_dev.sohag_test.housing"

# Read the table, then collect the result into a pandas DataFrame for scikit-learn.
housing_table = fe.read_table(name=table_name)
feature_data_pd = housing_table.toPandas()

# Preview the first rows.
feature_data_pd.head()


In [0]:
# List the distinct categories present in the ocean_proximity column.
feature_data_pd["ocean_proximity"].unique()

In [0]:
# Integer code assigned to each ocean_proximity category (codes start at 1).
ocen_proximity_mapping = {
    category: code
    for code, category in enumerate(
        ['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
        start=1,
    )
}

# Swap each category label for its code, then store the column as float.
proximity_codes = feature_data_pd['ocean_proximity'].replace(ocen_proximity_mapping)
feature_data_pd['ocean_proximity'] = proximity_codes.astype(float)

# Fill every remaining missing value, in any column, with 0.
# NOTE(review): this covers the target column too and runs before the
# train/test split — confirm that zero-filling is the intended imputation.
feature_data_pd = feature_data_pd.fillna(value=0)

# Show the updated dataframe.
display(feature_data_pd)

In [0]:
from sklearn.model_selection import train_test_split

print(f"We have {len(feature_data_pd)} rows in the dataset")

# Separate the target variable from the feature columns.
target_col = "median_house_value"

y_all = feature_data_pd[target_col]
X_all = feature_data_pd.drop(columns=target_col)

# Hold out 20% of the rows as the test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=0.2,
    random_state=42,
)

print(f"We have {len(X_train)} rows in the training set and {len(X_test)} rows in the test set")

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Cap how many columns pandas renders; this is a global option, so it also
# affects every display that follows.
pd.set_option("display.max_columns", 10)

# Re-attach the target to the training features and compute pairwise
# correlations on the training rows only.
data = pd.concat([X_train, y_train], axis="columns")
corr = data.corr()

display(corr)

In [0]:
# Display the training-set correlation matrix as an annotated heatmap.
import seaborn as sns

# Plot `corr` (computed from the training split in the previous cell) instead of
# recomputing on the full dataset: the figure then matches the table shown by
# display(corr), and the held-out test rows stay out of exploratory analysis.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation matrix (training set)")
plt.show()

In [0]:

# Enable MLflow autologging for scikit-learn, including input examples.
mlflow.sklearn.autolog(log_input_examples=True)

# Columns passed to the model; each one is standardised by the scaler.
scaled_features = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "ocean_proximity",
]

# No `remainder` is given, so ColumnTransformer's default applies to any
# column that is not listed above.
std_ct = ColumnTransformer(
    transformers=[("scaler", StandardScaler(), scaled_features)]
)

# Pipeline that transforms the inputs and then passes the result to the linear regression model.
lr_pipeline = Pipeline(
    steps=[("preprocessor", std_ct), ("regressor", LinearRegression())]
)

In [0]:
# Fit the preprocessing + regression pipeline on the training split.
# The value returned by fit() is kept as `lr_mdl` and used for prediction below.
lr_mdl = lr_pipeline.fit(X_train, y_train)

# Predict the target for the held-out test features; the result is scored in the next cell.
predicted = lr_mdl.predict(X_test)

In [0]:
# Score the test-set predictions against the true target values.
print("Test Evaluation summaries")
test_r2 = r2_score(y_test, predicted)

# Compute MSE once and derive RMSE from it instead of scoring the predictions twice.
test_mse = mean_squared_error(y_test, predicted)
test_rmse = sqrt(test_mse)

# NOTE: scikit-learn reports MAPE as a fraction, not a percentage (0.2 means 20%).
test_mape = mean_absolute_percentage_error(y_test, predicted)

print(f"R2 Score: {test_r2}")
print(f"RMSE: {test_rmse}")
print(f"MSE: {test_mse}")
print(f"MAPE: {test_mape}")

# Examine Model Result
* We will examine the intercept and the coefficients of the fitted model.
* Perform a t-test on each coefficient to assess its significance in contributing to the overall model.

In [0]:
# Show the fitted pipeline; as the cell's last expression it is rendered as the cell output.
lr_mdl

In [0]:
from scipy import stats

# Pull the learned parameters off the pipeline's "regressor" step.
# The regressor sits behind the StandardScaler step, so each coefficient
# applies to a standardised feature rather than to the raw column.
fitted_regressor = lr_pipeline.named_steps['regressor']
coefficients, intercept = fitted_regressor.coef_, fitted_regressor.intercept_

In [0]:
# Coefficients of the fitted regressor step, as extracted in the previous cell.
print(coefficients)

In [0]:
# Intercept of the fitted regressor step, as extracted earlier in the notebook.
print(intercept)