In [0]:
import os 
import pandas  as pd 
import numpy   as np
from matplotlib import pyplot as plt
import seaborn as sns

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [0]:
# Read the housing CSV into `df` and preview the first rows.
housing_csv_path = '/dbfs/FileStore/tables/USA_Housing.csv'
df = pd.read_csv(housing_csv_path)
df.head()

In [0]:
# The Address values contain embedded newlines. Join the pieces with a single
# space (removing the newline outright glues the text on either side together),
# then strip the ends so the value still finishes on its last real character.
# regex=False makes it explicit that "\n" is a literal, not a pattern.
df["Address"] = df["Address"].str.replace("\n", " ", regex=False).str.strip()
df.head()

In [0]:
# Summarize column dtypes and non-null counts for the loaded frame.
df.info()

In [0]:
# Step 1: capture the trailing ZIP (five digits, optionally followed by "-" and
# up to four more digits) anchored at the end of the address.
zip_with_suffix = df["Address"].str.extract(r"([0-9]{5}\-{0,1}[0-9]{0,4}$)", expand=True)
# Step 2: strip the "-NNNN" suffix so only the leading part of the match is kept.
df["zip_code"] = zip_with_suffix.replace(r"\-[0-9]{0,4}$", "", regex=True)


In [0]:
# Preview the first rows, which now include the extracted zip_code column.
df.head()


In [0]:
# Re-check dtypes and non-null counts now that zip_code has been added;
# a non-null count below the row count would mean some addresses had no match.
df.info()

In [0]:
# Count how many rows share each ZIP code.
# value_counts must be *called*: without the parentheses the cell only
# displays the bound method object instead of the counts.
df["zip_code"].value_counts()

In [0]:
# Cast the extracted ZIP strings to integers (in place on the same column).
df["zip_code"] = df["zip_code"].astype("int")

In [0]:
# Largest number of rows that share a single ZIP code.
zip_frequencies = df["zip_code"].value_counts()
zip_frequencies.max()

In [0]:
# zip_code was only needed for the frequency check above; remove it again.
df = df.drop(columns=["zip_code"])

In [0]:
# Keep only the first occurrence of any fully duplicated row.
df = df.drop_duplicates()

In [0]:
# Confirm the remaining row count and columns after dropping zip_code and duplicate rows.
df.info()

In [0]:
# Normalize column labels to lowercase snake_case identifiers.
# "." is removed in addition to replacing spaces: the labels are later joined
# into a patsy formula string and referenced by literal name
# (e.g. 'avg_area_number_of_bedrooms'), so a label such as 'avg._area_income'
# would neither parse as a variable name there nor match those literals.
# Labels without "." are unaffected by the extra step.
df.rename(
    columns={
        col: col.lower().strip().replace(".", "").replace(" ", "_")
        for col in df.columns
    },
    inplace=True,
)

In [0]:
# Display the column labels after normalization.
df.columns

In [0]:
# Histogram of the target variable to inspect the shape of its distribution.
sns.histplot(data=df, x="price")

In [0]:
# Skewness of the price distribution (values near 0 indicate symmetry).
df["price"].skew()

In [0]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [0]:
# Pairwise scatter plots of the numeric columns, with each variable's
# own distribution on the diagonal.
sns.pairplot(data=df)

In [0]:
# Start from every column, then take out the target and the free-text address;
# what remains is the list of predictor names.
column_list = list(df.columns)
for excluded in ('price', 'address'):
    column_list.remove(excluded)

# Right-hand side of the regression formula: predictors joined with '+'.
indp_vars = '+'.join(column_list)
indp_vars

In [0]:


# Baseline OLS fit: regress price on every predictor named in `indp_vars`
# (the '+'-joined string built in the previous cell).
# The stray `inpd_vars = df.columns` assignment was removed: it was a
# misspelling of `indp_vars` and its value was never read.
model = smf.ols(formula=f'price ~ {indp_vars}', data=df)
result = model.fit()

# print() renders the summary as plain text (coefficients, p-values, R-squared).
print(result.summary())


In [0]:
## avg_area_number_of_bedrooms appears insignificant (p-value > 0.05); removing it and refitting

# Drop the bedrooms predictor from the working list (this mutates column_list,
# which later cells reuse) and rebuild the right-hand side of the formula.
column_list.remove('avg_area_number_of_bedrooms')
indp_vars = '+'.join(column_list)

# Refit OLS on the reduced predictor set and print the text summary.
reduced_formula = f'price ~ {indp_vars}'
model2 = smf.ols(formula=reduced_formula, data=df)
result2 = model2.fit()
print(result2.summary())


In [0]:
## all independent variables now appear significant, as all p-values are now close to zero, with a high R-squared value of 0.918.
## the summary message suggests multicollinearity, so we'll check it using the Variance Inflation Factor (VIF)

from statsmodels.tools.tools               import add_constant
from statsmodels.stats.outliers_influence  import variance_inflation_factor as vif

# Design matrix for the VIF check: the predictors kept in the reduced model,
# plus an explicit intercept column added by add_constant.
df_vif = add_constant(
    df.drop(columns=['price', 'address', 'avg_area_number_of_bedrooms'])
)

# Print each column's variance inflation factor; vif() takes the raw value
# matrix and the positional index of the column being assessed.
for position, feature in enumerate(df_vif.columns):
    print(feature, vif(df_vif.values, position))


In [0]:
## none of the features returned a VIF > 5 so we'll keep all of them.

In [0]:
## train a model using sklearn ##
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error


# Hold out 25% of the rows for evaluation and fit on the remaining 75%.
# test_size is the fraction assigned to the TEST split; the previous value of
# 0.75 left only a quarter of the data for training.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[column_list], df['price'], test_size=0.25, random_state=0
)

# Fit a linear regression on the training split.
clf = LinearRegression()
clf.fit(X_train, y_train)

# LinearRegression.score returns the coefficient of determination (R^2);
# comparing train vs. test values shows how well the fit generalizes.
coeff_of_det_tr = clf.score(X_train, y_train)
coeff_of_det_tst = clf.score(X_test, y_test)

print(f"COD train: {coeff_of_det_tr}")
print(f"COD test: {coeff_of_det_tst}")

# Error metrics on the held-out rows; RMSE is the square root of MSE and is
# therefore in the same units as price.
preds = clf.predict(X_test)
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
rmse = mse**0.5

print("Metrics", mae, mse, rmse)
