In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml

In [None]:
# Location of the raw house-pricing CSV.
url = 'https://raw.githubusercontent.com/digipodium/Datasets/main/regression/house_pricing.csv'

# Read the CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=5)

feature analysis and selection

In [None]:
# Scatter plot of Price against SquareFeet to eyeball how area relates to price.
plt.scatter(x=df['SquareFeet'], y=df['Price'])

check correlation between features and target when using linear models
- features are numerical
- target is numerical

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a one-feature linear model of Price on SquareFeet.
# The double brackets keep the feature input two-dimensional (a DataFrame).
model = LinearRegression().fit(df[['SquareFeet']], df['Price'])
pred = model.predict(df[['SquareFeet']])

# Observed points, with the fitted line drawn over them in red.
plt.scatter(df['SquareFeet'], df['Price'])
plt.plot(df['SquareFeet'], pred, color='red')

# Correlation coefficient between area and price.
pc = df['SquareFeet'].corr(df['Price'])
print('Pearson Correlation:', pc)

In [None]:
# Correlation of every numeric column with the target column Price.
(
    df
    .select_dtypes(include='number')
    .corrwith(df['Price'])
)

pearson correlation coefficient
- if value close to 1, strong positive correlation
- if value close to -1, strong negative correlation
- if value close to 0, no linear correlation (a non-linear relationship may still exist)


In [None]:
# Names of the numeric columns in the dataset.
(
    df
    .select_dtypes(include=['number'])
    .columns
)

In [None]:
# For each numeric feature, draw it against Price together with a
# single-feature linear fit, one labelled figure per feature.
num_cols = df.select_dtypes(include='number').columns
for col in num_cols:
    if col == 'Price':
        # Regressing the target on itself only produces a trivial perfect fit.
        continue

    model = LinearRegression()
    model.fit(df[[col]], df.Price)

    # Explicit figure/axes so each plot carries its own title and axis labels.
    fig, ax = plt.subplots()
    ax.scatter(df[col], df.Price)
    ax.plot(df[col], model.predict(df[[col]]), color='red')
    ax.set(title=col, xlabel=col, ylabel='Price')
    plt.show()
    # Release the figure so looping over many columns does not accumulate them.
    plt.close(fig)

- for categorical independent variables with numerical target
    - boxplot analysis (visual)
    - anova test
        - null hypothesis: means of the groups are equal
        - if p-value < 0.05, reject null hypothesis
        - if p-value > 0.05, fail to reject null hypothesis

In [None]:
# Box plot of Price grouped by each candidate categorical feature.
cat_cols = ['Beds', 'Baths', 'City', 'Type']
for feature in cat_cols:
    df.boxplot(column='Price', by=feature, grid=False)
    # Replace the automatically generated axes title with the feature name.
    plt.title(label=feature)

# testing with anova
- for each feature, calculate the f-statistic and p-value
- if p-value < 0.05, reject null hypothesis (group means differ, so the column is likely a useful predictor)
- if p-value >= 0.05, fail to reject null hypothesis (no evidence that group means differ, so the column is likely not useful)

In [48]:
# Columns kept as model inputs.
selected_cols = ['SquareFeet', 'Beds', 'Baths', 'Type']

# Target series and feature table.
y = df['Price']
X = df.loc[:, selected_cols]

In [52]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [57]:
# Preview the first five rows of the feature table.
X.head(n=5)

Unnamed: 0,SquareFeet,Beds,Baths,Type
0,836,2,1,Residential
1,1167,3,1,Residential
2,796,2,1,Residential
3,852,2,1,Residential
4,797,2,1,Residential


In [56]:
# Route columns by dtype: numeric columns are standardised, object (string)
# columns are one-hot encoded.
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(include='object').columns

num_pipe = Pipeline([
    ('scaler', StandardScaler())
])
cat_pipe = Pipeline([
    # handle_unknown='ignore': a category that shows up only in held-out data
    # is encoded as all zeros instead of raising an error at predict time.
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
])

# Full estimator: preprocessing followed by a linear regression.
model = Pipeline([
    ('pre', preprocessor),
    ('lr', LinearRegression())
])
model

In [59]:
# Hold out 20% of the rows for testing. The fixed random_state makes the
# split, and every metric computed from it, reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training rows only, then predict on both splits for evaluation.
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [61]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Report the same three regression metrics for the training and test splits,
# with a dashed rule between the two sections.
for position, (heading, actual, predicted) in enumerate([
    ("Training Results", y_train, y_train_pred),
    ("Testing Results", y_test, y_test_pred),
]):
    if position > 0:
        print('-'*50)
    print(heading)
    print("MSE:", mean_squared_error(actual, predicted))
    print("MAE:", mean_absolute_error(actual, predicted))
    print("R2:", r2_score(actual, predicted))

Training Results
MSE: 2847846914.371315
MAE: 26742.352927850403
R2: 0.7063148070699097
--------------------------------------------------
Testing Results
MSE: 1598312654.8771522
MAE: 27451.81035815512
R2: 0.8167434436906453
