In [None]:
import pandas as pd #library for working with tabular code("excel in code"). pd is a nickname for
                    #the library. One of the data structures panda has is called a series which uses a 1-d array to store data such as a column from a data sheet.
                    #Another data structure is DataFrame, which stores 2-D table like rows and columns in a variable.
                    #pandas also has many functions for working with tabular data, such as reading/writing CSV, Excel, etc.,
                    #filtering and selecting data, transforming data, merging data, etc.

import numpy as np #library for fast numric computing. In this code, this library is used to square root a variable, when coercing numbers, the bad values
                   #become np.nan, which the imputer in this code will then fill.

#importing functions from the sklearn library
from sklearn.model_selection import train_test_split #function that takes data and splits it between 4 variables such as X_train, X_test, y_train, y_test, using a seed number to randomize
                                                     #the rows reproducibly and splits them so that, for example, 20 percent of the data is used for testing and the remaining 80% is used to train the model. The selection
                                                     #is random, but because the seed is fixed the same split is produced every time the code is run.

from sklearn.linear_model import LinearRegression#class that uses the linear regression model. This model is good for approximating data linearly by training data and creating a linear line   
                                                 #equation (y = mx + b ) with a m and b value that best represents the data.

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score #functions to find an evaluation of how accurate the model is with the tested data.

from sklearn.pipeline import Pipeline #a class from the sklearn library to chain steps into one model. In this code I am preprocessing the data first in the chain then training the model 

from sklearn.impute import SimpleImputer #Since most models cannot handle missing information from the data it is being trained by or NaN, then the imputer can fill these missing pieces of info with
                                         #methods such as filling the missing piece with the median in the column or the most frequent string value and much more other methods.

from sklearn.preprocessing import OneHotEncoder#since linear models need numbers to train off of, when dealing with grouping prices by cities from the data sheet we encode the citys into 0/1 columns,
                                               #so for each city there is a column that is 1 if the row belongs to that city and 0 if it doesn't, allowing the model to train on and differentiate between cities.
                                                
from sklearn.compose import ColumnTransformer #when processing the data in the datasheet, this class allows to apply different preprocessing processes to different columns of data such as data
                                              #that use int and data that use string. 

import joblib#library to save the model as a .pkl file

#LOADING
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

#read the whole CSV into a DataFrame named df (short for "data frame")
df = pd.read_csv("../data/housify-data.csv")

#name of the column the model learns to predict
target = "price"

#keep only the candidate feature columns that are really present in the CSV, so the
#preprocessing below never asks for a column that does not exist
num_features = [
    name
    for name in ("lat", "lon", "bedrooms", "bathrooms", "sqft")
    if name in df.columns
]#numeric inputs
cat_features = [name for name in ("city",) if name in df.columns]#categorical inputs


#PREPROCESSING
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

#force every numeric feature column to a numeric dtype; errors="coerce" makes to_numeric() turn any value
#it cannot parse into NaN instead of raising, and those NaN values are filled later by the imputer
for c in num_features:
    df[c] = pd.to_numeric(df[c], errors="coerce")

#store every categorical column (currently only city) with the generic object dtype
for c in cat_features:
    df[c] = df[c].astype(object)

#the target (price) is parsed the same way: anything that is not a valid number becomes NaN
y = pd.to_numeric(df[target], errors="coerce")

#True for the rows whose price is present; a row without a price cannot be used for training or testing
mask = ~y.isna()

#X holds the numeric and categorical feature columns, y holds the prices, both limited to the rows with a valid price
X = df.loc[mask, [*num_features, *cat_features]]
y = y[mask]


#numeric branch: replace every NaN (left behind by the coercion step) with the median of its column
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])

#categorical branch: a missing city becomes the literal "Unknown", then each city is expanded into its own 0/1 column;
#handle_unknown="ignore" means a city that had no column at fit time is encoded as all zeros instead of raising an error
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

#send the numeric columns through the numeric branch and the categorical columns through the categorical branch
preprocess = ColumnTransformer(
    [
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features),
    ],
    remainder="drop",#safety net: any column not listed above is left out of the model input
)

#full model: preprocessing first, then a linear regression fitted on the transformed features
pipe = Pipeline([
    ("preprocess", preprocess),
    ("regressor", LinearRegression()),
])

#split the data into a training part (80%) and a testing part (20%); random_state=42 fixes the seed so the
#same rows land in the same part every time the code is run
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#the next lines handle the case where a city appears in the test rows but not in the training rows:
#such a city is relabelled "Unknown" before prediction.
#The "city" column is only touched when it actually exists, matching the defensive way cat_features is built
#(a hard-coded X_train["city"] raises KeyError when the CSV has no city column).

#cities seen in the training data (empty list when there is no city column)
known_cities = X_train["city"].unique().tolist() if "city" in X_train.columns else []

if "city" in X_test.columns:
    #assign() returns a new DataFrame instead of writing into the frame produced by train_test_split,
    #which avoids pandas' SettingWithCopyWarning for assignments on an indexed subset
    X_test = X_test.assign(
        city=X_test["city"].where(X_test["city"].isin(known_cities), "Unknown")
    )

#train the model: the pipeline first fits the preprocessing on X_train, then fits the regression
pipe.fit(X_train, y_train)

#evaluating the model on the held-out test rows
y_pred = pipe.predict(X_test)#predicted prices for the test features

#MAE: average absolute difference between the actual prices (y_test) and the predicted prices (y_pred)
mae = mean_absolute_error(y_test, y_pred)

#MSE: average squared difference between actual and predicted prices; squaring makes it sensitive to outliers
mse = mean_squared_error(y_test, y_pred)

#R²: fraction of the variation in y that the model explains from X; higher means more variation is explained
r2 = r2_score(y_test, y_pred)

#RMSE: square root of the MSE, which brings the error back to the same scale as the prices so it is
#easier to read, while large errors still weigh more heavily than they do in the MAE
rmse = np.sqrt(mse)

#printing results, one metric per line
print(
    "🔹 Linear Regression (Pipeline) Results",
    f"MAE:  ${mae:,.0f}",
    f"RMSE: ${rmse:,.0f}",
    f"R²:   {r2:.3f}",
    sep="\n",
)

#saving the fitted pipeline (preprocessing + regressor) as a .pkl file using the joblib library
import os#imported here because it is only needed to prepare the output folder

model_path = "../backend/models/linear_regression_model.pkl"

#joblib.dump does not create missing folders, so make sure the target directory exists first;
#exist_ok=True makes this a no-op when the folder is already there
os.makedirs(os.path.dirname(model_path), exist_ok=True)

joblib.dump(pipe, model_path)


🔹 Linear Regression (Pipeline) Results
MAE:  $519,045
RMSE: $1,142,393
R²:   0.356


['../models/linear_regression_model.pkl']

In [1]:
import sys
#print the path of the Python interpreter running this cell; handy for checking which environment the notebook kernel uses
print(sys.executable)

c:\Users\chera\Documents\WEB DEV + ML\Housonify\venv\Scripts\python.exe
