## Regression Project

In [None]:
import pandas as pd

### Define Dataset

In [None]:
# Load the housing dataset from the CSV file in the working directory.
df = pd.read_csv("housePrice.csv")
# Preview the first three rows to check how the columns were parsed.
df.head(3)

In [None]:
# Print the columns with their dtypes and non-null counts to spot missing values.
df.info()

In [None]:
# Drop every row that has a missing value in any column.
df = df.dropna()

### Preprocessing

#### On **Area** Feature

In [None]:
# Parse "Area" as numbers; entries that cannot be parsed become NaN (errors="coerce").
df["Area"] = pd.to_numeric(df["Area"], errors="coerce")
# Remove the rows whose "Area" could not be parsed (dropna also removes any other row with a missing value).
df = df.dropna()

In [None]:
import matplotlib.pyplot as plt

# Scatter plot of price against area, to inspect their relationship visually.
fig, ax = plt.subplots()
ax.scatter(df["Area"], df["Price(USD)"], color="green")
ax.set_xlabel("Area")
ax.set_ylabel("Price")
plt.show()

In [None]:
# Drop outlier rows by "Area": any value above Q3 + 6.2 * IQR is treated as an outlier.
# NOTE(review): the original comments mention a cutoff of 400, but the cutoff is
# computed from the quartiles below; confirm it lands near 400 for this dataset.
import numpy as np

Q1 = df["Area"].quantile(0.25)
Q3 = df["Area"].quantile(0.75)
IQR = Q3 - Q1
upper_bound = Q3 + 6.2 * IQR

# Keep "Area" where it is within the bound and replace it with NaN otherwise,
# so that the dropna() below removes the outlier rows.
df["Area"] = df["Area"].where(df["Area"] <= upper_bound, np.nan)
df = df.dropna()

# Show the remaining row count and dtypes after filtering.
df.info()

In [None]:
# Scatter plot of price against area for the current (outlier-filtered) frame.
fig, ax = plt.subplots()
ax.scatter(df["Area"], df["Price(USD)"], color="green")
ax.set_xlabel("Area")
ax.set_ylabel("Price")
plt.show()

#### On **Room** Feature

In [None]:
# Count the rows for each distinct "Room" value, ordered by the room value itself.
room_counter = df["Room"].value_counts().sort_index()
room_counter

In [None]:
# Bar chart of the row count per "Room" value, coloured from the list below.
colors = ["purple", "blue", "green", "yellow", "orange", "red"]
fig, ax = plt.subplots()
ax.bar(room_counter.index, room_counter.values, color=colors)
ax.set_xlabel("room".title())
ax.set_ylabel("count".title())
plt.show()

#### On **Parking**, **Warehouse**, **Elevator** Columns

In [None]:
# Cast the three flag columns to integers (the original note says they hold
# booleans, so True/False become 1/0).
for flag_column in ("Parking", "Warehouse", "Elevator"):
    df[flag_column] = df[flag_column].astype(int)

# Count the missing values left in each column.
df.isna().sum()

#### On **Address** Column

In [None]:
# List the distinct "Address" values with their per-column non-null counts, to help
# choose an encoding (Label Encoding, One-Hot Encoding or Target Encoding).
df.groupby("Address").count()

In [None]:
# Split the rows into train and test: draw one uniform random number per row and
# send the rows whose draw is below 0.8 (roughly 80%) to train, the rest to test.
msk = np.random.rand(df.shape[0]) < 0.8
train = df.loc[msk]
test = df.loc[~msk]

train.shape, test.shape

In [None]:
# Target-encode "Address": replace each address with the mean train price for it.
# Work on copies so the column assignments below do not raise pandas' copy warning.
train, test = train.copy(), test.copy()

# The per-address means come from the train split only and are reused for test.
mean_prices = train.groupby("Address")["Price(USD)"].mean()
for split in (train, test):
    split["Address_Encoded"] = split["Address"].map(mean_prices)
# Test rows whose address does not occur in train get NaN from the map; drop them.
test = test.dropna()

# Overall mean price of the train split.
global_price = train["Price(USD)"].mean()

# "Address" is no longer needed once it has been encoded.
train = train.drop(columns="Address")
test = test.drop(columns="Address")
train.head(0)

In [None]:
# Scatter plot of price against the encoded address, train first and test second
# (each call takes the next colour of the default cycle).
fig, ax = plt.subplots()
ax.scatter(train["Address_Encoded"], train["Price(USD)"])
ax.scatter(test["Address_Encoded"], test["Price(USD)"])
ax.set_xlabel("Address")
ax.set_ylabel("Price")
plt.show()

In [None]:
# Correlation heatmap of the numeric columns. Only the train split is used, so
# the test data does not influence the feature selection.
import seaborn as sns

corr = train.corr(numeric_only=True)
fig, ax = plt.subplots()
sns.heatmap(corr, annot=True, ax=ax)
plt.show()

In [None]:
# Build the model inputs as NumPy arrays. Per the original note, these features
# were picked for their high correlation with the price.
feature_columns = ["Area", "Room", "Parking", "Address_Encoded"]
target_columns = ["Price(USD)"]  # a one-column list keeps the targets two-dimensional

x_train = np.asanyarray(train[feature_columns])
x_test = np.asanyarray(test[feature_columns])
y_train = np.asanyarray(train[target_columns])
y_test = np.asanyarray(test[target_columns])

In [None]:
# Standardise features and targets: each scaler is fitted on the train split only
# and then applied to the test split with the train statistics.
from sklearn.preprocessing import StandardScaler

# copy=False is kept so the scalers behave as before for any code that relies on
# in-place scaling.
x_scaler = StandardScaler(copy=False, with_mean=True, with_std=True)
y_scaler = StandardScaler(copy=False, with_mean=True, with_std=True)

# Bind the returned arrays instead of relying on in-place mutation: copy=False
# only *tries* to scale in place and may still return a new array, in which case
# discarding the return value would silently leave the data unscaled.
x_train = x_scaler.fit_transform(x_train)
x_test = x_scaler.transform(x_test)
y_train = y_scaler.fit_transform(y_train)
y_test = y_scaler.transform(y_test)

## Polynomial Regression

### Model Definition

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the features to degree-2 polynomial terms. The expansion is fitted on
# the train features and then applied unchanged to the test features.
poly = PolynomialFeatures(degree=2).fit(x_train)
x_train_poly, x_test_poly = poly.transform(x_train), poly.transform(x_test)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary linear regression on the polynomial-expanded train features.
model = LinearRegression().fit(x_train_poly, y_train)
model

### Model Evaluation

In [None]:
# Predict the targets for the polynomial-expanded test features.
y_pred = model.predict(x_test_poly)

In [None]:
from sklearn.metrics import r2_score

# Residuals of the test predictions.
# NOTE(review): y_test appears to be standardised by the scaling cell, so these
# errors would be in standardised units rather than USD; confirm.
errors = y_test - y_pred
mse = np.mean(np.square(errors))
mae = np.mean(np.abs(errors))
r2 = r2_score(y_test, y_pred)

print(f"mean squared error : {mse} | mean absolute error : {mae} | r2 score : {r2}".title())

In [None]:
from tkinter import *
from tkinter import ttk

In [None]:
# I used tkinter to receive the input from the user and to show the result.
def encode_address(address):
    """Return the encoded value for *address*.

    Looks the address up in ``mean_prices`` and falls back to ``global_price``
    when the address is not one of its labels.
    """
    try:
        return mean_prices[address]
    except KeyError:
        return global_price

def predict():
    """Read the form fields, run the fitted model and display the prediction.

    Reads the area, room count, parking choice and address from the widgets,
    applies the same scaling and polynomial expansion used for training,
    converts the prediction back with ``y_scaler`` and writes it to
    ``label_result``. A ``ValueError`` raised anywhere in that pipeline (for
    example a non-numeric area or room count) is reported as invalid input.
    """
    try:
        area = float(entry1.get())
        room = int(entry2.get())
        # Anything other than "Yes" (including an empty selection) counts as no parking.
        parking = 1 if combo_box.get() == "Yes" else 0
        address_encoded = encode_address(entry4.get())

        x = np.asanyarray([[area, room, parking, address_encoded]])
        # Bind the returned arrays: the scalers' copy=False setting does not
        # guarantee in-place scaling, so discarding the result could silently
        # feed unscaled features to the model or show an unscaled prediction.
        x = x_scaler.transform(x)
        x_poly = poly.transform(x)
        y = model.predict(x_poly)
        y = y_scaler.inverse_transform(y)
        label_result.config(text=f"my prediction : {y[0][0]:.3f} $".title())
    except ValueError:
        label_result.config(text="invalid input!".title())


# Build the prediction form: four captioned inputs, a Predict button, a result
# label and a Done button, then hand control to the Tk event loop.
root = Tk()
root.title("Predictor")
root.geometry("350x300")

# Captions. Each widget is created first and placed in a separate statement,
# because place() returns None and chaining it would bind the name to None.
label1 = Label(root, text="House area (m²) : ")  # area, entered as a number
label1.place(x=10, y=20)
label2 = Label(root, text="Number of rooms : ")  # room count, entered as an integer
label2.place(x=10, y=60)
label3 = Label(root, text="Parking available: (Yes / No) : ")  # "Yes" if it has parking, else "No"
label3.place(x=10, y=100)
label4 = Label(root, text="Neighborhood (in Tehran) : ")  # neighbourhood name
label4.place(x=10, y=140)

# Input widgets read by predict().
entry1 = Entry(root, width=21)
entry1.place(x=185, y=20)
entry2 = Entry(root, width=21)
entry2.place(x=185, y=60)
combo_box = ttk.Combobox(root, values=["Yes", "No"], width=18)
combo_box.place(x=185, y=100)
entry4 = Entry(root, width=21)
entry4.place(x=185, y=140)

button1 = Button(root, text="Predict", width=45, command=predict, activebackground="green", activeforeground="red")
button1.place(x=15, y=180)

# predict() writes either the prediction or the error message into this label.
label_result = Label(root, text="result .....", bg="lightgray", font='Helvetica 15 bold')
label_result.place(x=10, y=218)

# "Done" closes the window, which ends the event loop below.
button2 = Button(root, text="Done", command=root.destroy, width=45, activebackground="black", activeforeground="white")
button2.place(x=15, y=260)

root.mainloop()