In [196]:
import pandas as pd
from pandas import DataFrame

dataset: DataFrame = pd.read_csv("data/house_pricing_data.csv")

X_train: DataFrame = dataset.drop("PRICE", axis=1)
y_train: DataFrame = dataset["PRICE"]

X_test: DataFrame = pd.read_csv("data/testing_data.csv")
y_test: DataFrame = pd.read_csv("data/testing_prices.csv")

In [197]:
X_train

Unnamed: 0.1,Unnamed: 0,TYPE,BEDS,BATH,PROPERTYSQFT,SUBLOCALITY
0,0,Condo,2,2.0,1400.000000,Manhattan
1,9,Co-op,2,1.0,978.000000,Brooklyn
2,12,Co-op,3,1.0,2184.207862,Queens
3,13,Co-op,2,2.0,2184.207862,Queens
4,15,Condo,2,2.0,1000.000000,Staten Island
...,...,...,...,...,...,...
795,3859,Condo,3,2.0,1709.000000,Brooklyn
796,3860,Condo,3,1.0,2184.207862,Queens
797,3861,Co-op,1,1.0,2184.207862,Queens
798,3864,Condo,3,1.0,393.000000,Brooklyn


In [198]:
y_train

0       315000
1       440000
2       259000
3       430000
4       549000
        ...   
795    2950000
796     358000
797     189999
798     380000
799     380000
Name: PRICE, Length: 800, dtype: int64

In [199]:

from sklearn.preprocessing import LabelEncoder

le: LabelEncoder = LabelEncoder()

columns = ["TYPE", "SUBLOCALITY"]

for column in columns:
    X_train[column] = le.fit_transform(X_train[column])
    X_test[column] = le.fit_transform(X_test[column])

X_train

Unnamed: 0.1,Unnamed: 0,TYPE,BEDS,BATH,PROPERTYSQFT,SUBLOCALITY
0,0,1,2,2.0,1400.000000,1
1,9,0,2,1.0,978.000000,0
2,12,0,3,1.0,2184.207862,2
3,13,0,2,2.0,2184.207862,2
4,15,1,2,2.0,1000.000000,3
...,...,...,...,...,...,...
795,3859,1,3,2.0,1709.000000,0
796,3860,1,3,1.0,2184.207862,2
797,3861,0,1,1.0,2184.207862,2
798,3864,1,3,1.0,393.000000,0


In [200]:
from numpy import ndarray
# Build a regressive model to predict the price of a house
# based on the features in the dataset
from sklearn.linear_model import LinearRegression

model: LinearRegression = LinearRegression()

model.fit(X_train, y_train)

predictions: ndarray = model.predict(X_test)

predictions: DataFrame = DataFrame(predictions, columns=["PRICE"])

In [201]:
# Let's compare the predictions with the actual prices using MAPE
from sklearn.metrics import mean_absolute_percentage_error

mape: float = mean_absolute_percentage_error(y_test["PRICE"], predictions["PRICE"])

mape

0.6449297918274284

In [202]:
# In order to be compatible with Kaggle, our output requires "Id" and "Usage" columns.
# So, we'll need to make "Usage" = "Prices" and "Id" = the index of the testing data
output: DataFrame = DataFrame({"ID": predictions.index, "Usage": predictions["PRICE"]})

output


Unnamed: 0,ID,Usage
0,0,5.344898e+05
1,1,1.135791e+06
2,2,9.646607e+05
3,3,1.934587e+06
4,4,4.530639e+05
...,...,...
218,218,6.974491e+05
219,219,1.302578e+06
220,220,7.125227e+05
221,221,4.588214e+05


In [203]:
output.to_csv("data/output.csv")

In [204]:
# Now, we need to format `y_test` to be compatible with Kaggle
y_test = DataFrame({"ID": y_test.index, "Usage": y_test["PRICE"]})

y_test.to_csv("data/y_test.csv", index=False)

y_test

Unnamed: 0,ID,Usage
0,0,379000
1,1,649000
2,2,475000
3,3,3995000
4,4,579000
...,...,...
218,218,370000
219,219,2500000
220,220,285000
221,221,598125
