Data Science PROJECT

BCS-7A

Predicting House Prices in the USA Based on Real Estate Data

GROUP 3

Faraz Majid 20L-1162

Aemon Fatima 20L-1057

Ahmad Abdullah Dhami 20L-1226

### Data Collection:

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw listings and report the shape exactly as loaded from disk
# (i.e. before the first column is discarded below).
housePrice = pd.read_csv('./raw_data.csv')
print(housePrice.shape)

# The first column is removed by position.
# NOTE(review): presumably a leftover saved index ("Unnamed: 0") - confirm
# against raw_data.csv.
leading_column = housePrice.columns[0]
housePrice = housePrice.drop(columns=leading_column)

housePrice.head()

(5000, 19)


Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-06-27 00:00:00,300000.0,3.0,1.5,1590,8911,1.0,0,0,3,1590,0,1956,2001,6615 NE 154th St,Kenmore,WA 98028,USA
1,2014-07-09 00:00:00,734950.0,4.0,3.25,4280,47179,2.0,0,0,3,3050,1230,2002,0,33518 161st Ln SE,Auburn,WA 98092,USA
2,2014-06-04 00:00:00,539000.0,3.0,2.5,1710,2300,2.0,0,0,3,1570,140,2005,0,7874 148th Ct NE,Redmond,WA 98052,USA
3,2014-07-08 00:00:00,232000.0,3.0,1.5,1460,15000,1.0,0,0,3,1460,0,1966,1963,31605-31625 51st Ave S,Auburn,WA 98001,USA
4,2014-05-28 00:00:00,600000.0,4.0,2.0,2510,38141,1.0,0,0,3,2510,0,1960,2012,13507 62nd Ave NE,Kirkland,WA 98034,USA


In [3]:
# Tally the missing entries column by column.
na_values = housePrice.isnull().sum(axis=0)

# Show the per-column tally under a heading.
print("NA Values Count:")
print(na_values)

NA Values Count:
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
street           0
city             0
statezip         0
country          0
dtype: int64


### Data Preprocessing:

In [4]:
# Number of distinct values per column; used below to spot constant columns
# (a count of 1) and near-identifier columns (a count close to the row count).
housePrice.nunique()

date               70
price            1899
bedrooms           10
bathrooms          26
sqft_living       566
sqft_lot         3113
floors              6
waterfront          2
view                5
condition           5
sqft_above        511
sqft_basement     207
yr_built          115
yr_renovated       60
street           4525
city               44
statezip           77
country             1
dtype: int64

In [5]:
"""
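    -> We don't need country: every row has the same value (USA).
    -> The date column does not provide any useful information.
    -> Street is nearly unique for every house (4525 distinct values in
       5000 rows), so it is not useful for predicting price.
    -> statezip is dropped as well (NOTE: the reason is not recorded here;
       presumably city already captures the location - confirm).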
"""
# Discard the four columns that are not used as features.
housePrice = housePrice.drop(columns=['country', 'date', 'street', 'statezip'])

In [6]:
"""
    To take city-specific price trends into account, we encode city
    as a categorical feature.

    Since there is no inherent ordinal relationship between the
    categories (cities, in this case), we use one-hot encoding.
"""
# Replace the city column with one indicator column per city, each named
# "city_<name>"; drop_first=True leaves out the first category's indicator.
housePrice = pd.get_dummies(
    data=housePrice,
    columns=['city'],
    prefix='city',
    drop_first=True,
)

In [7]:
# Pairwise correlation of every column, keeping only the column that
# relates each feature to the target "price".
correlation_with_price = housePrice.corr().loc[:, 'price']

# Strongest positive correlation first.
sorted_correlation_with_price = correlation_with_price.sort_values(ascending=False)

# Heading, then the top 15 entries (the first one is price against itself).
print("Correlation with 'price' (Descending Order):")
sorted_correlation_with_price.iloc[:15]

Correlation with 'price' (Descending Order):


price                 1.000000
sqft_living           0.438167
sqft_above            0.372277
bathrooms             0.333876
view                  0.232300
sqft_basement         0.216354
bedrooms              0.200385
floors                0.156114
city_Mercer Island    0.143157
city_Medina           0.137532
city_Bellevue         0.135997
waterfront            0.129481
city_Clyde Hill       0.066019
sqft_lot              0.051770
city_Sammamish        0.048874
Name: price, dtype: float64

In [8]:
# Persist the preprocessed frame.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra unnamed leading column; anything that reads
# preprocessed.csv has to account for that column.
housePrice.to_csv('preprocessed.csv')

# Replace the index with a fresh 0..n-1 range (the old index is discarded),
# then report the final shape and preview the first ten rows.
housePrice = housePrice.reset_index(drop=True)
print(housePrice.shape)
housePrice.head(n=10)

(5000, 56)


Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,...,city_SeaTac,city_Seattle,city_Shoreline,city_Skykomish,city_Snoqualmie,city_Snoqualmie Pass,city_Tukwila,city_Vashon,city_Woodinville,city_Yarrow Point
0,300000.0,3.0,1.5,1590,8911,1.0,0,0,3,1590,...,0,0,0,0,0,0,0,0,0,0
1,734950.0,4.0,3.25,4280,47179,2.0,0,0,3,3050,...,0,0,0,0,0,0,0,0,0,0
2,539000.0,3.0,2.5,1710,2300,2.0,0,0,3,1570,...,0,0,0,0,0,0,0,0,0,0
3,232000.0,3.0,1.5,1460,15000,1.0,0,0,3,1460,...,0,0,0,0,0,0,0,0,0,0
4,600000.0,4.0,2.0,2510,38141,1.0,0,0,3,2510,...,0,0,0,0,0,0,0,0,0,0
5,530000.0,3.0,3.5,2320,3174,2.0,0,0,3,2060,...,0,0,0,0,0,0,0,0,0,0
6,190000.0,5.0,2.0,1750,10284,1.0,0,0,4,1750,...,0,0,0,0,0,0,0,0,0,0
7,295000.0,3.0,1.75,1940,7500,1.5,0,0,4,1940,...,0,0,0,0,0,0,0,0,0,0
8,950000.0,5.0,3.75,5330,6000,2.0,0,2,3,3570,...,0,0,0,0,0,0,0,0,0,0
9,345000.0,3.0,1.75,1990,5650,1.0,0,1,3,1320,...,0,1,0,0,0,0,0,0,0,0
