In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.cluster import KMeans
from datetime import datetime

In [2]:
#Read in the cleaned data
housing_data = pd.read_csv("Clean_Data/mls_condo_cleaned.csv")
housing_data.head()

Unnamed: 0,MLS #,City,Zip,Bedrooms,Total Baths,SqFt,Acres,Year Built,List Date,Closing Date,List Price,Sold Price,Days on Market,Over Asking,Lat,Lon,Cluster
0,2442914,Chapel Hill,27516,3,2,1782,1,2013,2022-04-16,2022-06-30,1000000,1500000,75,500000,35.9162,-79.0999,1
1,2437799,Chapel Hill,27516,4,4,3157,1,2013,2022-03-21,2022-04-05,1600000,2000000,15,400000,35.9162,-79.0999,1
2,2498024,Chapel Hill,27516,3,3,2183,1,2013,2023-03-05,2023-03-24,1400000,1650000,19,250000,35.9162,-79.0999,1
3,2444529,Chapel Hill,27516,2,2,1128,5,2013,2022-04-25,2022-05-24,589900,755000,29,165100,35.9162,-79.0999,1
4,2443429,Raleigh,27612,2,2,1745,1,2006,2022-04-19,2022-04-29,450000,565000,10,115000,35.852,-78.6841,0


In [3]:
# Convert date columns to datetime
housing_data['List Date'] = pd.to_datetime(housing_data['List Date'])
housing_data['Closing Date'] = pd.to_datetime(housing_data['Closing Date'])

In [4]:
# Extracting year and month from List Date and Closing Date
housing_data['List Year'] = housing_data['List Date'].dt.year
housing_data['List Month'] = housing_data['List Date'].dt.month
housing_data['Closing Year'] = housing_data['Closing Date'].dt.year
housing_data['Closing Month'] = housing_data['Closing Date'].dt.month

housing_data.head()

Unnamed: 0,MLS #,City,Zip,Bedrooms,Total Baths,SqFt,Acres,Year Built,List Date,Closing Date,...,Sold Price,Days on Market,Over Asking,Lat,Lon,Cluster,List Year,List Month,Closing Year,Closing Month
0,2442914,Chapel Hill,27516,3,2,1782,1,2013,2022-04-16,2022-06-30,...,1500000,75,500000,35.9162,-79.0999,1,2022,4,2022,6
1,2437799,Chapel Hill,27516,4,4,3157,1,2013,2022-03-21,2022-04-05,...,2000000,15,400000,35.9162,-79.0999,1,2022,3,2022,4
2,2498024,Chapel Hill,27516,3,3,2183,1,2013,2023-03-05,2023-03-24,...,1650000,19,250000,35.9162,-79.0999,1,2023,3,2023,3
3,2444529,Chapel Hill,27516,2,2,1128,5,2013,2022-04-25,2022-05-24,...,755000,29,165100,35.9162,-79.0999,1,2022,4,2022,5
4,2443429,Raleigh,27612,2,2,1745,1,2006,2022-04-19,2022-04-29,...,565000,10,115000,35.852,-78.6841,0,2022,4,2022,4


In [5]:
# Handling outliers
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
    df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])

for col in ['List Price', 'Sold Price', 'SqFt', 'Days on Market', 'Price per SqFt']:
    handle_outliers(housing_data, col)

housing_data.head()

KeyError: 'Price per SqFt'

In [None]:
# Feature Engineering
housing_data['Price Increase Percentage'] = (housing_data['Sold Price'] - housing_data['List Price']) / housing_data['List Price']
housing_data['Age of Property'] = housing_data['Closing Year'] - housing_data['Year Built']

housing_data.head()

In [None]:
# Encoding categorical variables
encoder = OneHotEncoder(drop='first')
encoded_cities = encoder.fit_transform(housing_data[['City']]).toarray()  
encoded_cities_df = pd.DataFrame(encoded_cities, columns=encoder.get_feature_names_out(['City']))

housing_data = pd.concat([housing_data, encoded_cities_df], axis=1)
housing_data.drop('City', axis=1, inplace=True)

housing_data.head()

In [None]:
# Scaling numerical features
scaler = StandardScaler()
numerical_features = ['List Price', 'Sold Price', 'SqFt', 'Acres', 'Days on Market', 'Price per SqFt', 'Price Increase Percentage', 'Age of Property']
housing_data[numerical_features] = scaler.fit_transform(housing_data[numerical_features])

housing_data.head()

In [None]:
#Drop the MLS # Column
housing_data.drop('MLS #', axis=1, inplace=True)

In [None]:
# Clustering based on geographical location
kmeans = KMeans(n_clusters=5, random_state=42)
housing_data['Geo Cluster'] = kmeans.fit_predict(housing_data[['Lat', 'Lon']])

housing_data.head()

In [None]:
# Target variable analysis
plt.figure(figsize=(8, 6))
sns.histplot(housing_data['Sold Price'], kde=True)
plt.title('Distribution of Sold Price')
plt.show()

In [None]:
# Log transformation if skewed
housing_data['Log Sold Price'] = np.log1p(housing_data['Sold Price'])

plt.figure(figsize=(8, 6))
sns.histplot(housing_data['Log Sold Price'], kde=True)
plt.title('Distribution of Log Sold Price')
plt.show()

In [None]:
#Show all the values in the Sold Price column
housing_data['Sold Price']

In [None]:
#Create a correlation matrix to see what the correlation is for Price per SqFt
corr_matrix = housing_data.corr()
corr_matrix["Sold Price"].sort_values(ascending=False)

In [None]:
# Save the DataFrame to a CSV file
housing_data.to_csv('Clean_Data/mls_condo_cleaned_3a.csv', index=False)