In [69]:
# Import dependencies
from matplotlib import pyplot as plt
from scipy.stats import linregress
import numpy as np
from sklearn import datasets
import pandas as pd
import seaborn as sns
import plotly.express as px
from pandas.plotting import scatter_matrix
import pgeocode


In [70]:
# Import the data from the CSV file
mls_df = pd.read_csv("Clean_Data/mls_cleaned.csv")
mls_df.head()

Unnamed: 0,MLS #,City,Zip,Bedrooms,Total Baths,SqFt,Acres,Year Built,List Date,Closing Date,List Price,Sold Price,Days on Market,Over Asking,Lat,Lon,Cluster
0,2442914,Chapel Hill,27516,3,2,1782,1,2013,2022-04-16,2022-06-30,1000000,1500000,75,500000,35.9162,-79.0999,1
1,2437799,Chapel Hill,27516,4,4,3157,1,2013,2022-03-21,2022-04-05,1600000,2000000,15,400000,35.9162,-79.0999,1
2,2498024,Chapel Hill,27516,3,3,2183,1,2013,2023-03-05,2023-03-24,1400000,1650000,19,250000,35.9162,-79.0999,1
3,2444529,Chapel Hill,27516,2,2,1128,5,2013,2022-04-25,2022-05-24,589900,755000,29,165100,35.9162,-79.0999,1
4,2443429,Raleigh,27612,2,2,1745,1,2006,2022-04-19,2022-04-29,450000,565000,10,115000,35.852,-78.6841,0


In [71]:
#List all the values in the Property Type column
mls_df['Property Type'].value_counts()

KeyError: 'Property Type'

In [None]:
# Copy the data frame to a new data frame
condo_df = mls_df.copy()

# Keep only the 'Single Family Residence' values in column 'Property Type'
condo_df = condo_df[condo_df['Property Type'] == 'Condo']

# Drop the 'Property Type' column
condo_df = condo_df.drop(columns=['Property Type'])


condo_df.head()

In [None]:
condo_df.describe()

In [None]:
condo_df.info()

In [None]:
#Create a copy of the data frame and drop the non-numeric columns
condo_nostr_df = condo_df.copy()
condo_nostr_df = condo_nostr_df.drop(columns=['City'])
condo_nostr_df = condo_nostr_df.drop(columns=['List Date'])
condo_nostr_df = condo_nostr_df.drop(columns=['Closing Date'])
condo_nostr_df = condo_nostr_df.drop(columns=['MLS #'])

#Create a correlation matrix
corr_matrix = condo_nostr_df.corr()
corr_matrix["Sold Price"].sort_values(ascending=False)

In [None]:
#Visualize the scatter matrix
scatter_columns = ["Sold Price", "SqFt", "Total Baths", "Bedrooms", 'Lat', 'Lon']
scatter_matrix(condo_nostr_df[scatter_columns], figsize=(12, 8))
#plt.savefig('matrix.png')

In [None]:
#Visualize the scatter plot for Total Living Area SqFt vs Sold Price
condo_nostr_df.plot(kind="scatter", x="SqFt", y="Sold Price", alpha=0.5)
#plt.savefig('scatter.png')

In [None]:
#Create a new column for the price per square foot
condo_nostr_df['Price per SqFt'] = condo_nostr_df['Sold Price'] / condo_nostr_df['SqFt']

#Create a correlation matrix to see what the correlation is for Price per SqFt
corr_matrix = condo_nostr_df.corr()
corr_matrix["Sold Price"].sort_values(ascending=False)

In [None]:
#Check the orginal data frame to see how many cities are in the data
len(condo_df['City'].value_counts())

In [None]:
#Cluster the Lat and  Lon into groups
from sklearn.cluster import KMeans
condo_df['Cluster'] = KMeans(n_clusters=5).fit_predict(condo_df[['Lat', 'Lon']])
condo_df.plot(kind="scatter", x="Lon", y="Lat", c="Cluster", cmap='viridis', colorbar=False, alpha=0.5)
plt.title('Clustered Lat and Lon')
plt.savefig('clustered.png')


In [None]:
#Create a variable for the top 10 cities with the highest average sold price
top10 = condo_df.groupby('City')['Sold Price'].mean().nlargest(373).reset_index()

#Plot the top 10 cities with the highest average sold price
fig = px.bar(top10, x='City', y='Sold Price',color='City', template='plotly',
             title='Cities with Highest Avg. Sold Price') 
fig.update_traces(textposition='outside')
fig.show()

In [None]:
top10 = condo_df.groupby('City')['Total Baths'].mean().nlargest(10).reset_index()

fig = px.bar(top10, x='City', y='Total Baths',color='Total Baths', template='plotly', 
             title='Top 10 Cities with the Most Avg Total Baths')
fig.update_traces(texttemplate='%{y:.}', textposition='outside')
fig.show()

In [None]:
df_mean = condo_df.groupby('Over Asking')['SqFt'].mean().reset_index()

fig = px.scatter(df_mean, x='Over Asking', y='SqFt', trendline='ols', 
                 labels={'Over Asking':'Over Asking', 'SqFt':'Mean House Size'})
fig.update_layout(title='Distribution of Mean House Size by Over Asking')
fig.show()

In [None]:
fig = px.scatter(condo_df, x='SqFt', y='Over Asking', color='Bedrooms', size='Total Baths',trendline='ols')
fig.update_layout(title='House Size vs Over Asking',
                  xaxis_title='House Size',
                  yaxis_title='Over Asking')
fig.show()

In [None]:
condo_df.hist(bins=5, figsize=(20,15))
#plt.savefig("attribute_histogram_plots")
plt.show()

In [None]:
condo_df.head()

In [None]:
# Variable with the Longitude and Latitude
X=condo_df.loc[:,['MLS #','Lat','Lon']]
X.head(10)

In [None]:
K_clusters = range(1,10)
kmeans = [KMeans(n_clusters=i) for i in K_clusters]
Y_axis = condo_df[['Lat']]
X_axis = condo_df[['Lon']]
score = [kmeans[i].fit(Y_axis).score(Y_axis) for i in range(len(kmeans))]
# Visualize
plt.plot(K_clusters, score)
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
plt.title('Elbow Curve')
plt.show()

In [None]:
kmeans = KMeans(n_clusters = 4, init ='k-means++')
kmeans.fit(X[X.columns[1:3]]) # Compute k-means clustering.
X['cluster_label'] = kmeans.fit_predict(X[X.columns[1:3]])
centers = kmeans.cluster_centers_ # Coordinates of cluster centers.
labels = kmeans.predict(X[X.columns[1:3]]) # Labels of each point
X.head(10)

In [None]:
X.plot.scatter(x = 'Lat', y = 'Lon', c=labels, s=50, cmap='viridis')
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)

In [None]:
#Visualize the distribution of each numerical feature using box plots. Points beyond the “whiskers” of the box plot can be considered potential outliers.  
plt.figure(figsize=(8, 6)) 
sns.boxplot(x=condo_df['SqFt'], color='lightgreen')

In [None]:
plt.figure(figsize=(8, 6)) 
sns.boxplot(x=condo_df['Bedrooms'], color='lightgreen')

In [None]:
plt.figure(figsize=(8, 6)) 
sns.boxplot(x=condo_df['Total Baths'], color='lightgreen')

In [None]:
plt.figure(figsize=(8, 6)) 
sns.boxplot(x=condo_df['Acres'], color='lightgreen')

In [None]:
plt.figure(figsize=(8, 6)) 
sns.boxplot(x=condo_df['Days on Market'], color='lightgreen')

In [None]:
plt.figure(figsize=(8, 6)) 
sns.boxplot(x=condo_df['Year Built'], color='lightgreen')

In [None]:
#Find the row with the value of 44 in the 'Bedrooms' column
condo_df.loc[condo_df['Bedrooms'] == 44]

#Drop the row with the value of 44 in the 'Bedrooms' column
condo_df = condo_df[condo_df['Bedrooms'] != 44]


In [None]:
#Find the row with the most 'Total Baths'
max_idx = condo_df['Bedrooms'].idxmax()
row_with_max_total_baths = condo_df.loc[max_idx]

row_with_max_total_baths

In [None]:
#Drop the row with the value of 44 in the 'Bedrooms' column
condo_df = condo_df[condo_df['Bedrooms'] != 43]
condo_df = condo_df[condo_df['Bedrooms'] != 0]


In [None]:
#Find the row with the most 'Total Baths'
max_idx = condo_df['Bedrooms'].idxmax()
row_with_max_total_baths = condo_df.loc[max_idx]

row_with_max_total_baths

In [None]:
#Find the row with the most 'Total Baths'
max_idx = condo_df['Total Baths'].idxmax()
row_with_max_total_baths = condo_df.loc[max_idx]

row_with_max_total_baths

In [None]:
#Drop the row with the value of 34 in the 'Total Baths' column
condo_df = condo_df[condo_df['Total Baths'] != 34]
condo_df = condo_df[condo_df['Total Baths'] != 0]

In [None]:
#Find the row with the most 'Total Baths'
max_idx = condo_df['Total Baths'].idxmax()
row_with_max_total_baths = condo_df.loc[max_idx]

row_with_max_total_baths

In [None]:
#Drop the row with the value of 34 in the 'Total Baths' column
condo_df = condo_df[condo_df['Total Baths'] != 32]


In [None]:
#Find the row with the most 'Total Baths'
max_idx = condo_df['Total Baths'].idxmax()
row_with_max_total_baths = condo_df.loc[max_idx]

row_with_max_total_baths

In [None]:
#Drop the row with the value of 26 in the 'Total Baths' column
condo_df = condo_df[condo_df['Total Baths'] != 26]

In [None]:
#Find the row with the most 'Total Baths'
max_idx = condo_df['Total Baths'].idxmax()
row_with_max_total_baths = condo_df.loc[max_idx]

row_with_max_total_baths

In [None]:
#Drop the row with the value of 26 in the 'Total Baths' column
condo_df = condo_df[condo_df['Total Baths'] != 23]

In [None]:
#Find the row with the most 'Total Baths'
max_idx = condo_df['Total Baths'].idxmax()
row_with_max_total_baths = condo_df.loc[max_idx]

row_with_max_total_baths

In [None]:
#Find the row with the most 'Total Baths'
max_idx = condo_df['Year Built'].idxmax()
row_with_max_total_baths = condo_df.loc[max_idx]

row_with_max_total_baths

In [None]:
condo_df.sort_values('Year Built', ascending=True)

In [None]:
condo_df = condo_df[condo_df['Year Built'] != 0]
condo_df = condo_df[condo_df['Year Built'] != 1]
condo_df = condo_df[condo_df['Year Built'] != 3]
condo_df = condo_df[condo_df['Year Built'] != 97]
condo_df = condo_df[condo_df['Year Built'] != 202]

condo_df.sort_values('Year Built', ascending=True)

In [None]:
condo_df.sort_values('Year Built', ascending=False)

In [None]:
# Edit the first occurrence of '3837737' in the 'List Price' column to '383737'
index = (condo_df['Year Built'] == 20223).idxmax()
condo_df.loc[index, 'Year Built'] = 2023

In [None]:
condo_df.sort_values('Year Built', ascending=False)

In [None]:
index = (condo_df['Year Built'] == 20211).idxmax()
condo_df.loc[index, 'Year Built'] = 2021

In [None]:
condo_df.sort_values('Year Built', ascending=False)

In [None]:
index = (condo_df['Year Built'] == 20116).idxmax()
condo_df.loc[index, 'Year Built'] = 2016

In [None]:
condo_df.sort_values('Year Built', ascending=False)

In [None]:
index = (condo_df['Year Built'] == 20023).idxmax()
condo_df.loc[index, 'Year Built'] = 2023

In [None]:
condo_df.sort_values('Year Built', ascending=False)

In [None]:
index = (condo_df['Year Built'] == 20006).idxmax()
condo_df.loc[index, 'Year Built'] = 2006

In [None]:
condo_df.sort_values('Year Built', ascending=False)


In [None]:
index = (condo_df['Year Built'] == 19496).idxmax()
condo_df.loc[index, 'Year Built'] = 1949

In [None]:
condo_df.sort_values('Year Built', ascending=False)


In [None]:
condo_df = condo_df[condo_df['Year Built'] != 3023]
condo_df = condo_df[condo_df['Year Built'] != 2083]
condo_df = condo_df[condo_df['Year Built'] != 2121]



#Remove all the rows where the 'SqFt' is less than 100
condo_df = condo_df[condo_df['SqFt'] > 199]

condo_df.sort_values('Year Built', ascending=False)


In [None]:
condo_df.sort_values('Year Built', ascending=True)


In [None]:
condo_df = condo_df[condo_df['Year Built'] != 1073]
condo_df = condo_df[condo_df['Year Built'] != 1577]
condo_df = condo_df[condo_df['Year Built'] != 1647]

In [None]:
condo_df.sort_values('Year Built', ascending=False)


In [None]:
#Add the Price per SqFt to this data frame
condo_df["Price per SqFt"] = condo_df["Sold Price"]/condo_df["SqFt"]
condo_df.head()

In [None]:
# Save the DataFrame to a CSV file
condo_df.to_csv('Clean_Data/mls_condo_cleaned_2.csv', index=False)

In [None]:
condo_df.sort_values('Zip', ascending=False)
