In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression

In [None]:
# Load the dataset from 'Housing.csv' (relative path, resolved against the working directory).
housing_df = pd.read_csv('Housing.csv')

In [None]:
# Print a structural summary of the frame: columns, non-null counts and dtypes.
housing_df.info()

In [None]:
# Preview the first rows (head() defaults to five).
housing_df.head()

In [None]:
# Count how many rows carry each distinct value of the 'mainroad' column.
housing_df['mainroad'].value_counts()

In [None]:
### Data cleaning and preprocessing

In [None]:
# Per-column count of missing entries (the author observed none in this dataset).
missing_values = housing_df.isna().sum()
missing_values

In [None]:
# Collect every row that exactly repeats an earlier row.
duplicate_rows = housing_df[housing_df.duplicated()]

# Report the count via string formatting. Using `'text' + DataFrame` would be
# an element-wise operation on the frame, not a printable message.
print(f'Duplicate Rows: {len(duplicate_rows)}')

# Show the offending rows themselves (an empty frame when there are none;
# the author observed no duplicates in this dataset).
duplicate_rows

In [None]:
# Boolean mask: True for each row that repeats an earlier row exactly.
duplicate_values = housing_df.duplicated(keep='first')
# Tally the True/False flags (the author observed no duplicates in this dataset).
duplicate_values.value_counts()

In [None]:
# Columns the author judges to be of little value for this real-estate analysis.
columns_to_drop = ['prefarea', 'furnishingstatus']

# Rebind instead of mutating in place, and tolerate already-missing columns,
# so re-running this cell does not raise KeyError once the columns are gone.
housing_df = housing_df.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Preview the first three rows of the frame after the column drop.
housing_df.head(3)

In [None]:
# Store the listed label columns with pandas' 'category' dtype.
for column_name in ('mainroad', 'airconditioning'):
    housing_df[column_name] = housing_df[column_name].astype('category')

In [None]:
# Data visualization

In [None]:
# Histogram of property prices (20 bins) with a kernel-density overlay.
plt.figure(figsize=(10, 6))
sns.histplot(housing_df['price'], bins=20, kde=True)
plt.title('Price Distribution')
plt.xlabel('Price')
plt.ylabel('Frequency')  # label typo fixed (was 'Frecuency')
plt.show()

In [None]:
# Correlation heatmap.
# Restrict to numeric columns explicitly: the frame still holds non-numeric
# columns (e.g. the category columns created earlier), and DataFrame.corr()
# raises on those in pandas >= 2.0, while older versions dropped them silently.
numeric_corr = housing_df.select_dtypes(include='number').corr()

plt.figure(figsize=(10, 8))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Price distribution per 'mainroad' category, drawn as box plots.
custom_colors = ["#ff1d15", "#02a9ea"]  # red and blue
plt.figure(figsize=(10, 6))
ax = sns.boxplot(data=housing_df, x='mainroad', y='price', palette=custom_colors)
ax.set_title('Mainroad vs. Price')
ax.set_xlabel('Mainroad')
ax.set_ylabel('Price')
plt.show()

In [None]:
# Having inspected the data, trim extreme prices so they do not dominate the
# analysis: keep rows whose price lies within 1.5 * IQR of the quartiles.
price_col = housing_df['price']
Q1, Q3 = price_col.quantile(.25), price_col.quantile(.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# between() is inclusive on both ends, matching the >= / <= comparison.
housing_data = housing_df[price_col.between(lower_fence, upper_fence)]

In [None]:
# Display the price-filtered frame produced by the IQR step.
housing_data

In [None]:
# Histogram of prices after the IQR outlier filter (20 bins, KDE overlay).
plt.figure(figsize=(10, 6))
sns.histplot(housing_data['price'], bins=20, kde=True)
plt.title('Price')
plt.xlabel('price')
plt.ylabel('frequency')  # label typo fixed (was 'frecuency')
plt.show()

In [None]:
# Central-tendency summary of the outlier-filtered prices.
filtered_prices = housing_data['price']
median_price, mean_price = filtered_prices.median(), filtered_prices.mean()

# Render both statistics as dollar amounts with two decimals.
formatted_median_price, formatted_mean_price = (
    "${:.2f}".format(amount) for amount in (median_price, mean_price)
)

print('Median price of home in the dataset is: ', formatted_median_price)
print('Mean price of home in the dataset is: ', formatted_mean_price)

In [None]:
# Area vs. price: scatter plot with a least-squares line of best fit.
# (LinearRegression is imported in the notebook's top import cell.)

# Fit the model first so the plotting code below only draws.
X = housing_data[['area']]  # double brackets keep X two-dimensional for scikit-learn
y = housing_data['price']
regressor = LinearRegression()
regressor.fit(X, y)

plt.figure(figsize=(10, 6))
sns.scatterplot(x='area', y='price', data=housing_data, label='Data Point')

# Overlay the fitted values on the same axes as the regression line.
plt.plot(X, regressor.predict(X), color='red', label='Line of Best Fit')

plt.title('Scatter Plot of Area vs. Price with Line of Best Fit')
plt.legend()
plt.show()