## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Shared encoder instance. Every `fit_transform` call refits it, so once
# several columns have been encoded it only holds the classes of the last one.
le = LabelEncoder()

## Load Data

In [None]:
# Read both Fotocasa CSV exports in one pass: the base file first, then the
# 2023 file with located districts.
csv_paths = (
    '../scrapers/csv/fotocasa.csv',
    '../scrapers/csv/fotocasa_2023_located_districts.csv',
)
df_housing, df_housing_2023 = map(pd.read_csv, csv_paths)

# Cell output: (rows, columns) of each frame.
(df_housing.shape, df_housing_2023.shape)

## Preprocess

In [None]:
# Normalise the 2023 column names to lower case (df_housing is left as-is).
df_housing_2023 = df_housing_2023.rename(columns=str.lower)

In [None]:
# Missing zip codes become 0 so the column can be stored as integers.
# The same cleanup is applied to both frames, 2023 first.
for listings in (df_housing_2023, df_housing):
    listings['zipcode'] = listings['zipcode'].fillna(0).astype(int)

In [None]:
# Preview the first five rows after the zip code cleanup.
df_housing.head(n=5)

Use the geolocated district (produced by running `geolocate.py`) as the `district` column in place of the previous one, because it is more accurate.

In [None]:
# Promote the geolocated district to be the `district` column. Any
# pre-existing `district` column is dropped first so the rename cannot leave
# two columns with the same name. The guard keeps the cell safe to re-run:
# once `district_geolocated` is gone, nothing is dropped or renamed.
if 'district_geolocated' in df_housing_2023.columns:
    df_housing_2023 = (
        df_housing_2023
        .drop(columns='district', errors='ignore')
        .rename(columns={'district_geolocated': 'district'})
    )

In [None]:
# A price of exactly 0.0 marks a listing without a price; count them per frame.
zero_price_count = int((df_housing['price'] == 0.0).sum())
zero_price_count_2023 = int((df_housing_2023['price'] == 0.0).sum())

print('Number of properties without price from df_housing: {}'.format(zero_price_count))
print('Number of properties without price from df_housing_2023: {}'.format(zero_price_count_2023))

In [None]:
# Keep only listings that have a price. `.copy()` gives each filtered frame
# its own data, so assigning new columns to it in later cells does not raise
# SettingWithCopyWarning on a slice of the original frame.
df_housing = df_housing[df_housing['price'] != 0.0].copy()
df_housing_2023 = df_housing_2023[df_housing_2023['price'] != 0.0].copy()

In [None]:
# Columns of df_housing that have no counterpart in df_housing_2023.
# The 2023 column names were lower-cased during preprocessing while the
# df_housing names were not, so the comparison is case-insensitive; a plain
# set difference would report a camelCase column as missing even when the
# 2023 frame holds the same column in lower case.
columns_2023 = {str(col).lower() for col in df_housing_2023.columns}
{col for col in df_housing.columns if str(col).lower() not in columns_2023}

Next, check whether the columns {'conservationState', 'garden'} are correlated with the price of the property; if they are not, we will remove them.

In [None]:
# Add an integer-encoded companion column for each location field.
# The shared encoder is refit on every pass, district first, so it ends up
# fitted on `neighbourhood`.
for source_col in ('district', 'neighbourhood'):
    df_housing[source_col + '_encoded'] = le.fit_transform(df_housing[source_col])

## EDA

In [None]:
# Parse the listing dates, then count how many properties fall in each year.
df_housing['date'] = pd.to_datetime(df_housing['date'], format='ISO8601')
properties_per_year = df_housing['date'].dt.year.value_counts().sort_index()

# Draw the bar chart once. The original cell plotted the same series a second
# time on the same axes, which could overwrite the x label and left an Axes
# repr as the cell output.
ax = properties_per_year.plot(kind='bar')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Properties')
plt.show()

### Price

In [None]:
# Summary statistics of the listing prices (count, mean, std, quartiles).
price_summary = df_housing['price'].describe()
price_summary

In [None]:
# Histogram of listing prices, drawn on an explicit axes object.
fig, ax = plt.subplots(figsize=(10, 4))

ax.set_title('Price distribution')
df_housing['price'].hist(bins=50, ax=ax)

plt.show()

In [None]:
# Mean listing price per district, highest first.
mean_price_by_district = (
    df_housing.groupby(by='district')['price'].mean().sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(6, 4))
mean_price_by_district.plot(kind='bar', ax=ax)
ax.set_title('Price distribution by district')
# The bars show a mean, so say so on the axis.
ax.set_ylabel('Mean price')

# bbox_inches='tight' grows the saved canvas to fit the rotated district
# labels, which can otherwise be clipped at this figure size.
fig.savefig('../figs/price_distribution_by_district.png', bbox_inches='tight')
plt.show()

In [None]:
# Working copy that excludes listings whose surface is recorded as 0.
has_surface = df_housing['surface'] != 0.0
df_copy = df_housing.loc[has_surface].reset_index(drop=True)

# Surface stats come from the filtered copy; price stats come from the
# unfiltered df_housing frame.
for summary in (df_copy['surface'].describe(), df_housing['price'].describe()):
    print(summary)

In [None]:

# Price against surface, using the copy that excludes zero-surface listings.
# The original cell plotted `rooms` on the x axis under a "price and surface"
# title; x is now `surface` to match the title and the filter behind df_copy.
fig, ax = plt.subplots(figsize=(6, 4))

sns.scatterplot(data=df_copy, x='surface', y='price', ax=ax)
ax.set_title('Correlation between price and surface')

plt.show()

In [None]:
# Pairwise correlations between the numeric (and boolean) columns.
# `numeric_only=True` leaves out every non-numeric column, including the
# parsed datetime `date` column, which merely excluding object dtype would
# keep and which cannot be correlated meaningfully.
correlation_matrix = df_housing.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(8, 8))
# Pin the colour scale to the full correlation range so the diverging
# colormap is centred on 0.
sns.heatmap(correlation_matrix, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Price against surface for the 2023 listings, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_title('Correlation between price and surface')
sns.scatterplot(x='surface', y='price', data=df_housing_2023, ax=ax)

plt.show()

In [None]:
# Price against number of bathrooms for the 2023 listings.
# The title previously said "price and surface" (copied from the cell above)
# although the x axis is `bathrooms`.
fig, ax = plt.subplots(figsize=(6, 4))

sns.scatterplot(data=df_housing_2023, x='bathrooms', y='price', ax=ax)
ax.set_title('Correlation between price and bathrooms')

plt.show()