## Import all necessary libraries

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### 1. Share the dataset source link, a brief description of its content and your motivation for choosing it.
### Link: <br>
"https://opendata-ajuntament.barcelona.cat/data/es/dataset/est-cadastre-habitatges-superficie-mitjana/resource/d309f5d0-06e1-47e2-accd-e0e2f08e275b"

### Description and motivation: 
It's a CSV file from the open data portal of the official Ajuntament de Barcelona website. It contains the average surface area of properties in the city of Barcelona whose main use and purpose is housing. I've chosen this topic because I'm an international student who is looking for housing right now, and my main criteria are location and surface area (also price, but since this is an official dataset from Barcelona's City Council, it can't really include market speculation figures). <p>
It's a fun topic that comes in handy in my current situation.




### 2. Load the dataset using Pandas.


In [79]:
# Read the 2025 properties CSV into a DataFrame
barcelona_houses_2025_df = pd.read_csv(filepath_or_buffer='2025_properties.csv')

# Preview the top five rows to confirm the file was parsed as expected
barcelona_houses_2025_df.head(n=5)

Unnamed: 0,Year,Code_district,Name_district,Code_barrio,Name_barrio,Seccion_censal,Sup_media_m2
0,2025,1,Ciutat Vella,1,el Raval,1,76.7
1,2025,1,Ciutat Vella,1,el Raval,2,68.1
2,2025,1,Ciutat Vella,1,el Raval,3,59.1
3,2025,1,Ciutat Vella,1,el Raval,4,65.4
4,2025,1,Ciutat Vella,1,el Raval,5,76.6


In [68]:
# Let's also confirm the dataset meets the requirements:
# between 500 and 2000 rows and between 5 and 10 columns.
# The asserts make the check explicit instead of relying on reading the output.
n_rows, n_cols = barcelona_houses_2025_df.shape
assert 500 <= n_rows <= 2000, f"Expected between 500 and 2000 rows, found {n_rows}"
assert 5 <= n_cols <= 10, f"Expected between 5 and 10 columns, found {n_cols}"
barcelona_houses_2025_df.shape

(1068, 9)

### 3. Make some basic data profiling (statistics about the data, distributions, missing values, etc.) using Pandas methods.

In [None]:
# Summary statistics of the numeric columns via describe(),
# rounded to one decimal place for readability
numeric_summary = barcelona_houses_2025_df.describe()
numeric_summary.round(decimals=1)

Unnamed: 0,Year,Cod_district,Code_barrio,Seccion_censal,Sup_media_m2
count,1068.0,1068.0,1068.0,1068.0,1068.0
mean,2025.0,5.7,33.2,60.6,77.9
std,0.0,2.9,21.9,41.6,20.1
min,2025.0,1.0,1.0,1.0,36.1
25%,2025.0,3.0,13.0,27.8,66.3
50%,2025.0,6.0,31.0,54.0,73.5
75%,2025.0,8.0,52.0,87.0,82.8
max,2025.0,10.0,73.0,237.0,233.8


In [39]:
# Count the missing values in each column
barcelona_houses_2025_df.isna().sum(axis=0)

Year              0
Cod_district      0
Name_district     0
Code_barrio       0
Name_barrio       0
Seccion_censal    0
Sup_media_m2      0
dtype: int64

In [92]:
# Report how many distinct districts and barrios (neighborhoods) the CSV contains
print(f"Districts found: {barcelona_houses_2025_df['Name_district'].nunique()}")
print(f"Neighborhoods found: {barcelona_houses_2025_df['Name_barrio'].nunique()}")

# Count the distinct neighborhoods inside each district and order the
# districts from the most neighborhoods to the fewest
neighborhoods_per_district = (
    barcelona_houses_2025_df
    .groupby(['Code_district', 'Name_district'])['Name_barrio']
    .nunique()
    .reset_index()
    .sort_values(by='Name_barrio', ascending=False)
)

# Only the displayed table gets the friendlier column name; the variable
# itself keeps the 'Name_barrio' column.
neighborhoods_per_district.rename(columns={'Name_barrio': "Num_of_barrios"})

Districts found: 10
Neighborhoods found: 73


Unnamed: 0,Code_district,Name_district,Num_of_barrios
7,8,Nou Barris,13
6,7,Horta-Guinardó,11
9,10,Sant Martí,10
2,3,Sants-Montjuïc,8
8,9,Sant Andreu,7
1,2,L'Eixample,6
4,5,Sarrià-Sant Gervasi,6
5,6,Gràcia,5
0,1,Ciutat Vella,4
3,4,Les Corts,3


### 4. Create 2 new useful or interesting columns based on other existing columns in the dataset (e.g., a ratio, a category based on numerical values, etc.).

In [81]:
#New column 1: Let's classify the properties as "Small", "Medium" or "Large" based on the surface area.
def property_size(m2, small_below=60, large_above=90):
    """Classify a surface area (in m2) into a size category.

    Parameters
    ----------
    m2 : float
        Surface area to classify.
    small_below : float, default 60
        Areas strictly below this value are labelled 'Small'.
    large_above : float, default 90
        Areas strictly above this value are labelled 'Large'.

    Returns
    -------
    str or None
        'Small', 'Medium' (small_below <= m2 <= large_above) or 'Large'.
        None when ``m2`` is missing (None/NaN), so that a missing area is
        not silently reported as 'Large'.
    """
    # Every comparison with NaN is False, so without this guard a missing
    # value would fall through to the final 'Large' branch.
    if pd.isna(m2):
        return None
    if m2 < small_below:
        return 'Small'
    # Reaching this point already implies m2 >= small_below.
    if m2 <= large_above:
        return 'Medium'
    return 'Large'

# New column 1: label every row 'Small', 'Medium' or 'Large' from its surface area
surface_m2 = barcelona_houses_2025_df['Sup_media_m2']
barcelona_houses_2025_df['Size_category'] = surface_m2.apply(property_size)

# New column 2: compute the mean surface over all rows and flag each row with
# True when its 'Sup_media_m2' is strictly larger than that mean, False otherwise
city_property_avg = surface_m2.mean()
barcelona_houses_2025_df['Is_above_avg'] = surface_m2 > city_property_avg

# Show the new columns next to their inputs, with the above-average rows first
preview_columns = ['Name_barrio', 'Sup_media_m2', 'Size_category', 'Is_above_avg']
barcelona_houses_2025_df[preview_columns].sort_values(by='Is_above_avg', ascending=False)

Unnamed: 0,Name_barrio,Sup_media_m2,Size_category,Is_above_avg
534,la Vila de Gràcia,93.7,Large,True
373,les Corts,92.9,Large,True
406,Sarrià,146.6,Large,True
405,Sarrià,78.0,Medium,True
404,Sarrià,111.3,Large,True
...,...,...,...,...
550,la Vila de Gràcia,68.2,Medium,False
551,la Vila de Gràcia,65.4,Medium,False
552,la Vila de Gràcia,69.2,Medium,False
553,la Vila de Gràcia,58.9,Small,False
