# Air Quality in Beijing

## Initial Data Exploration and Cleaning

In [None]:
#-----------------------------------------------------------------------------------------
# Name:        Practice Day 7
# Author:      Erick Rico
# Created:     09/08/2025
#-----------------------------------------------------------------------------------------

#### Configuración del entorno:

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the Beijing PM2.5 dataset from a CSV file located relative to the
# notebook's working directory.
csv_path = "data-pm2.5.csv"
df = pd.read_csv(csv_path)

#### Conversión a NumPy:

In [4]:
# List the column labels to see which variables the dataset provides.
df.columns

Index(['No', 'year', 'month', 'day', 'hour', 'pm2.5', 'DEWP', 'TEMP', 'PRES',
       'cbwd', 'Iws', 'Is', 'Ir'],
      dtype='object')

In [8]:
# Extract the PM2.5 column as a NumPy array. `to_numpy()` is the accessor
# pandas recommends over `.values`, and it matches how the TEMP column is
# extracted in the next cell. Missing readings stay as NaN.
arr_pm = df["pm2.5"].to_numpy()
arr_pm

array([nan, nan, nan, ..., 10.,  8., 12.])

In [9]:
# Extract the TEMP column as a NumPy array and display it.
arr_temp = df["TEMP"].to_numpy()
arr_temp

array([-11., -12., -11., ...,  -3.,  -4.,  -3.])

#### Cálculos Vectorizados (sin bucles):

In [12]:
# NaN-aware mean, standard deviation and maximum of PM2.5, printed one per
# line. np.nanstd defaults to ddof=0 (population standard deviation).
for nan_stat in (np.nanmean, np.nanstd, np.nanmax):
    print(nan_stat(arr_pm))

98.61321455085375
92.04928496759753
994.0


In [11]:
# pandas summary of the PM2.5 column, used to cross-check the NumPy results.
# The `std` shown here is the sample standard deviation (ddof=1), so it is
# slightly larger than the np.nanstd value, which uses ddof=0 by default.
df["pm2.5"].describe()

count    41757.000000
mean        98.613215
std         92.050387
min          0.000000
25%         29.000000
50%         72.000000
75%        137.000000
max        994.000000
Name: pm2.5, dtype: float64

##### Normalización:

In [25]:
# Min-max scaling of the raw array. The extremes are computed with the
# NaN-aware reducers, but NaN entries themselves still propagate to the result.
pm_min = np.nanmin(arr_pm)
pm_range = np.nanmax(arr_pm) - pm_min
(arr_pm - pm_min) / pm_range

array([       nan,        nan,        nan, ..., 0.01006036, 0.00804829,
       0.01207243])

In [14]:
# Drop the missing readings: build a mask of NaN positions and keep the rest.
nan_mask = np.isnan(arr_pm)
arr_pm_clean = arr_pm[~nan_mask]

In [24]:
# Min-max scaling of the NaN-free array: the smallest reading maps to 0 and
# the largest to 1.
clean_min = arr_pm_clean.min()
clean_max = arr_pm_clean.max()
norm_pm = (arr_pm_clean - clean_min) / (clean_max - clean_min)
norm_pm

array([0.12977867, 0.14889336, 0.15995976, ..., 0.01006036, 0.00804829,
       0.01207243])

##### Comparación (filtrado booleano, PM2.5 > 50):

In [16]:
# Boolean mask: True where the cleaned PM2.5 reading is above the threshold.
pm_threshold = 50
filtro = arr_pm_clean > pm_threshold
filtro

array([ True,  True,  True, ..., False, False, False])

In [18]:
# Apply the boolean mask to keep only the readings greater than 50.
arr_pm_mayor_50 = arr_pm_clean[filtro]
arr_pm_mayor_50

array([129., 148., 159., ...,  60.,  63.,  79.])

#### Comparación de rendimiento:

In [23]:
%%timeit
i = 0
for j in arr_pm_mayor_50:
    i += j
avg = i / len(arr_pm_mayor_50)
avg

2.22 ms ± 183 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [22]:
# Vectorized counterpart of the loop benchmark: the same average via np.mean.
%timeit np.mean(arr_pm_mayor_50)

12 μs ± 849 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
