<a href="https://colab.research.google.com/github/drfperez/openair/blob/main/2026/Gencat2Openair.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
from google.colab import files
import zipfile

# ==================================================
# 1. ROBUST CSV LOADING
# ==================================================
# Ask the user for a file through the Colab upload widget. The widget returns
# a dict keyed by the name under which each file was saved.
print("Carrega el teu fitxer CSV:")
uploaded = files.upload()

# Cancelling the dialog yields an empty dict; fail with a clear message
# instead of an IndexError on the lookup below.
if not uploaded:
    raise ValueError("No s'ha carregat cap fitxer.")
file_name = next(iter(uploaded))

# sep=None makes the python engine sniff the delimiter; decimal=',' reads
# comma decimals; 'utf-8-sig' strips a leading BOM if the file has one.
df = pd.read_csv(
    file_name,
    sep=None,
    engine='python',
    decimal=',',
    encoding='utf-8-sig'
)

# A single column means the delimiter was not recognised.
if df.shape[1] == 1:
    raise ValueError("‚ùå El CSV s'ha carregat com una sola columna.")

print(f"‚úî CSV carregat: {df.shape[0]} files, {df.shape[1]} columnes")

# ==================================================
# 2. NORMALIZE COLUMNS
# ==================================================
# Trim surrounding whitespace from every header, then make repeated names
# unique: the first occurrence keeps its name, later ones get _1, _2, ...
stripped_names = df.columns.str.strip()
occurrences = {}
unique_names = []
for name in stripped_names:
    count = occurrences.get(name, -1) + 1
    occurrences[name] = count
    unique_names.append(name if count == 0 else f"{name}_{count}")
df.columns = unique_names

# ==================================================
# 3. KEY COLUMN DETECTION
# ==================================================
def _first_text_column_matching(frame, pattern, full_match=False):
    """Return the name of the first text column holding a value that matches.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table whose columns are scanned in order.
    pattern : str
        Regular expression to look for.
    full_match : bool, default False
        When True the pattern is applied with ``str.match`` (anchored at the
        start of each value); otherwise with ``str.contains``.

    Returns
    -------
    The matching column label, or None when no text column qualifies.
    """
    for name in frame.columns:
        column = frame[name]
        # Test the dtype rather than comparing with `object`, so columns
        # stored with a dedicated pandas string dtype are scanned as well.
        if not pd.api.types.is_string_dtype(column.dtype):
            continue
        if full_match:
            hits = column.str.match(pattern, na=False)
        else:
            hits = column.str.contains(pattern, na=False)
        if hits.any():
            return name
    return None


# Date column: first text column containing something shaped like YYYY-MM-DD.
data_col = _first_text_column_matching(df, r"\d{4}-\d{2}-\d{2}")

# Pollutant column: first text column with a value made of 1-5 capital letters.
contaminant_col = _first_text_column_matching(df, r"^[A-Z]{1,5}$", full_match=True)

# Stop here with a clear message; otherwise rename() would silently ignore the
# None key and the failure would surface later as a KeyError.
if data_col is None:
    raise ValueError("No s'ha trobat cap columna de data (format AAAA-MM-DD).")
if contaminant_col is None:
    raise ValueError("No s'ha trobat cap columna de contaminant.")

df = df.rename(columns={data_col: 'data', contaminant_col: 'pollutant'})

# ==================================================
# 4. DATETIME CONVERSION
# ==================================================
# Values that cannot be parsed become NaT instead of raising.
df['data'] = pd.to_datetime(df['data'], errors='coerce')

# Rows without a usable date are dropped and reported. Giving them a
# placeholder date would put fabricated timestamps in the exported series.
invalid_dates = df['data'].isna()
if invalid_dates.any():
    print(f"S'han descartat {int(invalid_dates.sum())} files sense data interpretable.")
    df = df.loc[~invalid_dates].copy()

# ==================================================
# 5. HOUR COLUMN VALIDATION
# ==================================================
# Names of the 24 hourly columns: h01 ... h24.
hores = [f"h{hour:02}" for hour in range(1, 25)]

# Hourly columns absent from the file are created with the 'NA' placeholder
# so the next step can treat all 24 columns uniformly.
absent_hours = [name for name in hores if name not in df.columns]
for name in absent_hours:
    df[name] = 'NA'

# Force every hourly column to numeric; anything that is not a number
# (including the placeholder above) becomes NaN.
df[hores] = df[hores].apply(pd.to_numeric, errors='coerce')

# ==================================================
# 6. EXPAND HOURS
# ==================================================
# One row per (day, pollutant, hourly column).
df_long = df.melt(
    id_vars=['data', 'pollutant'],
    value_vars=hores,
    var_name='hora',
    value_name='value'
)

# 'h01' ... 'h24' -> offsets of 0 ... 23 hours added to the day.
hour_offset = df_long['hora'].str.extract(r'(\d+)', expand=False).astype(int) - 1
df_long['data'] = df_long['data'] + pd.to_timedelta(hour_offset, unit='h')
df_long = df_long.drop(columns='hora')

# Missing readings stay as NaN so 'value' remains a numeric column. Filling
# them with the string 'NA' here would make the column object-typed and stop
# the aggregation below from averaging; "NA" is written only at export time.

# ==================================================
# 7. PIVOT WIDE (date, descending)
# ==================================================
# One column per pollutant. Several readings for the same timestamp and
# pollutant are averaged (NaN ignored); a timestamp is kept even when every
# reading for it is missing.
pivot_wide = (
    df_long
    .groupby(['data', 'pollutant'])['value']
    .mean()
    .unstack('pollutant')
    .reset_index()
    .sort_values('data', ascending=False)
)
pivot_wide.columns = ['date' if c == 'data' else c.lower() for c in pivot_wide.columns]

# ==================================================
# 8. PIVOT LONG (date, descending)
# ==================================================
# Same data in long form: one row per timestamp and pollutant, with the
# pollutant name lower-cased.
pivot_long = (
    df_long[['data', 'pollutant', 'value']]
    .assign(pollutant=lambda frame: frame['pollutant'].str.lower())
    .rename(columns={'data': 'date'})
    .sort_values('date', ascending=False)
)

# ==================================================
# 9. SAVE CSV
# ==================================================
wide_file = 'processed_data_wide.csv'
long_file = 'processed_data_long.csv'

# na_rep writes missing values as the literal text NA in the files.
pivot_wide.to_csv(wide_file, index=False, na_rep='NA')
pivot_long.to_csv(long_file, index=False, na_rep='NA')

# ==================================================
# 10. ZIP (COLAB WORKAROUND)
# ==================================================
# Bundle both CSV files into one compressed archive so that a single
# download request is issued.
zip_name = 'processed_results.zip'
with zipfile.ZipFile(zip_name, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    for csv_path in (wide_file, long_file):
        archive.write(csv_path)

# Hand the archive to the browser.
files.download(zip_name)

print("‚úÖ PROC√âS COMPLETAT")
print("üì¶ Descarregat: processed_results.zip (wide + long)")

Carrega el teu fitxer CSV:


Saving vallhebron.csv to vallhebron (1).csv
✔ CSV carregat: 41898 files, 40 columnes
✔ Columna data detectada: data
✔ Columna contaminant detectada: contaminant


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ PROCÉS COMPLETAT CORRECTAMENT
📁 Fitxer wide: processed_data_wide.csv, 157128 files, 9 columnes
📁 Fitxer long: processed_data_long.csv, 1005552 files, 3 columnes
