<a href="https://colab.research.google.com/github/drfperez/openair/blob/main/2026/Gencat2Openair.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
from google.colab import files

# ==================================================
# 1. C√ÄRREGA ROBUSTA DEL CSV
# ==================================================
print("Carrega el teu fitxer CSV:")
uploaded = files.upload()
file_name = list(uploaded.keys())[0]

df = pd.read_csv(
    file_name,
    sep=None,               # autodetecci√≥ real
    engine='python',
    decimal=',',
    encoding='utf-8-sig'
)

if df.shape[1] == 1:
    raise ValueError("‚ùå El CSV s'ha carregat com una sola columna.")

print(f"‚úî CSV carregat: {df.shape[0]} files, {df.shape[1]} columnes")

# ==================================================
# 2. NORMALITZAR I FER √öNIQUES LES COLUMNES
# ==================================================
df.columns = df.columns.str.strip()

# Fer els noms √∫nics (molt important!)
seen = {}
new_cols = []
for col in df.columns:
    if col in seen:
        seen[col] += 1
        new_cols.append(f"{col}_{seen[col]}")
    else:
        seen[col] = 0
        new_cols.append(col)

df.columns = new_cols

# ==================================================
# 3. DETECCI√ì CORRECTA DE COLUMNES CLAU
# ==================================================

# --- columna de data: ha de contenir timestamps ISO
data_col = None
for col in df.columns:
    if df[col].dtype == object:
        if df[col].str.contains(r"\d{4}-\d{2}-\d{2}", na=False).any():
            data_col = col
            break

if data_col is None:
    raise ValueError("‚ùå No s'ha pogut detectar la columna de data.")

# --- columna de contaminant: NOX, CO, O3...
contaminant_col = None
for col in df.columns:
    if df[col].dtype == object:
        if df[col].str.match(r"^[A-Z]{1,5}$", na=False).any():
            contaminant_col = col
            break

if contaminant_col is None:
    raise ValueError("‚ùå No s'ha pogut detectar la columna de contaminant.")

print(f"‚úî Columna data detectada: {data_col}")
print(f"‚úî Columna contaminant detectada: {contaminant_col}")

df = df.rename(columns={
    data_col: 'data',
    contaminant_col: 'contaminant'
})

# ==================================================
# 4. CONVERSI√ì DE DATA (ROBUSTA)
# ==================================================
df['data'] = pd.to_datetime(df['data'], errors='coerce')

if df['data'].isna().all():
    raise ValueError("‚ùå Cap data v√†lida despr√©s de la conversi√≥.")

# ==================================================
# 5. VALIDACI√ì I NETEJA D‚ÄôHORES
# ==================================================
hores = [f"h{i:02}" for i in range(1, 25)]

missing = [h for h in hores if h not in df.columns]
if missing:
    raise ValueError(f"‚ùå Falten columnes d'hores: {missing}")

df[hores] = df[hores].apply(pd.to_numeric, errors='coerce')

# ==================================================
# 6. EXPANDIR HORES ‚Üí FILES
# ==================================================
df_long = df.melt(
    id_vars=['data', 'contaminant'],
    value_vars=hores,
    var_name='hora',
    value_name='valor'
)

df_long['hora'] = df_long['hora'].str[1:].astype(int) - 1
df_long['data'] = df_long['data'] + pd.to_timedelta(df_long['hora'], unit='h')
df_long = df_long.drop(columns='hora')

df_long = df_long.dropna(subset=['valor'])
if df_long.empty:
    raise ValueError("‚ùå No hi ha dades hor√†ries.")

# ==================================================
# 7. PIVOT ROBUST
# ==================================================
pivot_df = df_long.pivot_table(
    index='data',
    columns='contaminant',
    values='valor',
    aggfunc='mean'
).reset_index()

# ==================================================
# 8. SORTIDA FINAL
# ==================================================
pivot_df = pivot_df.sort_values('data')
pivot_df['data'] = pivot_df['data'].dt.strftime('%Y-%m-%d %H:%M:%S')

pivot_df.columns = [
    'date' if col == 'data' else col.lower()
    for col in pivot_df.columns
]

# ==================================================
# 9. GUARDAR RESULTAT
# ==================================================
output_file = 'processed_data.csv'
pivot_df.to_csv(output_file, index=False)

files.download(output_file)

print("‚úÖ PROC√âS COMPLETAT CORRECTAMENT")
print(f"üìÅ Fitxer: {output_file}")
print(f"üìä Files finals: {pivot_df.shape[0]}")
print(f"üìä Columnes finals: {pivot_df.shape[1]}")


Carrega el teu fitxer CSV:


Saving gava.csv to gava (5).csv
‚úî CSV carregat: 43650 files, 40 columnes
‚úî Columna data detectada: data
‚úî Columna contaminant detectada: contaminant


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

‚úÖ PROC√âS COMPLETAT CORRECTAMENT
üìÅ Fitxer: processed_data.csv
üìä Files finals: 174792
üìä Columnes finals: 8


In [1]:

import pandas as pd
from google.colab import files

# Pujar el fitxer CSV
print("Carrega el teu fitxer CSV:")
uploaded = files.upload()

# Carregar el fitxer CSV
file_name = list(uploaded.keys())[0]
df = pd.read_csv(file_name)

# Comprovar les columnes necess√†ries
required_columns = ['data', 'contaminant', 'h01', 'h02', 'h03', 'h04', 'h05', 'h06', 'h07', 'h08',
                    'h09', 'h10', 'h11', 'h12', 'h13', 'h14', 'h15', 'h16', 'h17', 'h18', 'h19',
                    'h20', 'h21', 'h22', 'h23', 'h24']
if not all(col in df.columns for col in required_columns):
    raise ValueError("El fitxer no cont√© les columnes requerides.")

# Expandir les hores en files amb un format de temps
df = df.melt(id_vars=['data', 'contaminant'],
             value_vars=[f"h{i:02}" for i in range(1, 25)],
             var_name='hora',
             value_name='valor')

# Convertir la columna 'hora' en un format d'hora real
df['hora'] = df['hora'].str.extract('(\d+)').astype(int) - 1  # Convertir 'h01', 'h02', ... a 0-23
df['data'] = pd.to_datetime(df['data'], errors='coerce') + pd.to_timedelta(df['hora'], unit='h')

# Eliminar la columna 'hora' i reestructurar les dades
df = df.drop(columns=['hora'])

# Seleccionar nom√©s les columnes necess√†ries (data, contaminant i les hores expandides)
df = df[['data', 'contaminant', 'valor']]

# Pivotar el DataFrame per tenir una columna per cada contaminant
pivot_df = df.pivot(index='data', columns='contaminant', values='valor').reset_index()

# Ordenar per data
pivot_df = pivot_df.sort_values(by='data')

# Convertir les dates a format amb espai (en lloc de T) per ISO abans de guardar el CSV
pivot_df['data'] = pivot_df['data'].dt.strftime('%Y-%m-%d %H:%M:%S')

# Renombrar la columna 'data' com 'date'
pivot_df = pivot_df.rename(columns={'data': 'date'})

# Convertir els noms dels contaminants a min√∫scules
pivot_df.columns = [col.lower() if isinstance(col, str) and col != 'date' else col for col in pivot_df.columns]

# Substituir els valors buits (NaN) per 'NA'
pivot_df = pivot_df.fillna('NA')

# Guardar el fitxer CSV processat
output_file = 'processed_data.csv'
pivot_df.to_csv(output_file, index=False)

# Descarregar el fitxer processat
files.download(output_file)
print(f"El fitxer processat s'ha desat com: {output_file}")

  df['hora'] = df['hora'].str.extract('(\d+)').astype(int) - 1  # Convertir 'h01', 'h02', ... a 0-23


Carrega el teu fitxer CSV:


Saving gava.csv to gava.csv


ValueError: Index contains duplicate entries, cannot reshape