<a href="https://colab.research.google.com/github/drfperez/openair/blob/main/2026/Gencat2Openair.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import pandas as pd
from google.colab import files

# ==================================================
# 1. ROBUST CSV LOADING
# ==================================================
# Ask the user to upload a file through the Colab upload widget; the keys of
# the returned mapping are used as the uploaded file names.
print("Carrega el teu fitxer CSV:")
uploaded = files.upload()
if not uploaded:
    # Nothing was uploaded: fail with a clear message instead of the bare
    # IndexError/StopIteration the file-name lookup below would raise.
    raise ValueError("No s'ha carregat cap fitxer.")
# Only the first uploaded file is processed.
file_name = next(iter(uploaded))

# sep=None lets the python engine sniff the delimiter, decimal=',' parses
# comma decimals and 'utf-8-sig' strips a leading BOM when present.
df = pd.read_csv(
    file_name,
    sep=None,
    engine='python',
    decimal=',',
    encoding='utf-8-sig'
)

# A single resulting column means the file was not split into fields.
if df.shape[1] == 1:
    raise ValueError("‚ùå El CSV s'ha carregat com una sola columna.")

print(f"‚úî CSV carregat: {df.shape[0]} files, {df.shape[1]} columnes")

# ==================================================
# 2. NORMALISE COLUMN NAMES AND MAKE THEM UNIQUE
# ==================================================
# Strip surrounding whitespace first; this may itself create duplicates
# (e.g. "data " and "data").
df.columns = df.columns.str.strip()

# Repeated names get a numeric suffix (col, col_1, col_2, ...). The suffix is
# advanced until the candidate is not already taken, so a generated name can
# never collide with a column that is literally called "col_1".
taken = set()
next_suffix = {}
unique_cols = []
for col in df.columns:
    candidate = col
    while candidate in taken:
        next_suffix[col] = next_suffix.get(col, 0) + 1
        candidate = f"{col}_{next_suffix[col]}"
    taken.add(candidate)
    unique_cols.append(candidate)
df.columns = unique_cols

# ==================================================
# 3. KEY COLUMN DETECTION
# ==================================================
def _is_text_column(series):
    """Return True when *series* has object dtype or a dedicated string dtype.

    Checking only ``dtype == object`` would skip every text column when pandas
    stores strings with its own string dtype, making detection always fail.
    """
    dtype = series.dtype
    return pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)


def _first_text_column(frame, predicate):
    """Return the first text column of *frame* for which *predicate* holds, else None.

    *predicate* receives the column as a Series and must return a boolean.
    """
    for name in frame.columns:
        column = frame[name]
        if _is_text_column(column) and predicate(column):
            return name
    return None


# Date column: first text column with at least one value containing YYYY-MM-DD.
data_col = _first_text_column(
    df, lambda s: s.str.contains(r"\d{4}-\d{2}-\d{2}", na=False).any()
)
if data_col is None:
    raise ValueError("‚ùå No s'ha pogut detectar la columna de data.")

# Pollutant column: first text column with at least one value made only of
# 1 to 5 uppercase ASCII letters.
contaminant_col = _first_text_column(
    df, lambda s: s.str.match(r"^[A-Z]{1,5}$", na=False).any()
)
if contaminant_col is None:
    raise ValueError("‚ùå No s'ha pogut detectar la columna de contaminant.")

print(f"‚úî Columna data detectada: {data_col}")
print(f"‚úî Columna contaminant detectada: {contaminant_col}")

# Canonical names used by every later step.
df = df.rename(columns={data_col: 'data', contaminant_col: 'pollutant'})

# ==================================================
# 4. DATETIME CONVERSION
# ==================================================
# Unparseable entries become NaT.
df['data'] = pd.to_datetime(df['data'], errors='coerce')

# Rows without a usable date are dropped rather than stamped with a made-up
# 1900-01-01 date, which would end up in the output as if it were a real
# measurement time.
invalid_dates = df['data'].isna()
if invalid_dates.any():
    print(f"Nota: s'han descartat {int(invalid_dates.sum())} files sense data interpretable.")
    df = df.loc[~invalid_dates].copy()

# ==================================================
# 5. HOUR COLUMN VALIDATION
# ==================================================
# The 24 expected hourly columns: h01 .. h24.
hores = [f"h{i:02}" for i in range(1, 25)]

# Any hour column missing from the input is created with an 'NA' placeholder
# so that all 24 columns exist afterwards.
for h in hores:
    if h in df.columns:
        continue
    df[h] = 'NA'

# Convert every hour column to numbers; anything non-numeric (including the
# placeholder above) becomes NaN.
df[hores] = df[hores].apply(lambda column: pd.to_numeric(column, errors='coerce'))

# ==================================================
# 6. EXPAND HOURS
# ==================================================
# One row per (day, pollutant, hour column).
df_long = df.melt(
    id_vars=['data', 'pollutant'],
    value_vars=hores,
    var_name='hora',
    value_name='value'
)

# 'hNN' -> zero-based hour offset (h01 -> 0 ... h24 -> 23), added to the day.
hour_offset = df_long['hora'].str.extract(r'(\d+)', expand=False).astype(int) - 1
df_long['data'] = df_long['data'] + pd.to_timedelta(hour_offset, unit='h')
df_long = df_long.drop(columns='hora')

# 'value' deliberately stays numeric (NaN for gaps). Replacing gaps with the
# string 'NA' here would turn the column into object dtype and make the
# aggregation below pick the first value instead of averaging. The 'NA'
# marker is written only at export time.

# ==================================================
# 7. PIVOT WIDE (date, descending order)
# ==================================================
# Mean value per timestamp and pollutant (NaN values are ignored by mean;
# groups with no valid value stay NaN), one column per pollutant.
pivot_wide = (
    df_long.groupby(['data', 'pollutant'])['value']
    .mean()
    .unstack('pollutant')
    .reset_index()
)

pivot_wide = pivot_wide.sort_values('data', ascending=False)
pivot_wide.columns = ['date' if col == 'data' else col.lower() for col in pivot_wide.columns]

# ==================================================
# 8. PIVOT LONG (date, descending order)
# ==================================================
pivot_long = df_long.copy()
pivot_long['pollutant'] = pivot_long['pollutant'].str.lower()
pivot_long = pivot_long[['data', 'pollutant', 'value']]
pivot_long = pivot_long.rename(columns={'data': 'date'})
pivot_long = pivot_long.sort_values('date', ascending=False)

# ==================================================
# 9. SAVE RESULTS
# ==================================================
wide_file = 'processed_data_wide.csv'
long_file = 'processed_data_long.csv'

# Missing values are written as the literal 'NA'.
pivot_wide.to_csv(wide_file, index=False, na_rep='NA')
pivot_long.to_csv(long_file, index=False, na_rep='NA')

# Trigger the browser download of both files from Colab.
files.download(wide_file)
files.download(long_file)

print("‚úÖ PROC√âS COMPLETAT CORRECTAMENT")
print(f"üìÅ Fitxer wide: {wide_file}, {pivot_wide.shape[0]} files, {pivot_wide.shape[1]} columnes")
print(f"üìÅ Fitxer long: {long_file}, {pivot_long.shape[0]} files, {pivot_long.shape[1]} columnes")