In [10]:
import pandas as pd
import glob

# Input/output locations and unit conversion used by the pipeline below.
INPUT_PATTERN = "pm25/*.csv"  # Adjust the path to your files
OUTPUT_PATH = 'pm25.csv'

# Conversion: 1 kg/m³ = 1e9 µg/m³
KG_PER_M3_TO_UG_PER_M3 = 1e9

# Columns written to the exported CSV ('geometry' is a copy of '.geo').
columns_to_keep = ['LEVL_CODE', 'NUTS_NAME', 'NUTS_ID', 'mean', 'Shape_Area', 'Shape_Leng', 'month', 'year', 'geometry']


def load_csvs(paths):
    """Read every CSV in *paths* and stack the rows into one DataFrame.

    Rows appear in the order of *paths* and the index is renumbered from 0.

    Raises:
        FileNotFoundError: if *paths* is empty. Without this check
            ``pd.concat`` fails with "No objects to concatenate", which
            hides the real cause (a wrong input path or pattern).
    """
    paths = list(paths)
    if not paths:
        raise FileNotFoundError(
            "No CSV files to combine; check the input path/pattern"
        )
    return pd.concat([pd.read_csv(path) for path in paths], ignore_index=True)


def clean_pm25(raw):
    """Return a cleaned copy of *raw*; the input frame is not modified.

    Steps, in order:
      1. drop fully duplicated rows;
      2. drop rows whose 'mean' is missing;
      3. convert 'mean' from kg/m³ to µg/m³;
      4. round 'mean' to 5 decimal places.
    """
    deduped = raw.drop_duplicates().dropna(subset=['mean'])
    # Vectorised rounding instead of a per-element Python lambda.
    scaled = (deduped['mean'] * KG_PER_M3_TO_UG_PER_M3).round(5)
    # assign() returns a new frame, so the caller's data stays untouched.
    return deduped.assign(mean=scaled)


# __name__ is "__main__" both in a Jupyter kernel and under `python file.py`,
# so the pipeline still runs there, while importing this file (e.g. from a
# test) only defines the helpers above without touching the filesystem.
if __name__ == "__main__":
    # Step 1: Combine all CSV files.
    # glob returns paths in arbitrary order; sort them so the row order of
    # the exported file is reproducible between runs.
    files = sorted(glob.glob(INPUT_PATTERN))
    combined_data = load_csvs(files)

    # Steps 2-4: Remove duplicates and missing means, convert units, round.
    combined_data = clean_pm25(combined_data)

    # Step 5: Check the columns available in the DataFrame
    print(combined_data.columns)

    # Step 6: Expose '.geo' under the clearer name 'geometry'.
    # This is a copy, not a rename: '.geo' stays in combined_data and is
    # only left out of the export by the column selection below.
    combined_data['geometry'] = combined_data['.geo']

    # Step 7: Filter the data to keep only the selected columns
    final_data = combined_data[columns_to_keep]

    # Step 8: Export cleaned data to a new CSV file
    final_data.to_csv(OUTPUT_PATH, index=False)

    print("Data cleaned and saved as 'pm25.csv'")


Index(['system:index', 'CNTR_CODE', 'COAST_TYPE', 'LEVL_CODE', 'MOUNT_TYPE',
       'NAME_LATN', 'NUTS_ID', 'NUTS_NAME', 'Shape_Area', 'Shape_Leng',
       'URBN_TYPE', 'day', 'mean', 'month', 'start_date', 'year', '.geo'],
      dtype='object')
Data cleaned and saved as 'pm25.csv'
