# This program checks the DataFrame structure, converts the data types, drops unused columns, and saves one CSV file per county.

In [None]:
import pandas as pd 
import os

In [None]:
# Read the raw air-quality CSV into a DataFrame.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-type inference within a column.
path = "../data/raw/air_quality.csv"
df = pd.read_csv(filepath_or_buffer=path, low_memory=False)

# Print a concise summary of the DataFrame structure.
df.info()

In [None]:
# Columns holding air-quality measurements that should be numeric.
numeric_col = [
    "so2", "co", "o3", "o3_8hr", "pm10",
    "pm2.5", "no2", "nox", "no", "windspeed",
    "winddirec", "co_8hr", "pm2.5_avg", "pm10_avg", "so2_avg",
]

# Convert each measurement column to a numeric dtype. errors="coerce" turns
# every value that cannot be parsed as a number into NaN instead of raising.
df[numeric_col] = df[numeric_col].apply(
    lambda column: pd.to_numeric(column, errors="coerce")
)

# Show the structure again to confirm the new dtypes.
df.info()

In [None]:
# Count the missing (NaN) values in each column.
df.isna().sum(axis=0)

In [None]:
# The "unit" column contains 5,882,208 NaN values, so it carries no usable
# information and is removed.
df = df.drop(columns="unit")

In [None]:
# Longitude and latitude are not needed for this analysis, so remove them.
df = df.drop(["longitude", "latitude"], axis="columns")

In [None]:
# Normalise the county names so they can be used as file names:
# 1. strip surrounding whitespace first, otherwise leading/trailing spaces
#    would be turned into underscores (e.g. " Taipei" -> "_Taipei");
# 2. replace the remaining inner spaces with "_".
# regex=False makes it explicit that " " is a literal, not a pattern.
df["county"] = df["county"].str.strip().str.replace(" ", "_", regex=False)

In [None]:
# Build the stripped county label once and use it both for the list of names
# and for selecting rows. Comparing stripped names against the unstripped
# column would match no rows for a county whose value carries surrounding
# whitespace, producing a CSV that contains only the header.
county_key = df["county"].str.strip()

# Get the county names. Rows without a county (NaN) are skipped: NaN never
# compares equal to anything, so they could only yield an empty "nan.csv".
county_list = list(county_key.dropna().unique())

# Create the output folder once instead of on every loop iteration.
output_dir = "../data/processed"
os.makedirs(output_dir, exist_ok=True)

# Store one CSV file per county.
for county in county_list:
    temp = df[county_key == county]
    path = f"{output_dir}/{county}.csv"
    temp.to_csv(path, index=False)
    # Progress message: "CSV file for <county> has been saved".
    print(f"{county}的CSV檔已儲存完畢")