In [4]:
import pandas as pd
from thefuzz import process
import json
from deep_translator import GoogleTranslator

In [5]:
# Reference table of world cities: one row per distinct
# (country, city, latitude, longitude), with lower-cased, stripped names so
# they can serve as fuzzy-matching candidates and merge keys later on.
world_cities_df = (
    pd.read_csv("worldcities.csv")[["country", "city", "lat", "lng"]]
    .rename(columns={"lat": "latitude", "lng": "longitude"})
    # A row without a country or city cannot be used as a merge key, and a
    # missing value would make a plain `str.lower` call raise TypeError.
    .dropna(subset=["country", "city"])
    .assign(
        country=lambda frame: frame["country"].str.lower().str.strip(),
        city=lambda frame: frame["city"].str.lower().str.strip(),
    )
    .drop_duplicates()
)

In [12]:
# Air-quality measurements per city, reduced to the columns used downstream
# and renamed to snake_case.
global_air_pollution_df = (
    pd.read_csv("global-air-pollution-dataset.csv")[
        ["Country", "City", "AQI Value", "AQI Category", "Ozone AQI Value", "Ozone AQI Category"]
    ]
    .rename(
        columns={
            "Country": "country",
            "City": "city",
            "AQI Value": "aqi_value",
            "AQI Category": "aqi_category",
            "Ozone AQI Value": "ozone_aqi_value",
            "Ozone AQI Category": "ozone_aqi_category",
        }
    )
    # Drop rows missing either key. Stringifying a missing city would turn it
    # into the literal text "nan", which the fuzzy matcher would then happily
    # map onto some real city.
    .dropna(subset=["country", "city"])
    .assign(
        # astype(str) keeps the original coercion of non-string city values.
        city=lambda frame: frame["city"].astype(str).str.lower(),
        # The .str accessor leaves missing categories as NaN instead of raising.
        aqi_category=lambda frame: frame["aqi_category"].str.lower(),
        ozone_aqi_category=lambda frame: frame["ozone_aqi_category"].str.lower(),
    )
)

In [7]:
def normalize_city_name(name: str, feature: str, choices=None):
    """Return the closest fuzzy match for `name` in a `world_cities_df` column.

    Despite the name, this is used for both city and country names.

    Args:
        name: Raw name to normalise.
        feature: Column of `world_cities_df` whose values are the candidates
            (the notebook uses "city" and "country").
        choices: Optional pre-computed candidate values. When omitted, the
            unique values of `world_cities_df[feature]` are computed on every
            call, which is wasteful when normalising a whole column.

    Returns:
        The best-scoring candidate, or None when the matcher returns nothing.
    """
    if choices is None:
        choices = world_cities_df[feature].unique()
    # NOTE(review): no minimum score is enforced, so even a very poor match is
    # accepted as the normalised name -- consider adding a cutoff.
    extracted_names = process.extract(name, choices, limit=1)
    if len(extracted_names) == 0:
        return None
    return extracted_names[0][0]


def _normalize_column(names: pd.Series, feature: str) -> pd.Series:
    """Normalise every value of `names` against `world_cities_df[feature]`.

    The candidate list is built once and each distinct name is matched once,
    instead of rebuilding the candidates and re-running the fuzzy search for
    every row.
    """
    choices = world_cities_df[feature].unique()
    lookup = {
        raw_name: normalize_city_name(raw_name, feature, choices)
        for raw_name in names.unique()
    }
    return names.map(lookup)


new_cities = _normalize_column(global_air_pollution_df["city"], "city")
new_countries = _normalize_column(global_air_pollution_df["country"], "country")

In [8]:
# Swap the raw country/city names for their fuzzy-matched counterparts so the
# frame can be joined to the world-cities table on exact equality.
global_air_pollution_df = global_air_pollution_df.assign(
    country=new_countries,
    city=new_cities,
)

In [9]:
# Collapse the source categories into a coarser scale. The same mapping is
# applied to both category columns so they share one vocabulary (previously
# the ozone column would have kept a raw "hazardous" label).
AQI_CATEGORY_RELABELS = {
    "unhealthy for sensitive groups": "unhealthy",
    "very unhealthy": "unhealthy",
    "hazardous": "dangerous",
}

for category_column in ("aqi_category", "ozone_aqi_category"):
    # Labels that are not keys of the mapping (and missing values) are left as-is.
    global_air_pollution_df[category_column] = (
        global_air_pollution_df[category_column].replace(AQI_CATEGORY_RELABELS)
    )

In [10]:
# Keep only the measurements whose normalised (country, city) pair exists in
# the world-cities table, attaching that table's coordinates to each one.
formatted_data = world_cities_df.merge(
    global_air_pollution_df,
    how="inner",
    on=["country", "city"],
)

In [11]:
def _translate_column(values: pd.Series, translator) -> pd.Series:
    """Translate a column by translating each distinct value exactly once.

    Every `translate` call goes out to the translation service, so translating
    row by row repeats the same request for every duplicated name.
    """
    translations = {
        original: translator.translate(original) for original in values.unique()
    }
    return values.map(translations)


# One translator instance is reused for every request instead of building a
# new one per row.
french_translator = GoogleTranslator(source="auto", target="fr")

formatted_data["country"] = _translate_column(formatted_data["country"], french_translator)
formatted_data["city"] = _translate_column(formatted_data["city"], french_translator)

KeyboardInterrupt: 

In [351]:
# Columns are selected by name, in the order they are unpacked below, so the
# export does not depend on the positional layout of `formatted_data`.
# (The positional version read the overall AQI columns a second time for the
# two ozone fields.)
EXPORT_COLUMNS = [
    "country", "city", "longitude", "latitude",
    "aqi_value", "aqi_category",
    "ozone_aqi_value", "ozone_aqi_category",
]

data = [
    {
        "country": country, "city": city,
        # Longitude first, then latitude -- same order as the original export
        # (presumably what the consumer of the JSON expects; confirm).
        "coordinates": [longitude, latitude],
        "aqi_value": aqi_value, "aqi_category": aqi_category,
        "ozone_aqi_value": ozone_aqi_value, "ozone_aqi_category": ozone_aqi_category,
    }
    for (
        country, city, longitude, latitude,
        aqi_value, aqi_category,
        ozone_aqi_value, ozone_aqi_category,
    ) in formatted_data[EXPORT_COLUMNS].itertuples(index=False, name=None)
]

In [352]:
# Persist the records as a single JSON document, streaming the serialisation
# straight into the file handle.
with open("formatted_data.json", "w") as output_file:
    json.dump(data, output_file)