In [12]:
# Import necessary libraries
import datetime
import json
import os
import time
from collections import Counter
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Web page from which the city names are scraped
url = "https://aqicn.org/city/all/"

# Send a GET request to the website.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on a 4xx/5xx reply instead of letting an error
# page be parsed as if it were the city list.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [3]:
# Parse the HTML content of the page with BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Find the container that holds the city names
container = soup.find("div", class_="main-cities")
if container is None:
    # soup.find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise ValueError(
        f'No <div class="main-cities"> element found in the page at {response.url}'
    )

# Find all <a> elements, which contain the city names
city_elements = container.find_all("a")

# Extract the city names from the <a> elements and strip leading/trailing whitespace
cities = [city_element.text.strip() for city_element in city_elements]

# Create a DataFrame to hold the city names
cities_df = pd.DataFrame(cities, columns=["City Name"])
cities_df

Unnamed: 0,City Name
0,Shanghai (上海)
1,Beijing (北京)
2,Tianjin (天津)
3,Guangzhou (广州市)
4,Shenzhen (深圳)
...,...
528,Addis Ababa (አዲስ አበባ)
529,Nairobi
530,Kampala (كامبالا)
531,Algiers (الجزائر)


In [4]:
# Remove the whitespace-plus-parenthesised suffix from each name,
# e.g. "Shanghai (上海)" becomes "Shanghai"
raw_city_names = cities_df['City Name']
cities_df['City Name'] = raw_city_names.str.replace(r'\s+\(.*\)', '', regex=True)
cities_df

Unnamed: 0,City Name
0,Shanghai
1,Beijing
2,Tianjin
3,Guangzhou
4,Shenzhen
...,...
528,Addis Ababa
529,Nairobi
530,Kampala
531,Algiers


In [5]:
# Pull the cleaned names out of the DataFrame column into a plain Python list
city_name_column = cities_df["City Name"]
cities_list = city_name_column.to_list()
print(cities_list)

['Shanghai', 'Beijing', 'Tianjin', 'Guangzhou', 'Shenzhen', 'Wuhan', 'Dongguan', 'Chongqing', 'Chengdu', 'Nanjing', 'Taipei', 'Kaohsiung', 'Taichung', 'tainan', 'banqiao', 'hsinchu', 'taoyuan-city', 'Keelung', 'Hong Kong', 'Macao', 'Hanyang', 'Busan', 'Incheon', 'Daejeon', 'Ulsan', 'Daegu', 'Gwangju', 'Suwon', 'Goyang', 'Seongnam', 'Edo', 'Yokohama', 'Osaka', 'Nagoya', 'Sapporo', 'Kobe', 'Kyoto', 'Fukuoka', 'Kawasaki', 'saitama', 'Moscow', 'Krasnoyarsk', 'Kaliningrad', 'Leningrad', 'Novo-Nikolaevsk', 'Nizhniy Novgorod', 'Chelyabinsk', 'Ufa', 'Dhaka', 'Kathmandu', 'Pokhara', 'Patan', 'biratnagar', 'Birgunj', 'dharan-bazar', 'Bharatpur', 'Bombay', 'Delhi', 'Bangalore', 'Calcutta', 'Chennai', 'Ahmedabad', 'Hyderabad', 'Pune', 'Kanpur', 'Bangkok', 'mueang-samut-prakan', 'Nonthaburi', 'chon-buri', 'Nakhon Ratchasima', 'Chiangmai', 'Hat Yai', 'Pak Kret', 'si-racha', 'Amphoe Phra Pradaeng', 'Lampang', 'surin', 'Vientiane', 'Rangoon', 'Kota Bharu', 'Kuala Lumpur', 'klang', 'kampung-baru-subang

In [11]:
# The API token is read from the WAQI_TOKEN environment variable so that the
# credential is not stored in (or shared with) the notebook.
waqi_token = os.environ.get("WAQI_TOKEN")
if not waqi_token:
    raise RuntimeError(
        "Set the WAQI_TOKEN environment variable to your WAQI API token before running this cell."
    )

# The beginning and end parts of the URL to access the API for each city.
# HTTPS is used so the token is not sent in clear text.
base_url_cities_0 = "https://api.waqi.info/feed/"
base_url_cities_1 = "/?token=" + quote(waqi_token, safe="")

# Define an empty dictionary to store the results
results = {}

# For each city in the list, send a GET request to the API and store the response in the dictionary
for city in cities_list:
    try:
        # Percent-encode the name so spaces or reserved characters ("/", "?", "#")
        # cannot change the structure of the URL.
        base_url_cities = base_url_cities_0 + quote(city, safe="") + base_url_cities_1
        response_cities = requests.get(base_url_cities, timeout=30)

        # Raise an exception if the request was unsuccessful
        response_cities.raise_for_status()

        # Store the JSON response in the dictionary
        results[city] = response_cities.json()
    except requests.exceptions.HTTPError as err:
        # Print an error message if a city's data could not be retrieved.
        # The exception text contains the request URL, so the token is masked.
        print(f"HTTP error occurred for {city}: {str(err).replace(waqi_token, '***')}")
    except (requests.exceptions.RequestException, ValueError) as err:
        # Connection failures, timeouts and bodies that are not valid JSON are
        # reported and skipped so one bad city does not discard the whole run.
        print(f"Request failed for {city}: {str(err).replace(waqi_token, '***')}")
    finally:
        # Wait for 1 second before sending the next request, to avoid overloading
        # the server; this applies after failed requests as well.
        time.sleep(1)

# Save the results to a JSON file
with open("aqi_data_indent1.json", "w") as json_file:
    json.dump(results, json_file, indent=1)

In [13]:
# Read the saved file back and decode it, which also confirms it is valid JSON
with open('aqi_data_indent1.json', 'r') as saved_file:
    data = json.loads(saved_file.read())
data

{'Shanghai': {'status': 'ok',
  'data': {'aqi': 80,
   'idx': 1437,
   'attributions': [{'url': 'https://china.usembassy-china.org.cn/embassy-consulates/shanghai/air-quality-monitor-stateair/',
     'name': 'U.S. Consulate Shanghai Air Quality Monitor'},
    {'url': 'https://sthj.sh.gov.cn/',
     'name': 'Shanghai Environment Monitoring Center(上海市环境监测中心)'},
    {'url': 'http://106.37.208.233:20035/emcpublish/',
     'name': 'China National Urban air quality real-time publishing platform (全国城市空气质量实时发布平台)'},
    {'url': 'https://waqi.info/', 'name': 'World Air Quality Index Project'}],
   'city': {'geo': [31.2047372, 121.4489017],
    'name': 'Shanghai (上海)',
    'url': 'https://aqicn.org/city/shanghai',
    'location': ''},
   'dominentpol': 'pm25',
   'iaqi': {'co': {'v': 6.8},
    'h': {'v': 94},
    'no2': {'v': 17},
    'o3': {'v': 20.4},
    'p': {'v': 1013},
    'pm10': {'v': 38},
    'pm25': {'v': 80},
    'so2': {'v': 3.6},
    't': {'v': 23},
    'w': {'v': 1.5}},
   'time': {

In [14]:
# Report how many city entries were loaded (the goal is at least 500 rows of data)
num_cities = len(data)
summary_line = f"There are {num_cities} cities in the JSON file."
print(summary_line)

There are 532 cities in the JSON file.


In [15]:
# Collect the distinct 'status' values across all cities,
# ignoring any entry that has no 'status' key
status_set = {entry['status'] for entry in data.values() if 'status' in entry}

status_set


{'error', 'nope', 'ok'}

In [16]:
# Tally how many cities were returned with each API status.
# Counter starts every key at zero, so no manual default is needed. Entries
# without a 'status' key are skipped so that a single malformed record cannot
# abort the tally with a KeyError.
status_counts = Counter(
    city_data['status'] for city_data in data.values() if 'status' in city_data
)

for status, count in status_counts.items():
    print(f"There are {count} cities with status '{status}'")


There are 497 cities with status 'ok'
There are 31 cities with status 'error'
There are 4 cities with status 'nope'


In [20]:
# Cut-off date: forecast entries dated after this day are dropped
check_date = datetime.datetime.strptime("2023-05-28", "%Y-%m-%d")

# Dictionary to hold the cleaned data
cleaned_data = {}

# Iterate through the cities in the data
for city_name, city_info in data.items():
    # Keep only successful responses; an entry with no 'status' key is skipped too
    if city_info.get('status') != 'ok':
        continue

    city_payload = city_info['data']

    # If there's no daily forecast, skip this city
    if 'forecast' not in city_payload or 'daily' not in city_payload['forecast']:
        continue

    # Build filtered copies of the per-pollutant lists instead of assigning back
    # into `data`, so the loaded raw data stays unchanged and this cell can be
    # re-run (for example with a different check_date) without reloading the file.
    trimmed_daily = {
        pollutant: [
            day for day in days
            if datetime.datetime.strptime(day['day'], "%Y-%m-%d") <= check_date
        ]
        for pollutant, days in city_payload['forecast']['daily'].items()
    }

    # Rebuild only the nested levels that change; overriding an existing key in
    # a dict literal keeps its position, so the key order in the saved JSON is
    # the same as in the input.
    cleaned_data[city_name] = {
        **city_info,
        'data': {
            **city_payload,
            'forecast': {**city_payload['forecast'], 'daily': trimmed_daily},
        },
    }

# Save the cleaned data to a new file
with open('aqi_data_ok_status.json', 'w') as f:
    json.dump(cleaned_data, f, indent=4)

In [28]:
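# Disabled alternative: keep every city whose status is 'ok', without trimming forecast dates.
# NOTE: if re-enabled, this would overwrite 'aqi_data_ok_status.json', which the cleaning cell above also writes.
#ok_status_data = {city: details for city, details in data.items() if details['status'] == 'ok'}

#with open('aqi_data_ok_status.json', 'w') as f:
    #json.dump(ok_status_data, f, indent=4)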