In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from splinter import Browser

In [2]:
# Send a GET request to the website
url = "https://aqicn.org/city/all/"

response = requests.get(url)

In [3]:
# Create a BeautifulSoup object to parse the HTML content
soup = BeautifulSoup(response.content, "html.parser")

In [4]:
# The "Major Cities" are under the "div" element inside the "main-cities" class
container = soup.find("div", class_="main-cities")

In [5]:
# The <a> element represents the cities. 
city_elements = container.find_all("a")

In [6]:
# Extracts the text from each city element, removes leading and trailing spaces using .strip()
cities = [city_element.text.strip() for city_element in city_elements]

In [7]:
cities_df = pd.DataFrame(cities, columns=["City Name"])
cities_df

Unnamed: 0,City Name
0,Shanghai (上海)
1,Beijing (北京)
2,Tianjin (天津)
3,Guangzhou (广州市)
4,Shenzhen (深圳)
...,...
528,Addis Ababa (አዲስ አበባ)
529,Nairobi
530,Kampala (كامبالا)
531,Algiers (الجزائر)


In [8]:
cities_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 533 entries, 0 to 532
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   City Name  533 non-null    object
dtypes: object(1)
memory usage: 4.3+ KB


In [9]:
#Removing parentheses form the "City Name" column
cities_df['City Name'] = cities_df['City Name'].str.replace(r'\s+\(.*\)', '', regex=True)
cities_df

Unnamed: 0,City Name
0,Shanghai
1,Beijing
2,Tianjin
3,Guangzhou
4,Shenzhen
...,...
528,Addis Ababa
529,Nairobi
530,Kampala
531,Algiers


In [10]:
cities_list = cities_df["City Name"].tolist()
print(cities_list)

['Shanghai', 'Beijing', 'Tianjin', 'Guangzhou', 'Shenzhen', 'Wuhan', 'Dongguan', 'Chongqing', 'Chengdu', 'Nanjing', 'Taipei', 'Kaohsiung', 'Taichung', 'tainan', 'banqiao', 'hsinchu', 'taoyuan-city', 'Keelung', 'Hong Kong', 'Macao', 'Hanyang', 'Busan', 'Incheon', 'Daejeon', 'Ulsan', 'Daegu', 'Gwangju', 'Suwon', 'Goyang', 'Seongnam', 'Edo', 'Yokohama', 'Osaka', 'Nagoya', 'Sapporo', 'Kobe', 'Kyoto', 'Fukuoka', 'Kawasaki', 'saitama', 'Moscow', 'Krasnoyarsk', 'Kaliningrad', 'Leningrad', 'Novo-Nikolaevsk', 'Nizhniy Novgorod', 'Chelyabinsk', 'Ufa', 'Dhaka', 'Kathmandu', 'Pokhara', 'Patan', 'biratnagar', 'Birgunj', 'dharan-bazar', 'Bharatpur', 'Bombay', 'Delhi', 'Bangalore', 'Calcutta', 'Chennai', 'Ahmedabad', 'Hyderabad', 'Pune', 'Kanpur', 'Bangkok', 'mueang-samut-prakan', 'Nonthaburi', 'chon-buri', 'Nakhon Ratchasima', 'Chiangmai', 'Hat Yai', 'Pak Kret', 'si-racha', 'Amphoe Phra Pradaeng', 'Lampang', 'surin', 'Vientiane', 'Rangoon', 'Kota Bharu', 'Kuala Lumpur', 'klang', 'kampung-baru-subang

In [11]:
import json
import time
from pymongo import MongoClient

In [15]:
client = MongoClient('mongodb://localhost:27017/')
db = client['air_quality_status']
collection = db['air_quality_status']

base_url_cities_1 = "/?token=cfec51e63370e90b7880950705e8add14a2602b6"
base_url_cities_0 = "http://api.airvisual.com/v2/city?city="

for x in cities_list:
    base_url_cities = base_url_cities_0 + x + base_url_cities_1
    response_cities = requests.get(base_url_cities).json()
    #print(json.dumps(response_cities, indent=4))
    #Insert the data into the collection
    if response_cities['status']=='ok':
        collection.insert_one(response_cities)
client.close()