In [1]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from urllib.parse import quote, unquote
import json
from datetime import datetime
import pandas as pd
from tqdm.notebook import tqdm
import ipywidgets as widgets
from IPython.display import display
from multiprocessing import Process, Pool
from threading import Thread

In [2]:
# Browser configuration shared by every Firefox session this notebook starts.
firefox_options = webdriver.FirefoxOptions()

# Machine-specific folder that the download preferences below point at.
download_path = r"C:\Users\abhid\Downloads\Project\sites\2022-23\41-45"

# Preferences are applied in the order listed here.
for pref_name, pref_value in (
    ("browser.download.alwaysOpenPanel", False),
    ("browser.download.folderList", 2),
    ("browser.download.dir", download_path),
):
    firefox_options.set_preference(pref_name, pref_value)
# firefox_options.add_argument('-private')

In [3]:
# One row per measured quantity: (display name, numeric suffix of its
# "parameter_<n>" code). The three parameter lists in the payload below are all
# derived from this table, so they cannot drift apart.
_parameter_codes = [
    ('PM2.5', 193),
    ('PM10', 215),
    ('NO', 226),
    ('NO2', 194),
    ('NOx', 225),
    ('NH3', 311),
    ('SO2', 312),
    ('CO', 203),
    ('Ozone', 222),
    ('Benzene', 202),
    ('Toluene', 232),
    ('Eth-Benzene', 216),
    ('MP-Xylene', 240),
    ('Temp', 198),
    ('RH', 235),
    ('WS', 233),
    ('WD', 234),
    ('SR', 237),
    ('BP', 238),
    ('VWS', 239),
    ('AT', 204),
    ('TOT-RF', 37),
    ('RF', 236),
    ('Xylene', 223),
]

# Template for the report request. get_input_json fills in state, city,
# station, fromDate and toDate for each request; the values given here are
# only placeholders.
parameters = {
    'parameter_list': [
        {'id': position, 'itemName': name, 'itemValue': f'parameter_{code}'}
        for position, (name, code) in enumerate(_parameter_codes)
    ],
    'criteria': '1 Hours',
    'reportFormat': 'Tabular',
    'fromDate': '09-04-2022 T00:00:00Z',
    'toDate': '10-04-2023 T18:28:59Z',
    'state': 'Andhra Pradesh',
    'city': 'Amaravati',
    'station': 'site_1406',
    'parameter': [f'parameter_{code}' for _, code in _parameter_codes],
    'parameterNames': [name for name, _ in _parameter_codes],
}

In [4]:
def format_date(date):
    """Render ``date`` as ``DD-MM-YYYY THH:MM:SSZ``.

    The space before ``T`` and the trailing ``Z`` are literal characters of
    the pattern; no timezone conversion is performed.

    Args:
        date: Any object exposing ``strftime`` (e.g. ``datetime``).

    Returns:
        The formatted string.
    """
    pattern = '%d-%m-%Y T%H:%M:%SZ'
    formatted = date.strftime(pattern)
    return formatted

In [5]:
def get_input_json(state, city, station_id, from_date, to_date):
    """Build the escaped JSON request for one station and date range.

    The module-level ``parameters`` dict is used as a read-only template: a
    per-call copy receives the station and date fields. Writing into the
    shared dict instead would let concurrent callers (this notebook runs
    several worker threads) serialize each other's state/city/station.

    Args:
        state: Value for the ``state`` field.
        city: Value for the ``city`` field.
        station_id: Value for the ``station`` field.
        from_date: Start of the range; formatted with ``format_date``.
        to_date: End of the range; formatted with ``format_date``.

    Returns:
        The request serialized with ``json.dumps``, with every double quote
        replaced by a backslash followed by a double quote.
    """
    # Every overridden key already exists in the template, so the unpacking
    # keeps the template's key order in the serialized output.
    request = {
        **parameters,
        'state': state,
        'city': city,
        'station': station_id,
        'fromDate': format_date(from_date),
        'toDate': format_date(to_date),
    }
    return json.dumps(request).replace('"', '\\"')

In [6]:
# Date range requested for every station.
# Previously used range, kept for reference:
#   from_date = datetime(2020, 7, 1, 0, 0, 0)
#   to_date = datetime(2022, 3, 31, 23, 59, 59)
from_date = datetime(year=2022, month=4, day=1)
to_date = datetime(year=2023, month=3, day=31, hour=23, minute=59, second=59)

In [7]:
# Prefix of the report page URL; the encoded request is appended directly
# after the trailing slash.
report_url = (
    'https://airquality.cpcb.gov.in/ccr/#/caaqm-dashboard-all/'
    'caaqm-view-data-report/'
)

In [8]:
def encode(json_str):
    """Wrap ``json_str`` in double quotes and percent-encode it twice.

    Args:
        json_str: Text to encode.

    Returns:
        The quoted text after two passes of ``urllib.parse.quote``.
    """
    wrapped = ''.join(('"', json_str, '"'))
    encoded_once = quote(wrapped)
    # Second pass: the '%' characters produced above become '%25'.
    return quote(encoded_once)

In [9]:
def decode(json_str):
    """Undo two rounds of percent-encoding and drop the outer characters.

    Args:
        json_str: Twice percent-encoded text.

    Returns:
        The text after two passes of ``urllib.parse.unquote`` with its first
        and last characters removed.
    """
    decoded_once = unquote(json_str)
    decoded_twice = unquote(decoded_once)
    # Strip the first and last character (the wrapping quotes added by encode).
    return decoded_twice[1:-1]

In [10]:
def save_stations(file_name, out):
    """Trigger the report export for every station listed in a CSV file.

    Starts one Firefox session, opens the landing page and waits for its
    success alert, then for each CSV row loads the report URL built from that
    row and clicks the export link, retrying a failed attempt up to five
    times. Stations that still fail are reported on ``out`` together with the
    last error seen.

    Args:
        file_name: Path of a CSV readable by ``pd.read_csv`` that has
            ``state``, ``city`` and ``station_id`` columns.
        out: Output widget that receives the progress bar, one log line per
            station and any error messages.
    """
    max_attempts = 5
    stations = pd.read_csv(file_name)
    driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()), options=firefox_options)
    driver.get('https://airquality.cpcb.gov.in/ccr/#/caaqm-dashboard-all/caaqm-landing')
    success_alert_selector = 'div.text-center:nth-child(4).alert-success'
    # NOTE(review): the long timeout suggests a manual step on the landing
    # page must be completed before this alert appears - confirm.
    WebDriverWait(driver, timeout=1000).until(lambda d: d.find_element(By.CSS_SELECTOR, success_alert_selector))
    pbar = tqdm(stations.iterrows(), total=stations.shape[0], display=False, desc=file_name)
    out.append_display_data(pbar.container)

    for index, row in pbar:
        out.append_stdout(f"{index}, {row['state']}, {row['city']}, {row['station_id']}\n")
        params = encode(get_input_json(row['state'], row['city'], row['station_id'], from_date, to_date))
        final_url = report_url + params
        driver.get(final_url)

        last_error = None
        for _attempt in range(max_attempts):
            try:
                # NOTE(review): presumably needed because consecutive report
                # URLs differ only in the fragment - confirm.
                driver.refresh()
                excel_button = WebDriverWait(driver, timeout=1000).until(lambda d: d.find_element(By.CSS_SELECTOR, 'a.btn:nth-child(2)'))
                excel_button.click()
                processing_pop = WebDriverWait(driver, timeout=60).until(lambda d: d.find_element(By.CSS_SELECTOR, '.processDiv'))
                WebDriverWait(driver, timeout=4*60).until(EC.invisibility_of_element(processing_pop))
                break
            except Exception as exc:
                # Only ordinary errors are retried; KeyboardInterrupt and
                # SystemExit propagate instead of being swallowed.
                last_error = exc
        else:
            # Every attempt failed: report the station and the last error,
            # newline-terminated so successive messages stay separate.
            out.append_stderr(f'error getting {row["station_id"]}: {last_error!r}\n')
    # NOTE(review): the browser is left open, as in the original (which had a
    # commented-out sleep + driver.quit()); quitting immediately could cut off
    # the last download - confirm and close explicitly once that is settled.

In [11]:
# Launch one worker thread per station list (stations_41.csv .. stations_45.csv).
# Each worker gets its own Output widget, displayed before the thread starts,
# and runs save_stations on its file.
# NOTE: despite its name, `processes` holds Thread objects, not Process objects.
processes = []
for i in range(41, 46):
    file_name = f'stations/stations_{i}.csv'
    out = widgets.Output()
    display(out)
    p = Thread(target=save_stations, args=(file_name, out))
    p.start()
    processes.append(p)

# Block this cell until every worker thread has finished.
for p in processes:
    p.join()

Output()

Output()

Output()

Output()

Output()