In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the input and output paths used by
# this notebook live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import requests
import time
import os
from datetime import datetime

# === CONFIGURATION ===
# Placeholder credential: replace with a real OpenWeatherMap key before running
# and avoid committing the real value alongside the notebook.
API_KEY = 'this_is_where_the_api_key_was'
# HTTPS rather than HTTP: the key is sent as a query-string parameter and
# would otherwise cross the network in cleartext.
BASE_URL = 'https://api.openweathermap.org/data/2.5/air_pollution/history'
TIME_LAPSE = 3600  # 1-hour window in seconds

# === FILE PATHS ===
# Input: one request per row (timestamp_hour, latitude, longitude).
file_path = "/content/drive/MyDrive/requests/request135_139.csv"
# Output: results are appended row by row so an interrupted run can resume.
output_dir = "/content/drive/MyDrive/results/"
output_file = os.path.join(output_dir, "pollution_results_incremental135_139.csv")

# === LOAD AND CLEAN INPUT ===
# One pipeline: read the request CSV (blank strings and 'N/A' count as
# missing), discard rows lacking a timestamp or coordinate, parse the
# timestamps as timezone-aware UTC, and add 5-decimal rounded coordinates
# that serve as the matching key against already-written results.
data = (
    pd.read_csv(file_path, na_values=['', ' ', 'N/A'])
    .dropna(subset=['timestamp_hour', 'latitude', 'longitude'])
    .assign(
        timestamp_hour=lambda frame: pd.to_datetime(frame['timestamp_hour'], utc=True),
        lat_round=lambda frame: frame['latitude'].round(5),
        lon_round=lambda frame: frame['longitude'].round(5),
    )
)

# === PREPARE OUTPUT DIRECTORY AND FILE ===
os.makedirs(output_dir, exist_ok=True)

if not os.path.exists(output_file):
    # First run: create the results file with only its header row, so that
    # later appends can be written without a header.
    header_cols = ['latitude', 'longitude', 'start_time', 'end_time', 'AQI',
                   'pm2_5', 'pm10', 'no', 'no2', 'o3', 'so2', 'co']
    pd.DataFrame(columns=header_cols).to_csv(output_file, index=False)
    processed_keys = set()
else:
    # Resuming: rebuild the (rounded lat, rounded lon, start_time) keys of
    # every row already on disk so the main loop can skip them.
    existing = pd.read_csv(output_file)
    existing['lat_round'] = existing['latitude'].round(5)
    existing['lon_round'] = existing['longitude'].round(5)
    processed_keys = {
        (done_lat, done_lon, done_start)
        for done_lat, done_lon, done_start in zip(
            existing['lat_round'], existing['lon_round'], existing['start_time']
        )
    }

print(f"Cleaned data: {len(data)} rows to process")

# === REQUEST SETTINGS ===
REQUEST_TIMEOUT = 30  # seconds before a stalled HTTP call is abandoned
# Pause between calls, in seconds. 1.0 s is roughly 60 calls/min; lower it
# only if your OpenWeatherMap plan allows a higher rate (confirm your quota).
REQUEST_DELAY = 1.0
MAX_ATTEMPTS = 3  # tries per row while the API keeps answering HTTP 429


def _get_with_backoff(session, params):
    """Send one GET to BASE_URL, retrying while the API reports rate limiting.

    Args:
        session: ``requests.Session`` used to send the request.
        params: query-string parameters for the history endpoint.

    Returns:
        The last ``requests.Response`` received. It may still carry status
        429 if every attempt was rate limited.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        response = session.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
        if response.status_code != 429 or attempt == MAX_ATTEMPTS:
            break
        # Exponential backoff before the next attempt.
        time.sleep(REQUEST_DELAY * 2 ** attempt)
    return response


start_script_time = time.time()

# === LOOP THROUGH EACH ROW ===
# A single Session reuses the underlying connection across all requests.
with requests.Session() as session:
    for i, row in data.iterrows():
        lat = row['latitude']
        lon = row['longitude']
        ts = row['timestamp_hour']
        lat_r = row['lat_round']
        lon_r = row['lon_round']

        try:
            start_time = int(ts.timestamp())
            end_time = start_time + TIME_LAPSE
        except Exception as e:
            print(f"Timestamp error at row {i}: {ts} - {e}")
            continue

        key = (lat_r, lon_r, start_time)
        if key in processed_keys:
            continue  # Skip already processed row

        # === API Request ===
        # Passing params lets requests build and URL-encode the query string.
        params = {
            'lat': lat,
            'lon': lon,
            'start': start_time,
            'end': end_time,
            'appid': API_KEY,
        }

        # Deliberately broad: this is the per-row boundary of a long batch
        # job, so any network or parsing failure is recorded as 'Error' for
        # this row instead of aborting the whole run.
        try:
            response = _get_with_backoff(session, params)
            if response.status_code == 200:
                pollution_data = response.json()
                if 'list' in pollution_data and len(pollution_data['list']) > 0:
                    # Only the first entry of the returned window is kept.
                    data_point = pollution_data['list'][0]
                    aqi = data_point['main']['aqi']
                    comp = data_point['components']
                else:
                    aqi = 'N/A'
                    comp = {}
            else:
                print(f"API status {response.status_code} at row {i}")
                aqi = 'Error'
                comp = {}
        except Exception as e:
            print(f"API error at row {i}: {e}")
            aqi = 'Error'
            comp = {}

        # === Append Result ===
        result = {
            'latitude': lat,
            'longitude': lon,
            'start_time': start_time,
            'end_time': end_time,
            'AQI': aqi,
            'pm2_5': comp.get('pm2_5'),
            'pm10': comp.get('pm10'),
            'no': comp.get('no'),
            'no2': comp.get('no2'),
            'o3': comp.get('o3'),
            'so2': comp.get('so2'),
            'co': comp.get('co'),
        }

        # Written immediately (append, no header) so progress survives an
        # interrupted session.
        pd.DataFrame([result]).to_csv(output_file, mode='a', header=False, index=False)

        # Remember the key so a duplicate input row later in this same run
        # is not fetched and written a second time.
        processed_keys.add(key)

        time.sleep(REQUEST_DELAY)  # respect rate limits

# === TOTAL TIME ===
# Wall-clock duration since start_script_time was recorded, split into whole
# minutes and remaining seconds, then reported (previously computed but
# never shown).
total_time = time.time() - start_script_time
mins, secs = divmod(total_time, 60)
print(f"Total time: {int(mins)} min {secs:.1f} s")

Cleaned data: 47374 rows to process
