In [2]:
from bs4 import BeautifulSoup
import requests
import re
from datetime import datetime
import pandas as pd

#url to scrape for one location and 6 months
url='https://airquality.ie/readings?station=EPA-25&dateFrom=01+Jan+2023&dateTo=30+Jun+2023'
#create content to scrape
r = requests.get(url)
#get webpage
soup = BeautifulSoup(r.content, 'html.parser')
#print response
print(soup)


<!DOCTYPE html>

<html lang="en">
<head>
<title>Readings | AirQuality.ie</title>
<meta content="Charts from Ireland's EPA showing average particulate and gas readings from monitoring stations taken at regular intervals throughout the day. Select a monitoring station and date range to view data." name="description"/>
<!-- Global site tag (gtag.js) - Google Analytics -->
<script async="" src="https://www.googletagmanager.com/gtag/js?id=UA-28794554-3"></script>
<script>
    window.dataLayer = window.dataLayer || [];
    function gtag(){dataLayer.push(arguments);}
    gtag('js', new Date());
    gtag('config', "UA-28794554-3");
  </script>
<!-- Required Meta Tags -->
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>
<meta content="ie=edge" http-equiv="x-ua-compatible"/>
<meta content="air quality ireland, air pollution, air quality index, air qualities, air quality monitoring, air pollution map, air quality map, air quality sen

In [3]:
#look at the javascript that contains the timestamps and values
get_data = soup.find('script', string=re.compile('Date.UTC')).string

#use reqex to match the date and values
pattern = r'Date\.UTC\((\d+),(\d+),(\d+),(\d+),(\d+),(\d+)\),\s*([\d.]+)'

#get all matches
matches = re.findall(pattern, get_data)

#convert to datetimes and pm values
data = []
for match in matches:
    year, month, day, hour, minute, second, value = map(float, match)
    
    #looks like month is set from 0 so Jan = 0 month.. add a 1 to make it right
    timestamp = datetime(int(year), int(month)+1, int(day), int(hour), int(minute), int(second))
    #append each row
    data.append((timestamp, float(value)))

#create dataframe
df = pd.DataFrame(data, columns=['Timestamp', 'Value'])

#print data
print(df)

                Timestamp  Value
0     2023-01-01 01:00:00  10.80
1     2023-01-01 02:00:00   7.89
2     2023-01-01 03:00:00   5.84
3     2023-01-01 04:00:00   5.12
4     2023-01-01 05:00:00   4.58
...                   ...    ...
12856 2023-06-30 19:00:00   3.67
12857 2023-06-30 20:00:00   4.16
12858 2023-06-30 21:00:00   4.65
12859 2023-06-30 22:00:00   6.02
12860 2023-06-30 23:00:00   6.11

[12861 rows x 2 columns]


In [39]:
#put it together with more variables and date ranges so it can pull all locations, all months for all specified years.
#starting date point
dateFrom = '01+Jan+'
#dateFrom = '01+Jul+'
#end date point
dateTo = '30+Jun+'
#dateTo = '31+Dec+'
#years to capture
year = {2023}

dataset = pd.DataFrame(columns=["Timestamp", "Value", "PM", "Location"])
#all_dataset = pd.DataFrame(columns=["Timestamp", "Value", "PM", "Location"])

for y in year:
    for i in range(10,110):#range should go to 110
        
        #break the url up into the string and variables
        url='https://airquality.ie/readings?station=EPA-'+str(i)+'&dateFrom='+dateFrom+str(y)+'&dateTo='+dateTo+str(y)
        #print it to keep track of which location and date is being processes as the script is running
        print(url)
        #create content to scrape
        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        #print(soup)
        #if there is no data the m-0 class is called with a no data message..
        no_data = soup.select(".m-0")
        if no_data:
            #print to track what locations exist/don't exist
            print(no_data)
        else:
            # find the date and values within the data tag..
            get_data = soup.find_all('script')#, string=re.compile('Date.UTC')).string
            #print("get_data")
            #print(get_data)
         #   print("get_data.string")
        #    print(get_data.string)
            #look for the same message to get the location of the air quality monitor, begins with Air Quality Levels at then has a comma and ends with a single quote.
            location = re.search(r"Air Quality Levels at.*?,*?'", str(get_data))

            #if location exists assign it.. otherwise mark it as unknown
            if location:
                loc = location.group(0)
                print(loc)
            else:
                loc = "Unknown Location"

            #look for PM2.5 data
            pm2_5 = re.search(r'"PM2.5",', str(get_data))
            pm2_5_values = []

            #if it exists pull the full block from pm2.5 to //endforeach to get all timestamp and pm values data
            if pm2_5:
                pattern = r'"PM2.5".*?//endforeach'
                
                #search for the entire block from 'pm2_5' to '//endforeach'
                pm2_5_data_block = re.search(pattern, str(get_data), re.DOTALL)
              #  print(pm2_5_data_block.group(0))
                if pm2_5_data_block:
                    #print(pm2_5_data_block)
                    #asked chatgpt to 'give me the regex to parse this date and value, [ Date.UTC(2023,0,9,4,0,0), 2.86]'
                    #match the timestamp and value portion only
                    pattern = r'Date\.UTC\((\d+),(\d+),(\d+),(\d+),(\d+),(\d+)\),\s*([\d.]+)'
      
                   #get all matches in the pm2.5 block only
                    pm2_5_values = re.findall(pattern, pm2_5_data_block.group(0))
                else:
                    print("No match found")

            #print(pm2_5_values)
            
            #do the same again for pm10
            pm10 = re.search(r'"PM10",', str(get_data))
            pm10_values = []
            if pm10:
                pattern = r'"PM10".*?//endforeach'
                
                #search for the entire block from 'pm10' to '//endforeach'
                pm10_data_block = re.search(pattern,str(get_data), re.DOTALL)

                #get all matches in the pm10 block only
                if pm10_data_block:
                    pattern = r'Date\.UTC\((\d+),(\d+),(\d+),(\d+),(\d+),(\d+)\),\s*([\d.]+)'
                    pm10_values = re.findall(pattern, pm10_data_block.group(0))
                else:
                    print("No match found")

            #print(pm10_values)
            pm_val = float
        
            #assign the values to the name of the pm type for easier adding to the dataset
            records = {'pm_2.5':pm2_5_values,'pm_10':pm10_values}
            # print(records)
            mydata = []
            if records:
                #get the key value pairs
                for key, vals in records.items():
                    #define the pm_val based on the key
                    if key == 'pm_2.5':
                        pm_val = 2.5
                    elif key == 'pm_10':
                        pm_val = 10
                    else:
                        pm_val = 0  #0 can be used for unknown so it doesn't break the parser

                    #parse out the values into the timestamps and pm vals
                    for val in vals:
                     #   print(val)
                        year, month, day, hour, minute, second, value = map(float, val)
                        #add 1 to the month as it looks like January = month 0 (days not affected)
                        timestamp = datetime(int(year), int(month)+1, int(day), int(hour), int(minute), int(second))
                    
                        #put it together in a data frame
                        mydata.append({"Timestamp": timestamp, "Value": float(value), "PM":pm_val, "Location": loc})

        newdata = pd.DataFrame(mydata)      
        dataset = pd.concat([newdata,dataset])
#save to a csv file so it's available to read in (but don't open and save as it will have millions of rows and they will be lost by Excel!
dataset.to_csv('air_quality_data_Jan_Jun_2023.csv', index=False)       

https://airquality.ie/readings?station=EPA-10&dateFrom=01+Jan+2023&dateTo=30+Jun+2023
Air Quality Levels at Heatherton Park, Cork'
https://airquality.ie/readings?station=EPA-11&dateFrom=01+Jan+2023&dateTo=30+Jun+2023


  dataset = pd.concat([newdata,dataset])


Air Quality Levels at Winetavern Street, Dublin 8'
https://airquality.ie/readings?station=EPA-12&dateFrom=01+Jan+2023&dateTo=30+Jun+2023
[<p class="m-0">
                    No data found for this station and date range.
                                    </p>]
https://airquality.ie/readings?station=EPA-13&dateFrom=01+Jan+2023&dateTo=30+Jun+2023
Air Quality Levels at Bray, Co. Wicklow'
https://airquality.ie/readings?station=EPA-14&dateFrom=01+Jan+2023&dateTo=30+Jun+2023
[<p class="m-0">
                    No data found for this station and date range.
                                    </p>]
https://airquality.ie/readings?station=EPA-15&dateFrom=01+Jan+2023&dateTo=30+Jun+2023
[<p class="m-0">
                    No data found for this station and date range.
                                    </p>]
https://airquality.ie/readings?station=EPA-16&dateFrom=01+Jan+2023&dateTo=30+Jun+2023
Air Quality Levels at Portlaoise, Co. Laois'
https://airquality.ie/readings?station=EPA-17&dateFrom=

In [None]:
#2021 ... 17,21-29,33,34,36,39,43-54,56-64,66-72,74-85,101-103,  done 10,11,13,16 still to be done
#2022 ... 17,21-29,33,34,36,39,43-54,56-64,66-72,74-76,78-80,82-85,101-109
#2023 ... 17,21-29,33,34,36,39,43-64,66-67 stuck on 67..

In [None]:
#url to scrape
url='https://airquality.ie/readings?station=EPA-16&dateFrom=01+Jan+2023&dateTo=30+Jun+2023'
#create content to scrape
r = requests.get(url)
#r = requests.get(url,headers={'user-agent':'some agent'})
#html_doc= r.text
#soup = BeautifulSoup(html_doc)
soup = BeautifulSoup(r.content, 'html.parser')

print(soup)

In [21]:
dataset.head(10)

Unnamed: 0,Timestamp,Value,PM,Location
0,2023-07-01 01:00:00,17.02,2.5,"Air Quality Levels at Ennis, Co. Clare'"
1,2023-07-01 02:00:00,13.7,2.5,"Air Quality Levels at Ennis, Co. Clare'"
2,2023-07-01 03:00:00,11.32,2.5,"Air Quality Levels at Ennis, Co. Clare'"
3,2023-07-01 04:00:00,15.25,2.5,"Air Quality Levels at Ennis, Co. Clare'"
4,2023-07-01 05:00:00,17.57,2.5,"Air Quality Levels at Ennis, Co. Clare'"
5,2023-07-01 06:00:00,22.84,2.5,"Air Quality Levels at Ennis, Co. Clare'"
6,2023-07-01 07:00:00,30.43,2.5,"Air Quality Levels at Ennis, Co. Clare'"
7,2023-07-01 08:00:00,25.79,2.5,"Air Quality Levels at Ennis, Co. Clare'"
8,2023-07-01 09:00:00,22.03,2.5,"Air Quality Levels at Ennis, Co. Clare'"
9,2023-07-01 10:00:00,21.5,2.5,"Air Quality Levels at Ennis, Co. Clare'"


In [36]:
newdata

Unnamed: 0,Timestamp,Value,PM,Location
0,2023-01-01 01:00:00,98.0,10,"Air Quality Levels at Castlebar, Co. Mayo'"
1,2023-01-01 02:00:00,77.0,10,"Air Quality Levels at Castlebar, Co. Mayo'"
2,2023-01-01 03:00:00,43.0,10,"Air Quality Levels at Castlebar, Co. Mayo'"
3,2023-01-01 04:00:00,29.0,10,"Air Quality Levels at Castlebar, Co. Mayo'"
4,2023-01-01 05:00:00,38.0,10,"Air Quality Levels at Castlebar, Co. Mayo'"
...,...,...,...,...
4118,2023-06-30 19:00:00,6.0,10,"Air Quality Levels at Castlebar, Co. Mayo'"
4119,2023-06-30 20:00:00,11.0,10,"Air Quality Levels at Castlebar, Co. Mayo'"
4120,2023-06-30 21:00:00,6.0,10,"Air Quality Levels at Castlebar, Co. Mayo'"
4121,2023-06-30 22:00:00,5.0,10,"Air Quality Levels at Castlebar, Co. Mayo'"


In [37]:
mydata

[{'Timestamp': datetime.datetime(2023, 1, 1, 1, 0),
  'Value': 98.0,
  'PM': 10,
  'Location': "Air Quality Levels at Castlebar, Co. Mayo'"},
 {'Timestamp': datetime.datetime(2023, 1, 1, 2, 0),
  'Value': 77.0,
  'PM': 10,
  'Location': "Air Quality Levels at Castlebar, Co. Mayo'"},
 {'Timestamp': datetime.datetime(2023, 1, 1, 3, 0),
  'Value': 43.0,
  'PM': 10,
  'Location': "Air Quality Levels at Castlebar, Co. Mayo'"},
 {'Timestamp': datetime.datetime(2023, 1, 1, 4, 0),
  'Value': 29.0,
  'PM': 10,
  'Location': "Air Quality Levels at Castlebar, Co. Mayo'"},
 {'Timestamp': datetime.datetime(2023, 1, 1, 5, 0),
  'Value': 38.0,
  'PM': 10,
  'Location': "Air Quality Levels at Castlebar, Co. Mayo'"},
 {'Timestamp': datetime.datetime(2023, 1, 1, 6, 0),
  'Value': 41.0,
  'PM': 10,
  'Location': "Air Quality Levels at Castlebar, Co. Mayo'"},
 {'Timestamp': datetime.datetime(2023, 1, 1, 7, 0),
  'Value': 44.0,
  'PM': 10,
  'Location': "Air Quality Levels at Castlebar, Co. Mayo'"},
 {'Tim

In [38]:
dataset

Unnamed: 0,Timestamp,Value,PM,Location
0,2023-01-01 01:00:00,98.00,10.0,"Air Quality Levels at Castlebar, Co. Mayo'"
1,2023-01-01 02:00:00,77.00,10.0,"Air Quality Levels at Castlebar, Co. Mayo'"
2,2023-01-01 03:00:00,43.00,10.0,"Air Quality Levels at Castlebar, Co. Mayo'"
3,2023-01-01 04:00:00,29.00,10.0,"Air Quality Levels at Castlebar, Co. Mayo'"
4,2023-01-01 05:00:00,38.00,10.0,"Air Quality Levels at Castlebar, Co. Mayo'"
...,...,...,...,...
8599,2023-06-30 19:00:00,7.55,10.0,"Air Quality Levels at Ennis, Co. Clare'"
8600,2023-06-30 20:00:00,8.13,10.0,"Air Quality Levels at Ennis, Co. Clare'"
8601,2023-06-30 21:00:00,8.36,10.0,"Air Quality Levels at Ennis, Co. Clare'"
8602,2023-06-30 22:00:00,9.38,10.0,"Air Quality Levels at Ennis, Co. Clare'"


In [40]:
#put it together with more variables and date ranges so it can pull all locations, all months for all specified years.
#starting date point
#dateFrom = '01+Jan+'
dateFrom = '01+Jul+'
#end date point
#dateTo = '30+Jun+'
dateTo = '31+Dec+'
#years to capture
year = {2023}

dataset = pd.DataFrame(columns=["Timestamp", "Value", "PM", "Location"])
#all_dataset = pd.DataFrame(columns=["Timestamp", "Value", "PM", "Location"])

for y in year:
    for i in range(10,110):#range should go to 110
        
        #break the url up into the string and variables
        url='https://airquality.ie/readings?station=EPA-'+str(i)+'&dateFrom='+dateFrom+str(y)+'&dateTo='+dateTo+str(y)
        #print it to keep track of which location and date is being processes as the script is running
        print(url)
        #create content to scrape
        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        #print(soup)
        #if there is no data the m-0 class is called with a no data message..
        no_data = soup.select(".m-0")
        if no_data:
            #print to track what locations exist/don't exist
            print(no_data)
        else:
            # find the date and values within the data tag..
            get_data = soup.find_all('script')#, string=re.compile('Date.UTC')).string
            #print("get_data")
            #print(get_data)
         #   print("get_data.string")
        #    print(get_data.string)
            #look for the same message to get the location of the air quality monitor, begins with Air Quality Levels at then has a comma and ends with a single quote.
            location = re.search(r"Air Quality Levels at.*?,*?'", str(get_data))

            #if location exists assign it.. otherwise mark it as unknown
            if location:
                loc = location.group(0)
                print(loc)
            else:
                loc = "Unknown Location"

            #look for PM2.5 data
            pm2_5 = re.search(r'"PM2.5",', str(get_data))
            pm2_5_values = []

            #if it exists pull the full block from pm2.5 to //endforeach to get all timestamp and pm values data
            if pm2_5:
                pattern = r'"PM2.5".*?//endforeach'
                
                #search for the entire block from 'pm2_5' to '//endforeach'
                pm2_5_data_block = re.search(pattern, str(get_data), re.DOTALL)
              #  print(pm2_5_data_block.group(0))
                if pm2_5_data_block:
                    #print(pm2_5_data_block)
                    #asked chatgpt to 'give me the regex to parse this date and value, [ Date.UTC(2023,0,9,4,0,0), 2.86]'
                    #match the timestamp and value portion only
                    pattern = r'Date\.UTC\((\d+),(\d+),(\d+),(\d+),(\d+),(\d+)\),\s*([\d.]+)'
      
                   #get all matches in the pm2.5 block only
                    pm2_5_values = re.findall(pattern, pm2_5_data_block.group(0))
                else:
                    print("No match found")

            #print(pm2_5_values)
            
            #do the same again for pm10
            pm10 = re.search(r'"PM10",', str(get_data))
            pm10_values = []
            if pm10:
                pattern = r'"PM10".*?//endforeach'
                
                #search for the entire block from 'pm10' to '//endforeach'
                pm10_data_block = re.search(pattern,str(get_data), re.DOTALL)

                #get all matches in the pm10 block only
                if pm10_data_block:
                    pattern = r'Date\.UTC\((\d+),(\d+),(\d+),(\d+),(\d+),(\d+)\),\s*([\d.]+)'
                    pm10_values = re.findall(pattern, pm10_data_block.group(0))
                else:
                    print("No match found")

            #print(pm10_values)
            pm_val = float
        
            #assign the values to the name of the pm type for easier adding to the dataset
            records = {'pm_2.5':pm2_5_values,'pm_10':pm10_values}
            # print(records)
            mydata = []
            if records:
                #get the key value pairs
                for key, vals in records.items():
                    #define the pm_val based on the key
                    if key == 'pm_2.5':
                        pm_val = 2.5
                    elif key == 'pm_10':
                        pm_val = 10
                    else:
                        pm_val = 0  #0 can be used for unknown so it doesn't break the parser

                    #parse out the values into the timestamps and pm vals
                    for val in vals:
                     #   print(val)
                        year, month, day, hour, minute, second, value = map(float, val)
                        #add 1 to the month as it looks like January = month 0 (days not affected)
                        timestamp = datetime(int(year), int(month)+1, int(day), int(hour), int(minute), int(second))
                    
                        #put it together in a data frame
                        mydata.append({"Timestamp": timestamp, "Value": float(value), "PM":pm_val, "Location": loc})

        newdata = pd.DataFrame(mydata)      
        dataset = pd.concat([newdata,dataset])
#save to a csv file so it's available to read in (but don't open and save as it will have millions of rows and they will be lost by Excel!
dataset.to_csv('air_quality_data_Jul_Dec_2023.csv', index=False)          

https://airquality.ie/readings?station=EPA-10&dateFrom=01+Jul+2023&dateTo=31+Dec+2023
Air Quality Levels at Heatherton Park, Cork'
https://airquality.ie/readings?station=EPA-11&dateFrom=01+Jul+2023&dateTo=31+Dec+2023


  dataset = pd.concat([newdata,dataset])


Air Quality Levels at Winetavern Street, Dublin 8'
https://airquality.ie/readings?station=EPA-12&dateFrom=01+Jul+2023&dateTo=31+Dec+2023
[<p class="m-0">
                    No data found for this station and date range.
                                    </p>]
https://airquality.ie/readings?station=EPA-13&dateFrom=01+Jul+2023&dateTo=31+Dec+2023
Air Quality Levels at Bray, Co. Wicklow'
https://airquality.ie/readings?station=EPA-14&dateFrom=01+Jul+2023&dateTo=31+Dec+2023
[<p class="m-0">
                    No data found for this station and date range.
                                    </p>]
https://airquality.ie/readings?station=EPA-15&dateFrom=01+Jul+2023&dateTo=31+Dec+2023
[<p class="m-0">
                    No data found for this station and date range.
                                    </p>]
https://airquality.ie/readings?station=EPA-16&dateFrom=01+Jul+2023&dateTo=31+Dec+2023
Air Quality Levels at Portlaoise, Co. Laois'
https://airquality.ie/readings?station=EPA-17&dateFrom=

In [41]:
#put it together with more variables and date ranges so it can pull all locations, all months for all specified years.
#starting date point
dateFrom = '01+Jan+'
#dateFrom = '01+Jul+'
#end date point
#dateTo = '30+Jun+'
dateTo = '31+Dec+'
#years to capture
year = {2022}

dataset = pd.DataFrame(columns=["Timestamp", "Value", "PM", "Location"])
#all_dataset = pd.DataFrame(columns=["Timestamp", "Value", "PM", "Location"])

for y in year:
    for i in range(10,110):#range should go to 110
        
        #break the url up into the string and variables
        url='https://airquality.ie/readings?station=EPA-'+str(i)+'&dateFrom='+dateFrom+str(y)+'&dateTo='+dateTo+str(y)
        #print it to keep track of which location and date is being processes as the script is running
        print(url)
        #create content to scrape
        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        #print(soup)
        #if there is no data the m-0 class is called with a no data message..
        no_data = soup.select(".m-0")
        if no_data:
            #print to track what locations exist/don't exist
            print(no_data)
        else:
            # find the date and values within the data tag..
            get_data = soup.find_all('script')#, string=re.compile('Date.UTC')).string
            #print("get_data")
            #print(get_data)
         #   print("get_data.string")
        #    print(get_data.string)
            #look for the same message to get the location of the air quality monitor, begins with Air Quality Levels at then has a comma and ends with a single quote.
            location = re.search(r"Air Quality Levels at.*?,*?'", str(get_data))

            #if location exists assign it.. otherwise mark it as unknown
            if location:
                loc = location.group(0)
                print(loc)
            else:
                loc = "Unknown Location"

            #look for PM2.5 data
            pm2_5 = re.search(r'"PM2.5",', str(get_data))
            pm2_5_values = []

            #if it exists pull the full block from pm2.5 to //endforeach to get all timestamp and pm values data
            if pm2_5:
                pattern = r'"PM2.5".*?//endforeach'
                
                #search for the entire block from 'pm2_5' to '//endforeach'
                pm2_5_data_block = re.search(pattern, str(get_data), re.DOTALL)
              #  print(pm2_5_data_block.group(0))
                if pm2_5_data_block:
                    #print(pm2_5_data_block)
                    #asked chatgpt to 'give me the regex to parse this date and value, [ Date.UTC(2023,0,9,4,0,0), 2.86]'
                    #match the timestamp and value portion only
                    pattern = r'Date\.UTC\((\d+),(\d+),(\d+),(\d+),(\d+),(\d+)\),\s*([\d.]+)'
      
                   #get all matches in the pm2.5 block only
                    pm2_5_values = re.findall(pattern, pm2_5_data_block.group(0))
                else:
                    print("No match found")

            #print(pm2_5_values)
            
            #do the same again for pm10
            pm10 = re.search(r'"PM10",', str(get_data))
            pm10_values = []
            if pm10:
                pattern = r'"PM10".*?//endforeach'
                
                #search for the entire block from 'pm10' to '//endforeach'
                pm10_data_block = re.search(pattern,str(get_data), re.DOTALL)

                #get all matches in the pm10 block only
                if pm10_data_block:
                    pattern = r'Date\.UTC\((\d+),(\d+),(\d+),(\d+),(\d+),(\d+)\),\s*([\d.]+)'
                    pm10_values = re.findall(pattern, pm10_data_block.group(0))
                else:
                    print("No match found")

            #print(pm10_values)
            pm_val = float
        
            #assign the values to the name of the pm type for easier adding to the dataset
            records = {'pm_2.5':pm2_5_values,'pm_10':pm10_values}
            # print(records)
            mydata = []
            if records:
                #get the key value pairs
                for key, vals in records.items():
                    #define the pm_val based on the key
                    if key == 'pm_2.5':
                        pm_val = 2.5
                    elif key == 'pm_10':
                        pm_val = 10
                    else:
                        pm_val = 0  #0 can be used for unknown so it doesn't break the parser

                    #parse out the values into the timestamps and pm vals
                    for val in vals:
                     #   print(val)
                        year, month, day, hour, minute, second, value = map(float, val)
                        #add 1 to the month as it looks like January = month 0 (days not affected)
                        timestamp = datetime(int(year), int(month)+1, int(day), int(hour), int(minute), int(second))
                    
                        #put it together in a data frame
                        mydata.append({"Timestamp": timestamp, "Value": float(value), "PM":pm_val, "Location": loc})

        newdata = pd.DataFrame(mydata)      
        dataset = pd.concat([newdata,dataset])
#save to a csv file so it's available to read in (but don't open and save as it will have millions of rows and they will be lost by Excel!       
dataset.to_csv('air_quality_data_Jan_Dec_2022.csv', index=False) 

https://airquality.ie/readings?station=EPA-10&dateFrom=01+Jan+2022&dateTo=31+Dec+2022
Air Quality Levels at Heatherton Park, Cork'
https://airquality.ie/readings?station=EPA-11&dateFrom=01+Jan+2022&dateTo=31+Dec+2022


  dataset = pd.concat([newdata,dataset])


Air Quality Levels at Winetavern Street, Dublin 8'
https://airquality.ie/readings?station=EPA-12&dateFrom=01+Jan+2022&dateTo=31+Dec+2022
[<p class="m-0">
                    No data found for this station and date range.
                                    </p>]
https://airquality.ie/readings?station=EPA-13&dateFrom=01+Jan+2022&dateTo=31+Dec+2022
Air Quality Levels at Bray, Co. Wicklow'
https://airquality.ie/readings?station=EPA-14&dateFrom=01+Jan+2022&dateTo=31+Dec+2022
[<p class="m-0">
                    No data found for this station and date range.
                                    </p>]
https://airquality.ie/readings?station=EPA-15&dateFrom=01+Jan+2022&dateTo=31+Dec+2022
[<p class="m-0">
                    No data found for this station and date range.
                                    </p>]
https://airquality.ie/readings?station=EPA-16&dateFrom=01+Jan+2022&dateTo=31+Dec+2022
Air Quality Levels at Portlaoise, Co. Laois'
https://airquality.ie/readings?station=EPA-17&dateFrom=

In [42]:
#put it together with more variables and date ranges so it can pull all locations, all months for all specified years.
#starting date point
dateFrom = '01+Jan+'
#dateFrom = '01+Jul+'
#end date point
#dateTo = '30+Jun+'
dateTo = '31+Dec+'
#years to capture
year = {2021}

dataset = pd.DataFrame(columns=["Timestamp", "Value", "PM", "Location"])
#all_dataset = pd.DataFrame(columns=["Timestamp", "Value", "PM", "Location"])

for y in year:
    for i in range(10,110):#range should go to 110
        
        #break the url up into the string and variables
        url='https://airquality.ie/readings?station=EPA-'+str(i)+'&dateFrom='+dateFrom+str(y)+'&dateTo='+dateTo+str(y)
        #print it to keep track of which location and date is being processes as the script is running
        print(url)
        #create content to scrape
        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        #print(soup)
        #if there is no data the m-0 class is called with a no data message..
        no_data = soup.select(".m-0")
        if no_data:
            #print to track what locations exist/don't exist
            print(no_data)
        else:
            # find the date and values within the data tag..
            get_data = soup.find_all('script')#, string=re.compile('Date.UTC')).string
            #print("get_data")
            #print(get_data)
         #   print("get_data.string")
        #    print(get_data.string)
            #look for the same message to get the location of the air quality monitor, begins with Air Quality Levels at then has a comma and ends with a single quote.
            location = re.search(r"Air Quality Levels at.*?,*?'", str(get_data))

            #if location exists assign it.. otherwise mark it as unknown
            if location:
                loc = location.group(0)
                print(loc)
            else:
                loc = "Unknown Location"

            #look for PM2.5 data
            pm2_5 = re.search(r'"PM2.5",', str(get_data))
            pm2_5_values = []

            #if it exists pull the full block from pm2.5 to //endforeach to get all timestamp and pm values data
            if pm2_5:
                pattern = r'"PM2.5".*?//endforeach'
                
                #search for the entire block from 'pm2_5' to '//endforeach'
                pm2_5_data_block = re.search(pattern, str(get_data), re.DOTALL)
              #  print(pm2_5_data_block.group(0))
                if pm2_5_data_block:
                    #print(pm2_5_data_block)
                    #asked chatgpt to 'give me the regex to parse this date and value, [ Date.UTC(2023,0,9,4,0,0), 2.86]'
                    #match the timestamp and value portion only
                    pattern = r'Date\.UTC\((\d+),(\d+),(\d+),(\d+),(\d+),(\d+)\),\s*([\d.]+)'
      
                   #get all matches in the pm2.5 block only
                    pm2_5_values = re.findall(pattern, pm2_5_data_block.group(0))
                else:
                    print("No match found")

            #print(pm2_5_values)
            
            #do the same again for pm10
            pm10 = re.search(r'"PM10",', str(get_data))
            pm10_values = []
            if pm10:
                pattern = r'"PM10".*?//endforeach'
                
                #search for the entire block from 'pm10' to '//endforeach'
                pm10_data_block = re.search(pattern,str(get_data), re.DOTALL)

                #get all matches in the pm10 block only
                if pm10_data_block:
                    pattern = r'Date\.UTC\((\d+),(\d+),(\d+),(\d+),(\d+),(\d+)\),\s*([\d.]+)'
                    pm10_values = re.findall(pattern, pm10_data_block.group(0))
                else:
                    print("No match found")

            #print(pm10_values)
            pm_val = float
        
            #assign the values to the name of the pm type for easier adding to the dataset
            records = {'pm_2.5':pm2_5_values,'pm_10':pm10_values}
            # print(records)
            mydata = []
            if records:
                #get the key value pairs
                for key, vals in records.items():
                    #define the pm_val based on the key
                    if key == 'pm_2.5':
                        pm_val = 2.5
                    elif key == 'pm_10':
                        pm_val = 10
                    else:
                        pm_val = 0  #0 can be used for unknown so it doesn't break the parser

                    #parse out the values into the timestamps and pm vals
                    for val in vals:
                     #   print(val)
                        year, month, day, hour, minute, second, value = map(float, val)
                        #add 1 to the month as it looks like January = month 0 (days not affected)
                        timestamp = datetime(int(year), int(month)+1, int(day), int(hour), int(minute), int(second))
                    
                        #put it together in a data frame
                        mydata.append({"Timestamp": timestamp, "Value": float(value), "PM":pm_val, "Location": loc})

        newdata = pd.DataFrame(mydata)      
        dataset = pd.concat([newdata,dataset])
#save to a csv file so it's available to read in (but don't open and save as it will have millions of rows and they will be lost by Excel!  
dataset.to_csv('air_quality_data_Jan_Dec_2021.csv', index=False) 

https://airquality.ie/readings?station=EPA-10&dateFrom=01+Jan+2021&dateTo=31+Dec+2021
Air Quality Levels at Heatherton Park, Cork'
https://airquality.ie/readings?station=EPA-11&dateFrom=01+Jan+2021&dateTo=31+Dec+2021


  dataset = pd.concat([newdata,dataset])


Air Quality Levels at Winetavern Street, Dublin 8'
https://airquality.ie/readings?station=EPA-12&dateFrom=01+Jan+2021&dateTo=31+Dec+2021
[<p class="m-0">
                    No data found for this station and date range.
                                    </p>]
https://airquality.ie/readings?station=EPA-13&dateFrom=01+Jan+2021&dateTo=31+Dec+2021
Air Quality Levels at Bray, Co. Wicklow'
https://airquality.ie/readings?station=EPA-14&dateFrom=01+Jan+2021&dateTo=31+Dec+2021
[<p class="m-0">
                    No data found for this station and date range.
                                    </p>]
https://airquality.ie/readings?station=EPA-15&dateFrom=01+Jan+2021&dateTo=31+Dec+2021
[<p class="m-0">
                    No data found for this station and date range.
                                    </p>]
https://airquality.ie/readings?station=EPA-16&dateFrom=01+Jan+2021&dateTo=31+Dec+2021
Air Quality Levels at Portlaoise, Co. Laois'
https://airquality.ie/readings?station=EPA-17&dateFrom=