In [683]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import re
import numpy as np
import logging

class WebScraper:
    
    def __init__(self):
        self.session = requests.Session()
        self.rest = 10
        
class OldTMCScraper(WebScraper):
    
    def __init__(self, raw_directory, filters = None):
        super().__init__() 
        self.raw = raw_directory
        self.raw_df = self.load_csv()
        self.filter_options = {
            'AM': True,
            'PM': True,
            'MD': False
        }
        
        if filters:
                for filter_key in filters:
                    if filter_key in self.filter_options:
                        self.filter_options[filter_key] = True
                   
        self.collect_am = self.filter_options.get('AM')
        self.collect_pm = self.filter_options.get('PM')
        self.collect_md = self.filter_options.get('MD')
        
        self.options = Options()
        self.options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=self.options)
        
    def load_csv(self):
        return pd.read_csv(self.raw,sep=';')
    
    def scrape_sub_urls(self):
        urls = self.raw_df["URL"].iloc[0:10]
        expanded_urls= urls.apply(lambda row: self._explode_url(row))
        expanded_urls.rename('Sub_URL',inplace=True)
        return pd.concat([self.raw_df,expanded_urls],axis =1).explode\
                        ('Sub_URL',ignore_index=True)
    
    def _explode_url(self, row):
        
        if row:
            sub_urls = []
            self.driver.get('about:blank') 
            self.driver.get(row)

            try:
                coordinate_element = WebDriverWait(self.driver, self.rest).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'sorting_1'))
                )
                coordinate = coordinate_element.text.strip()

                links_selector = f'td a[target^="0{coordinate}"]'

                links_elements = WebDriverWait(self.driver, self.rest).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, links_selector))
                )

                for link in links_elements:
                    href = link.get_attribute('href')
                    sub_urls.append(href)

            except Exception as e:
                print("An error occurred: ", e)
            
            return sub_urls
    
    def scrape_tmc_counts(self,sub_urls_df):
        tmc_urls = sub_urls_df['Sub_URL'].iloc[1:10]
        tmc_data = tmc_urls.apply(lambda row: self._scrape_tmc(row))
        return pd.json_normalize(tmc_data)

    def _scrape_tmc(self, row):
        if pd.isna(row):
            return None

        try:
            response = self.session.get(row)
            soup = BeautifulSoup(response.text, 'html.parser')
            tmc = {}
            available_am = False
            available_md = False
            available_pm = False
            control_dict = {
                'AM':{'check':None,'table_key':None},
                'MD':{'check':None,'table_key':None},
                'PM':{'check':None,'table_key':None}
                }
            for i in range(2,5):
                period = self._get_peak_hour(soup,key=i)
                if period:
                    peak_start_hr = int(period[:2])

                    if 6 <= peak_start_hr <= 9:
                        control_dict['AM']['table_key'] = i
                        available_am = True
                    elif 10 <= peak_start_hr <= 13:
                        control_dict['MD']['table_key'] = i
                        available_md = True
                    elif 14 <= peak_start_hr <= 19:
                        control_dict['PM']['table_key'] = i
                        available_pm = True
                    else:
                        continue
            
            control_dict['AM']['check'] = self.collect_am and available_am
            control_dict['MD']['check'] = self.collect_md and available_md
            control_dict['PM']['check'] = self.collect_pm and available_pm
            
            tmc.update({
                'date': self._get_date(soup),
                'weather': self._get_weather(soup),
                'type': self._get_type(soup),
                'AM_available': available_am,
                'AM_scraped': self.collect_am,
                'MD_available': available_md,
                'MD_scraped': self.collect_md,                
                'PM_available': available_pm,
                'PM_scraped': self.collect_pm
            })
                
            for key,value in control_dict.items():
                if value.get('check'):
                    tag = key
                    position = value.get('table_key')
                    tmc.update({
                        f'{tag} peak_hour': self._get_peak_hour(soup,position),
                        f'{tag} north_bikes_vol': self._get_north_bikes_vol(soup,position),
                        f'{tag}north_peds_vol': self._get_north_peds_vol(soup,position),
                        f'{tag} north_veh_vol': self._get_north_veh_vol(soup,position)
                        # f'{tag} east_bikes_vol' :
                        # f'{tag} east_peds_vol' :
                        # f'{tag} east_veh_vol' : 
                        # f'{tag} south_bikes_vol' : 
                        # f'{tag} south_peds_vol' : 
                        # f'{tag} south_veh_vol' : 
                        # f'{tag} west_bikes_vol' : 
                        # f'{tag} west_peds_vol' : 
                        # f'{tag} west_veh_vol' : 
                    })
            
            return tmc
            
        except Exception as e:
            logging.error(f'{row} had an error: {e}')
            return None

    def _get_date(self, soup):
        element = soup.select_one("th[valign='TOP']:nth-of-type(3) p")
        return pd.to_datetime(element.text if element else None)

    def _get_weather(self, soup):
        elements = soup.select("th:nth-of-type(13) p")
        return elements[0].text.split()[-1].lower() if elements else None

    def _get_type(self, soup):
        element = soup.select_one("th:nth-of-type(8) p")
        return element.text if element else None

    def _get_peak_hour(self, soup, key):
        element = soup.select_one(f"table:nth-of-type({key}) [valign='MIDDLE'] p.p8")
        return element.text.replace("Maximum Hour ", "") if element else None

    def _get_north_bikes_vol(self, soup, key):
        elements = soup.select(f"table:nth-of-type({key}) [valign='BOTTOM'] tr:-soup-contains('Bikes') tr:nth-of-type(1)")
        return re.findall(r'\d+', elements[0].text)[0] if elements else None

    def _get_north_peds_vol(self, soup, key):
        elements = soup.select(f"table:nth-of-type({key}) table:nth-of-type(1) th:-soup-contains('PEDs')")
        return re.findall(r'\d+', elements[0].text)[-1] if elements else None

    def _get_north_veh_vol(self, soup, key):
        elements = soup.select(f"table:nth-of-type({key}) table:nth-of-type(1) th.s")
        nums = re.findall(r'\d+', "".join([str(e) for e in elements]))
        return nums[2] if len(nums) > 2 else None

class RecentTMCScraper(WebScraper):
    pass

In [684]:
ali = OldTMCScraper(raw_directory='intersection-traffic-movement-counts.csv',filters=['AM','PM'])
ss = ali.scrape_sub_urls()
m = ali.scrape_tmc_counts(ss)

In [685]:
m

Unnamed: 0,date,weather,type,AM_available,AM_scraped,MD_available,MD_scraped,PM_available,PM_scraped,AM peak_hour,AM north_bikes_vol,AMnorth_peds_vol,AM north_veh_vol,PM peak_hour,PM north_bikes_vol,PMnorth_peds_vol,PM north_veh_vol
0,2005-09-27,clear,Signalized,True,True,False,False,True,True,07:55 - 08:55,86,21,367,16:25 - 17:25,7,23,243
1,2003-01-16,cloudy,Signalized,True,True,False,False,True,True,07:55 - 08:55,35,5,318,16:05 - 17:05,2,29,233
2,2001-10-23,cloudy,Signalized,True,True,False,False,True,True,07:55 - 08:55,74,17,364,16:25 - 17:25,9,48,283
3,2000-11-30,rain,Signalized,True,True,False,False,True,True,07:55 - 08:55,55,12,380,16:25 - 17:25,3,18,251
4,1998-10-08,cloudy,Signalized,True,True,False,False,True,True,07:55 - 08:55,66,28,371,16:25 - 17:25,13,34,304
5,1996-09-24,clear,Signalized,True,True,False,False,True,True,07:55 - 08:55,91,24,370,16:05 - 17:05,17,35,263
6,2007-11-22,rain,Signalized,True,True,False,False,True,True,07:55 - 08:55,2,2,104,16:20 - 17:20,2,0,261
7,2008-01-28,clear,Signalized,True,True,False,False,True,True,07:55 - 08:55,0,3,13,15:50 - 16:50,0,6,17
8,2006-01-19,cloudy,Non-Signalized,True,True,False,False,True,True,07:55 - 08:55,0,8,3,16:05 - 17:05,0,20,8
