In [16]:
from urllib.request import urlopen
import re
from bs4 import BeautifulSoup
import signal
import pandas as pd

# Creating timeout error to limit the runtime of a function if needed
class TimeoutException(Exception): # Creating custom error
    pass

def timeout_handler(): # Creating function to handle error
    print('Reached maximum allotted time')
    raise TimeoutException
    
signal.signal(signal.SIGALRM, timeout_handler);

In [17]:
class WebTools():
    def get_title(self,url):
        '''
        This function reads the html and finds the title, returning it as a string
        '''
        if (url == None) or ('http' not in url):
            raise ValueError(f'Not a useable url: "{url}"')
        try:
            response = urlopen(url)
            soup = BeautifulSoup(response, 'html.parser')
            title = soup.title.get_text()
            del url
            return title
        except:
            raise ValueError(f'Not a useable url: "{url}"')
        
    def find_all_links(self,page):
        '''
        Finds all links on page, this is a WIP as it doesn't find all links
        '''
        soup = BeautifulSoup(page,features="lxml")
        all_links = []
        for line in soup.find_all('a'):
            line = line.get('href')
            try:
                if ('http' not in line):
                    continue
            except:
                continue
            all_links.append(line)
        del soup,line,page
        return list(set(all_links))
    
    def find_relevant_links_and_titles(self,all_links,keywords,search_title = False):
        name_and_link = []
        keywords_upper = [string.capitalize() for string in keywords]
        for link in all_links:
            signal.alarm(5)
            if search_title:
                try:
                    title = self.get_title(link).lower()
                    if any(substring in title for substring in keywords):
                        name_and_link.append((title,link)) 
                    else:
                        pass
                except:
                    continue
                else: 
                    signal.alarm(0)
            else:
                try:
                    link = link.lower()
                    if any(substring in link for substring in keywords):
                        title = self.get_title(link)
                        name_and_link.append((title,link))
                    else:              
                        pass
                except:
                    continue
                else:
                    signal.alarm(0)
        del keywords,keywords_upper,title,link,all_links
        return list(set(name_and_link))
        
WT = WebTools()

In [3]:
def main():
    page = urlopen("https://chcrpa.org/data-and-analyses/data/").read()

    all_links = WT.find_all_links(page)

    keywords = ['transport','car','train','railroad','vehicle','fuel','travel','port','road','highway','bridge','bus','electric vehicle','airport','transit','sidewalk','bike','sidewalk','traffic']
    name_and_link = WT.find_relevant_links_and_titles(all_links,keywords,search_title=False)

    print(set(name_and_link))

if __name__ == '__main__':
    main()

{('Functional Classification Maps', 'https://www.tn.gov/tdot/driver-how-do-i/look-at-or-order-state-maps/maps/annual-average-daily-traffic-maps1.html'), ('ArcGIS Web Application', 'https://pwgis.chattanooga.gov/portal/apps/webappviewer/index.html?id=2b066c4f02a14f288f0134a02eeca91e'), ('Transportation Planning Organization – CHCRPA', 'https://chcrpa.org/transportation-planning-organization/'), ('Employment Opportunities – CHCRPA', 'https://chcrpa.org/employment-opportunities/'), ('Home | CARTA | Chattanooga’s Ride', 'http://www.carta-bus.org/')}


In [20]:
test_dict = {
    'Title':["Data – CHCRPA"],
    'url':["https://chcrpa.org/data-and-analyses/data/"],
    'time':[1685977141],
}

test_data = pd.DataFrame.from_dict(test_dict)

test_data.to_csv('test_data.csv',header=['title','url','time'])

In [21]:
test_data

Unnamed: 0,Title,url,time
0,Data – CHCRPA,https://chcrpa.org/data-and-analyses/data/,1685977141


In [23]:
read_csv = pd.read_csv('test_data.csv',header=0)
read_csv

Unnamed: 0.1,Unnamed: 0,Title,url,time
0,0,Data – CHCRPA,https://chcrpa.org/data-and-analyses/data/,1685977141
