In [1]:
import pandas as pd
import time
import signal
from urllib.request import urlopen
from bs4 import BeautifulSoup
import sys

class TimeoutException(Exception):
    """Custom error that `timeout_handler` raises to abort an operation that ran too long."""

def timeout_handler(signum=None, frame=None):
    """
    Signal handler that aborts whatever code was running when the signal arrived.

    Python calls signal handlers with two positional arguments, so the handler
    must accept them; both default to None so a direct call with no arguments
    keeps working.

    signum: int, number of the signal being handled (unused)

    frame: frame object that was executing when the signal arrived (unused)

    raises: TimeoutException, always
    """
    raise TimeoutException
    
# Register timeout_handler for SIGALRM so that an alarm set with signal.alarm()
# interrupts the running code by raising from the handler.
# The trailing semicolon keeps the notebook from displaying the previous
# handler that signal.signal() returns.
signal.signal(signal.SIGALRM, timeout_handler);



In [2]:
def add_to_dataframe(df,name_link_type):
    '''
    Adds websites to a copy of the website dataframe and returns that copy with the new
    entries appended to the end. The dataframe passed in is not modified.

    An entry is skipped when its link is already present, either in the dataframe or
    earlier in name_link_type, so every link ends up in the result at most once.

    df: This is the dataframe you want to add the entries to. The links are read from its
    second column, and each appended row has five values: title, link, type, the current
    unix time rounded to whole seconds, and the placeholder "empty".

    name_link_type: This is an array that contains tuples with title of a website in the first position
    the link in the second position, and the type of each dataset.
    Should Look like this -> [('Title1','www.link1.com','file extension 1'),('Title2','www.link2.com','file extension 2')]

    returns: New dataframe with the missing entries added
    '''
    new_df = df.copy()

    # A set gives constant-time membership checks; it is updated as rows are added so a
    # link repeated inside name_link_type is only inserted the first time it is seen.
    known_links = set(new_df.iloc[:,1])

    for tup in name_link_type:
        title, link, file_type = tup[0], tup[1], tup[2]
        if link in known_links:
            continue
        new_df.loc[len(new_df.index)] = [title, link, file_type, round(time.time()), "empty"]
        known_links.add(link)

    return new_df

def remove_entry(df,idx,save_immediately=False,save_path='websites.csv'):
    '''
    This method deletes an entry by index and returns the result as a new dataframe whose
    index is renumbered from 0. The dataframe passed in is not modified.

    df: Dataframe you are working with

    idx: integer, index of the row/entry you want deleted

    save_immediately: bool, if True the new dataframe is written to save_path (without the
        index column) before returning. If False nothing is written here, so the change is
        lost unless the caller saves the returned dataframe itself.

    save_path: string, csv file written when save_immediately is True. The default
        'websites.csv' is relative to the current working directory.

    returns: New dataframe with entry removed

    raises: KeyError if idx is not in the dataframe's index
    '''
    # drop() already returns a new object, so no explicit copy of df is needed.
    new_df = df.drop(idx).reset_index(drop=True)

    if save_immediately:
        new_df.to_csv(save_path,index=False)

    return new_df

def get_title(url):
    '''
    This function downloads the page at url, reads the html and finds the title,
    returning it as a string

    url: string, must contain 'http'

    returns: string, text of the page's <title> element

    raises: ValueError if url is not a string containing 'http', or if the page cannot
        be downloaded or has no title. The underlying error is attached as the cause.
    '''
    if not isinstance(url, str) or 'http' not in url:
        raise ValueError(f'Not a useable url: "{url}"')
    try:
        # The context manager closes the connection even when parsing fails.
        with urlopen(url) as response:
            soup = BeautifulSoup(response, 'html.parser')
        # soup.title is None for a page without <title>; the AttributeError that
        # follows is reported as an unusable url like every other failure.
        return soup.title.get_text()
    except Exception as exc:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit still propagate.
        raise ValueError(f'Not a useable url: "{url}"') from exc
    
def find_all_links(page):
    '''
    Finds links on page, this is a WIP as it doesn't find all links: only <a> tags whose
    href contains 'http' are kept, so relative links and anchors without href are skipped

    page: html of the page, passed to BeautifulSoup with the lxml parser

    returns: list of unique links in the order they first appear, empty if none are found
    '''
    soup = BeautifulSoup(page,features="lxml")
    hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
    # get('href') gives None for anchors without the attribute, hence the type check.
    absolute_links = [href for href in hrefs if isinstance(href, str) and 'http' in href]
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(absolute_links))

def find_relevant_links_and_titles(all_links,keywords,search_title = False,timeout = 5):
    '''
    Given a list of links and a list of keywords this method will find links/titles that contain
    keywords. Matching is case-insensitive. Links that fail to download, have no title, or
    take longer than timeout seconds are skipped.

    all_links: list, contains all links you want to parse

    keywords: list, contains all keywords you want to look for

    search_title: bool, leave this set to false in order to save time, but if you want to search
        titles anyway, set it to true. If set to true, the script will download the html
        of the page, strip the title from it, and then look for keywords. This is a massive
        time waste. If false, keywords are looked for in the link itself and the title is
        only downloaded for links that match.

    timeout: int, seconds allowed per link before SIGALRM is raised (0 disables the alarm)

    returns: list of unique (title, link) tuples in the order they were found. The link is
        returned exactly as given; the title is lower-cased when search_title is true.
    '''
    # The searched text is lower-cased, so the keywords have to be as well or any keyword
    # containing an upper-case letter could never match.
    lowered_keywords = [keyword.lower() for keyword in keywords]
    name_and_link = []
    for link in all_links:
        signal.alarm(timeout)
        try:
            if search_title:
                title = get_title(link).lower()
                searched_text = title
            else:
                title = None
                # Only a lower-cased copy is searched; the link itself stays untouched
                # because url paths are case-sensitive.
                searched_text = link.lower()
            if not any(keyword in searched_text for keyword in lowered_keywords):
                continue
            if title is None:
                title = get_title(link)
            name_and_link.append((title,link))
        except Exception:
            # Best effort: skip links that cannot be read or that timed out.
            continue
        finally:
            # Always cancel the alarm so it cannot fire after this link (or after the
            # function has returned) and interrupt unrelated code.
            signal.alarm(0)
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(name_and_link))

def relevant_links(url,keywords):
    '''
    This method takes a link, strips all links on the page, and finds all links that contain
    keywords

    url: string, link to website you want to parse

    keywords: list, contains all keywords to look for

    returns: list of unique (title, link) tuples, one for each link that matches
    '''
    # The context manager closes the connection once the page has been read.
    with urlopen(url) as response:
        page = response.read()

    all_links = find_all_links(page)

    matches = find_relevant_links_and_titles(all_links,keywords,search_title=False)

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(matches))

# Load the saved website table; the path is relative to the current working
# directory and the first line of the file is used as the column names.
df = pd.read_csv('../websites.csv',header=0)

# Bare expression as the last line of the cell so the notebook displays the frame.
df

Unnamed: 0,title,url,type,last_checked,path
0,Regional Transportation Authority Bus Routes (...,https://data.nashville.gov/api/views/kaau-hhd5...,csv,1686859000.0,empty
1,Bike Racks (GIS) | Nashville Open Data Portal,https://data.nashville.gov/api/views/yjju-hypq...,csv,1686859000.0,empty
2,Metropolitan Transit Authority Bus Routes (GIS...,https://data.nashville.gov/api/views/22t2-bdkj...,csv,1686859000.0,empty
3,Regional Transportation Authority Bus Stops | ...,https://data.nashville.gov/api/views/p886-fnbd...,csv,1686859000.0,empty
4,Sidewalk Inventory for ADA Self-Assessment | N...,https://data.nashville.gov/api/views/vpxc-b5te...,csv,1686859000.0,empty
5,WeGo Transit and Middle TN RTA Stops and Route...,https://data.nashville.gov/download/2246-gtr4/...,zip,1686859000.0,empty
6,WeGo Transit Bus Stops | Nashville Open Data P...,https://data.nashville.gov/api/views/vfe9-k7vc...,csv,1686859000.0,empty
7,MATA Transit map,https://www.matatransit.com/assets/2/6/MATA_No...,pdf,1686859000.0,empty
8,Chattanooga Traffic Flow | Chattanooga Open Da...,https://www.chattadata.org/api/views/55g4-zatm...,csv,1686859000.0,empty
9,Chattanooga Traffic Flow | Chattanooga Open Da...,https://www.chattadata.org/api/views/v868-x5mt...,csv,1686859000.0,empty


In [4]:
# Websites to register, as (title, url, file type) tuples.
# NOTE(review): the two "MATA On Time Performance" entries point at the same url -
# confirm whether both titles are really wanted.
name_link_type = [
    ('KAT | Schedules and Maps','https://katbus.com/wp-content/uploads/2023/02/System-map-8-29-22.pdf','pdf'),
    ('Routes | Lakeway Transit','https://www.lakewaytransit.com/_files/ugd/880354_c0d11605b4c649078a0b98916714f6c8.pdf','pdf'),
    ('Knoxville Bike Map','https://assets.simpleviewinc.com/simpleview/image/upload/v1/clients/knoxville/KnoxBicycle_Map2017_65598a5c-c35f-4fdc-96d2-9fe781752710.pdf','pdf'),
    ('MATA On Time Performance | Memphis Data Hub','https://data.memphistn.gov/api/views/w4fr-mktp/rows.csv?accessType=DOWNLOAD','csv'),
    ('Memfacts MATA On Time Performance','https://data.memphistn.gov/api/views/w4fr-mktp/rows.csv?accessType=DOWNLOAD','csv'),
    ('Go UC Cookeville SATURDAY','https://ucpublictransit.com/wp-content/uploads/2022/12/Go-UC-Cookeville-Complete-w-Saturday-12072022.pdf','pdf'),
]

df = add_to_dataframe(df=df,name_link_type=name_link_type)

# Write back to the same cwd-relative path the table was loaded from. Building the
# path from sys.path[0] is unreliable: that entry is not guaranteed to be the working
# directory and can be an empty string, which would turn the target into '/../websites.csv'.
df.to_csv('../websites.csv',index=False)

df

Unnamed: 0,title,url,type,last_checked,path
0,Regional Transportation Authority Bus Routes (...,https://data.nashville.gov/api/views/kaau-hhd5...,csv,1686859000.0,empty
1,Bike Racks (GIS) | Nashville Open Data Portal,https://data.nashville.gov/api/views/yjju-hypq...,csv,1686859000.0,empty
2,Metropolitan Transit Authority Bus Routes (GIS...,https://data.nashville.gov/api/views/22t2-bdkj...,csv,1686859000.0,empty
3,Regional Transportation Authority Bus Stops | ...,https://data.nashville.gov/api/views/p886-fnbd...,csv,1686859000.0,empty
4,Sidewalk Inventory for ADA Self-Assessment | N...,https://data.nashville.gov/api/views/vpxc-b5te...,csv,1686859000.0,empty
5,WeGo Transit and Middle TN RTA Stops and Route...,https://data.nashville.gov/download/2246-gtr4/...,zip,1686859000.0,empty
6,WeGo Transit Bus Stops | Nashville Open Data P...,https://data.nashville.gov/api/views/vfe9-k7vc...,csv,1686859000.0,empty
7,MATA Transit map,https://www.matatransit.com/assets/2/6/MATA_No...,pdf,1686859000.0,empty
8,Chattanooga Traffic Flow | Chattanooga Open Da...,https://www.chattadata.org/api/views/55g4-zatm...,csv,1686859000.0,empty
9,Chattanooga Traffic Flow | Chattanooga Open Da...,https://www.chattadata.org/api/views/v868-x5mt...,csv,1686859000.0,empty
