In [1]:
from datetime import timedelta, datetime
from pandas.tseries.holiday import USFederalHolidayCalendar

class LastDate:
    """Compute the most recent US federal workday to use as a search date.

    A "workday" here is a Monday-Friday date that is not listed in pandas'
    ``USFederalHolidayCalendar``.
    """

    def __init__(self):
        now = datetime.today()
        self.current_year = now.year
        # Midnight of today, so comparisons against the (midnight) holiday
        # datetimes returned by the calendar work as expected.
        self.current_day = now.replace(hour=0, minute=0, second=0, microsecond=0)
        self.cal = USFederalHolidayCalendar()

    def _most_recent_workday(self, last_day):
        """Roll ``last_day`` back to the nearest non-holiday weekday.

        Parameters
        ----------
        last_day : datetime
            Candidate date (expected at midnight) to start from.

        Returns
        -------
        datetime
            ``last_day`` itself if it is a workday, otherwise the closest
            earlier workday. The result is also stored on ``self.search_date``.
        """
        # Include the previous year so that rolling back across 1 January
        # still sees that year's holidays.
        self.holidays = self.cal.holidays(
            datetime(self.input_year - 1, 1, 1),
            datetime(self.input_year, 12, 31),
        ).to_pydatetime()
        holiday_set = set(self.holidays)

        candidate = last_day
        # Step back one day at a time until the date is neither a weekend nor
        # a holiday. This covers a weekend that lands on a holiday Friday and
        # back-to-back holidays, which a single adjustment would miss.
        while candidate.weekday() >= 5 or candidate in holiday_set:
            candidate = candidate - timedelta(days=1)

        self.search_date = candidate
        return candidate

    def date_to_search(self, year, subtract_days = 1):
        """Return the most recent workday for ``year``, with a manual offset.

        Parameters
        ----------
        year : int or str
            Four-digit year to search.
        subtract_days : int, default 1
            For the current year, the number of days before today to start
            from. For any other year, ``subtract_days - 1`` days before
            31 December of that year.

        Returns
        -------
        datetime
            The starting date rolled back to the nearest workday.

        Raises
        ------
        ValueError
            If ``year`` is not a four-digit year.
        """
        year_text = str(year)
        if len(year_text) != 4 or not year_text.isdigit():
            raise ValueError('year format must conform to XXXX')
        self.input_year = int(year_text)

        if self.input_year == self.current_year:
            start = self.current_day - timedelta(days=subtract_days)
        else:
            start = datetime(self.input_year, 12, 31) - timedelta(days=(subtract_days - 1))
        self.search_date = start
        return self._most_recent_workday(start)

In [2]:
# Smoke check: most recent workday for 2020 using the default subtract_days=1.
# For the current year the result depends on the day the cell is run.
test_date = LastDate()
test_date.date_to_search(2020)

datetime.datetime(2020, 11, 10, 0, 0)

In [2]:
import requests
from lxml import html
from fake_useragent import UserAgent

class RequestsPage:
    """Small helper around ``requests.get`` plus a method to append text to a file."""

    def get_html(self, url, payload=None, fakeuser=None):
        """Issue a GET request and return the response object.

        Parameters
        ----------
        url : str
            Address to request; also stored on ``self.url``.
        payload : dict, optional
            Query-string parameters passed to ``requests.get`` as ``params``.
        fakeuser : any, optional
            When truthy, send a Chrome user-agent string generated by
            ``fake_useragent.UserAgent``; otherwise send no custom headers.

        Returns
        -------
        The response returned by ``requests.get``.
        """
        self.url = url
        # headers=None makes requests fall back to its default headers, so a
        # single call covers both the spoofed and the plain case.
        headers = {'user-agent': UserAgent().chrome} if fakeuser else None
        return requests.get(url, headers=headers, params=payload)

    def save_html(self, data, filename):
        """Append ``data`` to ``filename``, creating the file if needed.

        The handle is closed even if the write fails. It is kept on
        ``self.file`` (closed) after the call.
        """
        with open(filename, "a") as handle:
            self.file = handle
            handle.write(data)

In [3]:
# Example daily-filings query URL: https://www.oscn.net/applications/oscn/report.asp?report=DailyFilings&errorcheck=true&database=&db=Canadian&StartDate=09%2F04%2F20

class OklahomaURLs:
    """Base URLs on www.oscn.net that the page classes in this notebook request."""

    # Case-information endpoint.
    case_details = 'https://www.oscn.net/dockets/GetCaseInformation.aspx'
    # Report endpoint.
    search_page = 'https://www.oscn.net/applications/oscn/report.asp'

In [4]:
class SearchPageMixIn():
    """Shared logic for finding the last case number listed on a filings page.

    The host class is expected to supply ``get_html`` and ``url``, and to set
    ``self.payload`` before a request is made (this mix-in never assigns
    ``payload`` itself). Selenium-based hosts override
    ``_county_specific_selenium_search``.
    """

    # Shared date helper. It is created once, when the class body runs, so
    # its notion of "today" is fixed at that moment.
    lastdate = LastDate()

    # Upper bound on how many earlier days the retry loop will try before
    # giving up, so a bad county or prefix cannot loop (and request) forever.
    max_lookback_days = 366

    def _county_specific_selenium_search(self):
        """Hook for selenium-driven counties; subclasses must override it.

        Raises
        ------
        NotImplementedError
            Always, in this base implementation.
        """
        raise NotImplementedError('County is not supported. Are you a developer? Did you forget to provide county-specific steps for searching?')

    def _get_last_workday(self, subtract_days = 1):
        """Store the search date as an ``mm/dd/yy`` string on ``self.url_date``."""
        self.url_date = self.lastdate.date_to_search(self.year, subtract_days = subtract_days).strftime("%m/%d/%y")

    def _subtract_another_day(self, subtract_days = 1):
        """Increase the look-back offset held on ``self.subtract_days``."""
        # Default to 1 (the offset of the initial search) if no search has
        # initialised the counter yet.
        self.subtract_days = getattr(self, 'subtract_days', 1) + subtract_days

    def _parse_case_table(self):
        """Extract the last matching case number from the current results.

        With a selenium driver, rows are located through ``self.locator``.
        Otherwise ``self.output.text`` is parsed as HTML and every element
        whose text contains ``"<case_prefix>-"`` is collected.

        Returns
        -------
        The ``text`` of the last matching row, also stored on
        ``self.case_number``.

        Raises
        ------
        IndexError
            If no rows matched.
        TypeError
            If a driver is set but no usable locator was provided.
        """
        if self.driver is not None:
            try:
                self.case_rows = self.driver.find_elements(*self.locator)
            except TypeError:
                raise TypeError('No locator specified for table rows.')
        else:
            # Imported here so the method does not depend on a later notebook
            # cell having already run its imports.
            from io import StringIO
            from lxml import etree

            self.parser = etree.HTMLParser()
            self.tree = etree.parse(StringIO(self.output.text), self.parser)
            self.case_rows = self.tree.xpath(f'//*[contains(text(), "{self.case_prefix}-")]')
        self.case_number = self.case_rows[-1].text
        return self.case_number

    def _search_previous_day_until_success(self):
        """Step back one day at a time until a case number is found.

        Returns
        -------
        The first non-``None`` result of ``_parse_case_table``.

        Raises
        ------
        LookupError
            If nothing is found within ``max_lookback_days`` attempts.
        """
        for _ in range(self.max_lookback_days):
            try:
                print('subtracting another day')
                self._subtract_another_day()
                print('getting next workday')
                self._get_last_workday(subtract_days = self.subtract_days)
                print('calling new html search')
                if self.driver is not None:
                    self._county_specific_selenium_search()
                else:
                    self.output = self.get_html(self.url, payload = self.payload)
                print(f'attempting to parse case table for {self.url_date}')
                result = self._parse_case_table()
            except IndexError:
                # No matching rows for this date; try the previous day.
                continue
            if result is not None:
                return result
        raise LookupError(
            f'No {self.case_prefix} cases found within {self.max_lookback_days} earlier days.'
        )

    def most_recent_case(self, county, year, case_prefix, session=None, driver=None, locator=None):
        """Return the last case number on the most recent filings page.

        Parameters
        ----------
        county : str
            County identifier, stored on ``self.county``.
        year : int
            Four-digit year passed on to ``LastDate.date_to_search``.
        case_prefix : str
            Case-type prefix; rows are matched on ``"<case_prefix>-"``.
        session : optional
            Stored on ``self.session``; not used by this mix-in.
        driver : optional
            Selenium driver. When given, the selenium code path is used.
        locator : optional
            Locator arguments unpacked into ``driver.find_elements``.

        Returns
        -------
        The case number text, searching earlier days if the first date has
        no matching rows.
        """
        self.county = county
        self.year = year
        self.case_prefix = case_prefix
        self.driver = driver
        self.locator = locator
        self.session = session
        # Reset the look-back offset so every search starts from the same
        # point and the retry loop has a counter to increment.
        self.subtract_days = 1
        self._get_last_workday()
        if self.driver is not None:
            self._county_specific_selenium_search()
        else:
            self.output = self.get_html(self.url, payload = self.payload)
        try:
            return self._parse_case_table()
        except IndexError:
            return self._search_previous_day_until_success()

In [6]:
#from datetime import datetime, timedelta, date
from lxml import etree
from io import StringIO, BytesIO

class SearchPage(RequestsPage, SearchPageMixIn):
    """Requests-based search page built on ``OklahomaURLs.search_page``."""

    url = OklahomaURLs.search_page

    def _get_last_workday(self, subtract_days = 1):
        """Compute the search date and rebuild the query payload for it.

        ``subtract_days`` is forwarded to the mix-in so that each retry moves
        to an earlier date instead of repeating the same one.
        """
        super()._get_last_workday(subtract_days = subtract_days)
        self.payload = {
            'report' : 'DailyFilings',
            'errorcheck' : 'true',
            'database' : '',
            'db' : self.county,
            'StartDate' : self.url_date
                       }
    
   

In [18]:
# Fetch the last 'SC' case number for county 'tulsa' in 2013.
# This performs a live HTTP request, so the result depends on the remote site.
searchpage = SearchPage()
searchpage.most_recent_case('tulsa', 2013, 'SC')

'SC-2013-21179'

In [84]:
# Year stored by the last most_recent_case call.
# NOTE(review): the recorded output (2019) does not match the 2013 call shown
# above; this cell was presumably run out of order. Re-run after a restart.
searchpage.year

2019

In [98]:
# Search date (mm/dd/yy) used by the most recent request.
searchpage.url_date

'09/14/20'

In [9]:
class CaseDetails(RequestsPage):
    """Requests the case-details page for one case and keeps the response."""

    url = OklahomaURLs.case_details

    def _build_payload(self):
        """Assemble the query parameters from the stored county and case number."""
        query = {}
        query['db'] = self.county
        query['number'] = self.case_number
        self.payload = query

    def page_source(self, county, case_number):
        """Request the page for ``case_number`` in ``county``.

        The response is stored on ``self.output``; nothing is returned.
        """
        self.county, self.case_number = county, case_number
        self._build_payload()
        response = self.get_html(self.url, payload = self.payload)
        self.output = response
        

In [10]:
# Fetch one case-details page over the network and show its raw HTML text.
# NOTE(review): this displays the entire document; consider showing a slice.
casedetails = CaseDetails()   
casedetails.page_source('tulsa', 'sc-2019-3899')
casedetails.output.text

'<!DOCTYPE html>\n<!--[if lt IE 7]>      <html lang="en-US" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->\n<!--[if IE 7]>         <html lang="en-US" class="no-js lt-ie9 lt-ie8"> <![endif]-->\n<!--[if IE 8]>         <html lang="en-US" class="no-js lt-ie9"> <![endif]-->\n<!--[if gt IE 8]><!--> <html lang="en-US" class="no-js"> <!--<![endif]-->\n    <head>\n        <meta charset="utf-8">\n        <meta http-equiv="X-UA-Compatible" content="IE=edge">\n        <title>\r\n    OSCN Case Details\r\n  </title>\n        <meta name="description" content="">\n        <meta name="viewport" content="width=device-width, initial-scale=1">\n\n\n        <link rel="stylesheet" href="/assets/css/normalize.min.css">\n        <link rel="stylesheet" href="/assets/css/main.css">\n        <link rel="stylesheet" href="/assets/css/bootstrap-2.3.2.min.css">\n        <link rel="stylesheet" href="/assets/css/oscn-navigation.css">\n        <link rel="stylesheet" href="/assets/css/oscn-footer.css">\n\n<!-- /STYLES