Update scraper.py: scrape BCPC decisions from CanLII instead of provincialcourt.bc.ca

chadskelton · Jun 26, 2019 · 9951627 · 9951627
1 parent b1901e3
commit 9951627
Showing 1 changed file with 16 additions and 8 deletions.
diff --git a/scraper.py b/scraper.py
@@ -145,7 +145,8 @@ def scrape_bcca(url):
 
 def scrape_bcpc(url):
 
-        html = requests.get(url, verify=False, headers={'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'})
+        # html = requests.get(url, verify=False, headers={'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'})
+        html = requests.get(url)
         # verify=False because getting 502 errors due to invalid certificate
         htmlpage = html.content
 
@@ -161,19 +162,25 @@ def scrape_bcpc(url):
         br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36')]
         html = br.open(url)
         htmlpage = html.content
-        '''
-
-        soup = BeautifulSoup(htmlpage)
-
-        print soup
-
+
         table = soup.find ("div", {"class" : "view-content"})
         
         print table
     
         decisions = table.findAll ("a")
         
         # decisions = soup.findAll ("div", {"class":"views-field views-field-text"})
+        '''
+
+        # New parsing logic for the CanLII site: decision links live inside the div with id "recentJudg"
+
+        soup = BeautifulSoup(htmlpage)
+
+        print soup
+
+        section = soup.find ("div", {"id" : "recentJudg"})
+
+        decisions = section.findAll ("a")        
 
         print decisions
 
@@ -196,6 +203,7 @@ def scrape_bcpc(url):
     print 'Difficulty scraping BCCA'
 
 #try:
-scrape_bcpc("http://www.provincialcourt.bc.ca/judgments-decisions")
+#scrape_bcpc("http://www.provincialcourt.bc.ca/judgments-decisions")
+scrape_bcpc("https://www.canlii.org/en/bc/bcpc/")
 #except:
 #    print 'Difficulty scraping BCPC'