In [1]:
import os
import re
import requests
import time
from datetime import datetime, date, timedelta as td
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv

In [39]:
class SenateScraper():
    """Scrape Congressional Record floor text from congress.gov into a CSV file.

    For each day in a date range the scraper requests that day's section
    index page, follows the article links found on it, and writes one CSV
    row per article with the columns ``date``, ``url`` and ``text``.
    """

    # Seconds to wait for congress.gov before giving up on a request; without
    # a timeout ``requests.get`` can block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, start_date, end_date, section="senate"):
        """Store the date range and build the URL pieces for one chamber.

        Args:
            start_date: First day to scrape, as a month-day-year string such
                as ``"02-14-2012"`` (``"02/14/2012"`` is accepted too).
            end_date: Day to stop at, same format. The end date itself is
                NOT scraped (see ``daterange``).
            section: Which section of the record to scrape; the URL suffix
                becomes ``"/<section>-section"``. Defaults to ``"senate"``;
                pass ``"house"`` for House speeches.
        """
        self.start_date = start_date
        self.end_date = end_date
        self.url_start = "https://www.congress.gov/congressional-record/"
        self.url_end = "/{}-section".format(section)

    @staticmethod
    def _parse_date(text):
        """Turn a ``mm-dd-yyyy`` or ``mm/dd/yyyy`` string into a ``datetime.date``."""
        parts = re.split(r"[-/]", text.strip())
        if len(parts) != 3:
            raise ValueError(
                "date must look like mm-dd-yyyy or mm/dd/yyyy, got {!r}".format(text)
            )
        month, day, year = (int(part) for part in parts)
        return date(year, month, day)

    def daterange(self, start_date, end_date):
        """Yield each ``datetime.date`` from start_date up to, but excluding, end_date.

        Both arguments are month-day-year strings (see ``__init__``). Nothing
        is yielded when end_date is on or before start_date.

        Raises:
            ValueError: If either string is not a valid month-day-year date.
        """
        # http://stackoverflow.com/questions/1060279/iterating-through-a-range-of-dates-in-python
        first_day = self._parse_date(start_date)
        last_day = self._parse_date(end_date)
        for offset in range((last_day - first_day).days):
            yield first_day + td(offset)

    def get_daily_links(self, url):
        """Return absolute article URLs found on one day's section index page.

        Args:
            url: URL of the index page for a single day.

        Returns:
            A list of absolute URLs; empty when the page has no table cells.
        """
        page = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(page.content, "html.parser")
        cells = soup.find_all('td')
        links = []
        # Only even-numbered cells are used: per the original author's note
        # they hold the text link, while the others return the PDF.
        for cell in cells[::2]:
            anchor = cell.a
            href = anchor.get('href') if anchor is not None else None
            if href:  # skip cells that carry no usable link instead of crashing
                links.append(urljoin("https://www.congress.gov/", href))
        return links

    def get_text(self, url):
        """Download one article page and return ``(actual_date, text)``.

        ``actual_date`` is the ``yyyy/mm/dd`` segment found in ``url``, or
        ``None`` when the URL contains no such segment. ``text`` is the content
        of the page's ``<pre class="styled">`` element with underscores and
        hyphens removed, empty lines and lines starting with ``[`` dropped,
        the first two remaining lines discarded (presumably the page header
        and the article title), and the rest joined with single spaces.

        Raises:
            ValueError: If the page has no ``<pre class="styled">`` element.
        """
        date_match = re.search(r"\d{4}\/\d{2}\/\d{2}", url)
        actual_date = date_match.group(0) if date_match is not None else None

        page = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(page.content, "html.parser")
        body = soup.find('pre', class_='styled')
        if body is None:
            raise ValueError("no <pre class='styled'> element found")

        raw = body.text.replace("_", "").replace("-", "")
        kept = [line for line in raw.splitlines() if line and not line.startswith("[")]
        # title = kept[1].lstrip()
        return actual_date, " ".join(kept[2:]).strip()

    def scrape(self, output_path="test1.csv"):
        """Scrape every day in the configured range into a CSV file.

        Args:
            output_path: Destination CSV file; it is overwritten. Defaults to
                ``"test1.csv"``.

        Days whose index page yields no links are skipped. An article page
        that cannot be parsed is reported on stdout and skipped, so one odd
        page does not abort the whole run. When an article URL carries no
        date, the day being scraped is written in the ``date`` column.

        Raises:
            ValueError: If ``self.start_date`` or ``self.end_date`` is not a
                valid date string (raised before the output file is touched).
        """
        # Resolve the dates before opening the file so that a malformed date
        # does not truncate an existing output file.
        days = [day.strftime("%Y/%m/%d")
                for day in self.daterange(self.start_date, self.end_date)]
        with open(output_path, "w", newline="", encoding="utf8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=["date", "url", "text"])
            writer.writeheader()
            for day in days:
                index_url = self.url_start + day + self.url_end
                for link in self.get_daily_links(index_url):
                    try:
                        article_date, text = self.get_text(link)
                    except ValueError as err:
                        print("Skipping {}: {}".format(link, err))
                        continue
                    writer.writerow({"date": article_date or day, "url": link, "text": text})

In [40]:
# Date range to scrape, as mm-dd-yyyy strings; the end date itself is not scraped.
# Defined in this cell so the notebook survives Restart Kernel -> Run All.
# These are example values -- edit them to the range you need.
start_date = "02-14-2012"
end_date = "02-15-2012"

s = SenateScraper(start_date, end_date)
s.scrape()