In [3]:
import pandas as pd
# Load the speeches CSV into a DataFrame and display it.
# NOTE(review): the rendered frame shows an "Unnamed: 0" column, which suggests
# the file was saved together with its index; index_col=0 would likely absorb
# it -- confirm before changing.
speech = pd.read_csv("speeches.csv")
speech

Unnamed: 0,date,url,title,text
0,2019/01/02,https://www.congress.gov/congressional-record/...,SENATE BILL REFERRED,A bill of the Senate of the following title wa...
1,2019/01/02,https://www.congress.gov/congressional-record/...,ADJOURNMENT,The SPEAKER pro tempore. Pursuant to section 7...
2,2019/01/02,https://www.congress.gov/congressional-record/...,"EXECUTIVE COMMUNICATIONS, ETC.","Under clause 2 of rule XIV, executive communic..."
3,2019/01/02,https://www.congress.gov/congressional-record/...,THE JOURNAL,The SPEAKER pro tempore. Pursuant to section 2...
4,2019/01/02,https://www.congress.gov/congressional-record/...,PLEDGE OF ALLEGIANCE,The SPEAKER pro tempore. The Chair will lead t...
...,...,...,...,...
1043,2019/01/17,https://www.congress.gov/congressional-record/...,By Mr. VISCLOSKY:,H.R. 684. Congress has the power to ena...
1044,2019/01/17,https://www.congress.gov/congressional-record/...,By Mr. WALDEN:,H.R. 685. Congress has the power to ena...
1045,2019/01/17,https://www.congress.gov/congressional-record/...,By Mrs. WATSON COLEMAN:,H.R. 686. Congress has the power to ena...
1046,2019/01/17,https://www.congress.gov/congressional-record/...,By Mrs. WATSON COLEMAN:,H.R. 687. Congress has the power to ena...


In [5]:
# Show the complete text of the row labelled 90.
speech.loc[90,"text"]

"The SPEAKER pro tempore. Pursuant to clause 12(a) of rule I, the  Chair declares the House in recess until 2 p.m. today.   Accordingly (at 12 o'clock and 18 minutes p.m.), the House stood in  recess."

In [None]:
# based loosely on https://github.com/jchaskell/scraper-cr

In [15]:
import math


def entropy(string):
    """Calculate the Shannon entropy (in bits per character) of a string.

    Every distinct character is treated as a symbol whose probability is its
    relative frequency within ``string``.

    Args:
        string: The text to measure.

    Returns:
        float: The entropy in bits; ``0.0`` when ``string`` is empty.
    """
    # An empty input has no symbols; return 0.0 rather than dividing by zero.
    if not string:
        return 0.0

    # Tally all characters in one pass. Calling str.count once per distinct
    # character rescans the whole string every time.
    counts = {}
    for char in string:
        counts[char] = counts.get(char, 0) + 1

    total = len(string)
    probabilities = [count / total for count in counts.values()]

    # H = -sum(p * log2(p)) over the observed symbols.
    return -sum(p * math.log2(p) for p in probabilities)

In [1]:
import os, re, requests, time
from datetime import datetime, date, timedelta as td
from bs4 import BeautifulSoup
from time import sleep
from urllib.parse import urljoin
import csv
from tqdm import tqdm
import logging
# Send log records of level INFO and above to speeches.log, each prefixed with
# a timestamp and the level name.
logging.basicConfig(filename='speeches.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# Scrape window as "MM-DD-YYYY" strings (the field order daterange() parses).
# daterange() stops before end_date, so the end day itself is not included.
start_date = "01-01-2008"
end_date = "01-31-2008"

In [3]:
def daterange(start_date, end_date):
    """Yield every calendar day from ``start_date`` up to, but excluding, ``end_date``.

    Both arguments are "MM-DD-YYYY" strings. Nothing is yielded when the end
    date is not later than the start date.
    """
    # Pattern adapted from:
    # http://stackoverflow.com/questions/1060279/iterating-through-a-range-of-dates-in-python
    def _to_date(text):
        # Fields arrive as month, day, year; date() wants year, month, day.
        pieces = [int(piece) for piece in text.split("-")]
        return date(pieces[2], pieces[0], pieces[1])

    first_day = _to_date(start_date)
    last_day = _to_date(end_date)
    span = (last_day - first_day).days

    offset = 0
    while offset < span:
        yield first_day + td(offset)
        offset += 1

In [4]:
def scraper():
    """Build one Senate-section Congressional Record URL per day in the window.

    Reads the notebook-level ``start_date`` and ``end_date`` and returns a
    list with one URL for each day yielded by ``daterange``.
    """
    prefix = "https://www.congress.gov/congressional-record/"
    suffix = "/senate-section"
    return [
        prefix + day.strftime("%Y/%m/%d") + suffix
        for day in daterange(start_date, end_date)
    ]

In [5]:
def get_links(lst_urls):
    """Collect the article links listed on each daily Congressional Record page.

    Args:
        lst_urls: Iterable of daily section URLs to fetch.

    Returns:
        list[str]: Absolute article URLs, in the order they appear on the pages.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    links = []
    for url in lst_urls:
        # A timeout keeps a single stalled connection from hanging the crawl.
        page = requests.get(url, timeout=30)
        if not page.ok:
            # Skip error responses instead of parsing an error page as if it
            # were a listing.
            logging.warning("Skipping %s (HTTP %s)", url, page.status_code)
            continue

        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(page.content, "html.parser")

        # Only the even-indexed <td> cells are used as link cells.
        for cell in soup.find_all("td")[::2]:
            anchor = cell.a
            href = anchor.get("href") if anchor is not None else None
            if not href:
                # A cell without a usable link would otherwise raise
                # AttributeError (no <a>) or append the bare site root.
                continue
            links.append(urljoin("https://www.congress.gov/", href))
    return links

In [6]:
# Gather the article links for every daily URL built by scraper()
# (get_links issues one HTTP request per URL).
lst = get_links(scraper())


In [7]:
# Number of article links collected.
len(lst)

451

In [41]:
def scrape_content(lst_links, out_path="speeches_from_2002.csv"):
    """Download each article page and write one CSV row per article.

    Args:
        lst_links: Article URLs to fetch; each is expected to contain a
            ``YYYY/MM/DD`` date segment.
        out_path: Destination CSV file (overwritten if it exists). Defaults
            to the file name this function previously hard-coded.

    Each row has the columns ``date``, ``url``, ``title`` and ``text``. Pages
    that cannot be parsed are logged and skipped so that one odd page does not
    abort the whole run.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    date_pattern = re.compile(r"\d{4}\/\d{2}\/\d{2}")

    with open(out_path, "w", newline="", encoding="utf8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["date", "url", "title", "text"])
        writer.writeheader()

        # Use the links the caller passed in. The argument used to be
        # overwritten with a fresh get_links(scraper()) call, which ignored
        # the input and repeated the entire link crawl.
        for url in tqdm(lst_links):
            date_match = date_pattern.search(url)
            if date_match is None:
                logging.warning("No date found in URL, skipping: %s", url)
                continue
            speech_date = date_match.group(0)

            # A timeout keeps a single stalled connection from hanging the run.
            page = requests.get(url, timeout=30)
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            soup = BeautifulSoup(page.content, "html.parser")

            body = soup.find("pre", class_="styled")
            if body is None:
                logging.warning("No <pre class='styled'> block, skipping: %s", url)
                continue

            # Drop underscores and hyphens, then discard empty lines and
            # lines that start with "[".
            text = body.text.replace("_", "").replace("-", "")
            lines = [line for line in text.splitlines() if len(line) and not line.startswith("[")]
            if len(lines) < 2:
                logging.warning("Too few lines to extract a title, skipping: %s", url)
                continue

            # The second remaining line is the title; the rest is the body.
            title = lines[1].lstrip()
            text = " ".join(lines[2:]).strip()
            writer.writerow({"date": speech_date, "url": url, "title": title, "text": text})

In [42]:
# Crawl the daily pages for article links and hand them to scrape_content.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# the output file from that run is presumably incomplete -- confirm.
scrape_content(get_links(scraper()))


  0%|                                                                                        | 0/96089 [00:00<?, ?it/s][A
  0%|                                                                             | 1/96089 [00:00<17:10:59,  1.55it/s][A
  0%|                                                                             | 2/96089 [00:01<17:06:16,  1.56it/s][A
  0%|                                                                             | 3/96089 [00:02<18:02:50,  1.48it/s][A
  0%|                                                                             | 4/96089 [00:02<18:00:59,  1.48it/s][A
  0%|                                                                             | 5/96089 [00:03<17:57:23,  1.49it/s][A
  0%|                                                                             | 6/96089 [00:03<17:31:21,  1.52it/s][A
  0%|                                                                             | 7/96089 [00:04<17:49:36,  1.50it/s][A
  0%|          

KeyboardInterrupt: 

In [17]:
import pandas as pd

In [18]:
# Load the speeches CSV into a DataFrame and display it.
# NOTE(review): the rendered frame shows an "Unnamed: 0" column, which suggests
# the file was saved together with its index; index_col=0 would likely absorb
# it -- confirm before changing.
speech = pd.read_csv("speeches.csv")
speech

Unnamed: 0,date,url,title,text
0,2019/01/02,https://www.congress.gov/congressional-record/...,Senate,The Senate met at 4 p.m. and was called to ord...
1,2019/01/02,https://www.congress.gov/congressional-record/...,PRAYER,"The Chaplain, Dr. Barry C. Black, offered the ..."
2,2019/01/02,https://www.congress.gov/congressional-record/...,PLEDGE OF ALLEGIANCE,The Presiding Officer led the Pledge of Allegi...
3,2019/01/02,https://www.congress.gov/congressional-record/...,APPOINTMENT OF ACTING PRESIDENT PRO TEMPORE,The PRESIDING OFFICER. The clerk will please r...
4,2019/01/02,https://www.congress.gov/congressional-record/...,RESERVATION OF LEADER TIME,The ACTING PRESIDENT pro tempore. Under the pr...
...,...,...,...,...
577,2019/01/30,https://www.congress.gov/congressional-record/...,AMENDMENTS SUBMITTED AND PROPOSED,SA 81. Mr. BLUMENTHAL submitted an amendment i...
578,2019/01/30,https://www.congress.gov/congressional-record/...,TEXT OF AMENDMENTS,SA 81. Mr. BLUMENTHAL submitted an amendment i...
579,2019/01/30,https://www.congress.gov/congressional-record/...,"ORDERS FOR THURSDAY, JANUARY 31, 2019","Mr. McCONNELL. Mr. President, I ask unanimous ..."
580,2019/01/30,https://www.congress.gov/congressional-record/...,ADJOURNMENT UNTIL 10 A.M. TOMORROW,"Mr. McCONNELL. Mr. President, if there is no f..."


In [28]:
# (rows, columns) of the subset whose date is 2019/01/02.
speech[speech.date=="2019/01/02"].shape

(49, 4)

In [22]:
"https://www.congress.gov/congressional-record/2019/1/2/senate-section/article/S8052-3" in speech.url

False

In [20]:
# Show the URL stored in the row labelled 9.
speech.url.loc[9]

'https://www.congress.gov/congressional-record/2019/01/02/senate-section/article/S8052-2'

In [14]:
# Read the tab-separated, header-less coca_news.txt from `path`, supplying the
# column names explicitly.
df = pd.read_csv(
    path / "coca_news.txt",
    sep="\t",
    header=None,
    names=["textID", "words", "Year", "Newspaper", "Date (yyyyddmm)", "Title", "Section"],
)

In [16]:
# Write the frame to add_to_news.csv in the current working directory.
df.to_csv("add_to_news.csv")

In [10]:
# Directory holding the downloaded input files. It is derived from the current
# user's home directory so the notebook is not tied to one Windows account.
path = Path.home() / "Downloads"

In [7]:
import pandas as pd
from pathlib import Path

In [3]:
# Try to load the RData file with pyreadr.
# NOTE(review): the recorded run of this cell failed with
# "LibrdataError: Unable to read from file", and the path is specific to one
# machine -- confirm that the file exists and is readable.
results = pyreadr.read_r(r"C:\Users\danny\OneDrive\Desktop\Senate_104-113.RData")

LibrdataError: Unable to read from file

In [1]:
import pyreadr

In [129]:
# Extract the first YYYY/MM/DD date from the URL held in `st`.
re.search(r"\d{4}\/\d{2}\/\d{2}", st).group(0)

'2014/01/06'

In [109]:
# Sample article URL for trying out the date-extraction regex.
st = 'https://www.congress.gov/congressional-record/2014/01/06/senate-section/article/S3-2'

In [None]:
def main(args):
    """Parse command-line style arguments, build a ``scrapeCR`` and run it.

    Args:
        args: ``[directory, start_date]`` or
            ``[directory, start_date, end_date]``. Dates are "MM-DD-YYYY"
            strings. When no end date is supplied, today's date is used.

    Raises:
        IndexError: If fewer than two arguments are supplied.
        ValueError: If a date string does not describe a valid date.
    """
    def _parse_mdy(text):
        # Fields arrive as month, day, year; date() wants year, month, day.
        parts = text.split("-")
        return date(int(parts[2]), int(parts[0]), int(parts[1]))

    directory = args[0]

    # create dates
    start_date = _parse_mdy(args[1])
    if len(args) >= 3:
        end_date = _parse_mdy(args[2])
    else:
        # The previous fallback formatted today as day-month-year and then
        # parsed it as month-day-year, which swapped the two fields and raised
        # ValueError on any day after the 12th.
        end_date = date.today()

    # NOTE(review): scrapeCR is not defined in the visible cells; it is
    # assumed to accept (start_date, end_date, directory) -- confirm.
    scrape = scrapeCR(start_date, end_date, directory)
    scrape.scrape()

# NOTE(review): inside a notebook kernel `__name__` is also '__main__', so this
# guard runs against the kernel's own argv -- confirm it is wanted here.
if __name__ == '__main__':
    # Imported locally because none of the visible cells import sys.
    import sys

    # main() needs a directory and a start date, i.e. at least two arguments
    # after the program name.
    if len(sys.argv) < 3:
        print("Too few arguments")
        # Exit with a non-zero status so callers can detect the failure.
        sys.exit(1)
    else:
        args = sys.argv[1:]
        main(args)




In [6]:
# Construct a scrapeCR from two date strings and the current working directory.
# NOTE(review): main() passes date objects to scrapeCR, while this call passes
# "MM-DD-YYYY" strings -- confirm which form the class expects.
scrapeCR("01-01-2021", "03-01-2021", os.getcwd())

<__main__.scrapeCR at 0x209dd36f518>