# San Diego County News Parser
This code pulls RSS feeds from the San Diego County News Center (the official communications channel of San Diego County) and parses them for COVID announcements.

In [1]:
import feedparser
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
import pickle
import json

In [2]:
#### Functions for getting archived url lists
def get_page_links(article_main_object):
    """Collect the post URL of every ``<article>`` element on a listing page.

    Args:
        article_main_object: Parsed page exposing ``findAll`` (a
            BeautifulSoup-style object).

    Returns:
        list: For each article, in page order, the ``href`` of the first
        ``<a>`` inside its first ``<h2>`` whose class is
        ``"entry-title card-title"``.

    Raises:
        IndexError: If an article has no such heading, or the heading holds
            no ``<a>`` element.
    """
    hrefs = []
    for article in article_main_object.findAll("article"):
        # Only the first matching heading / first anchor is ever consulted.
        title_heading = article.findAll("h2", class_="entry-title card-title")[0]
        title_anchor = title_heading.findAll("a")[0]
        hrefs.append(title_anchor.get("href"))
    return hrefs


def get_lastpage(article_main_object):
    """Return the number of the last page of an archive listing.

    The value is read from the second-to-last link with class
    ``"page-numbers"`` inside the first ``div`` with class ``"pagination"``
    (presumably the very last link is a "next" control -- confirm against the
    live markup). The page number is taken from that link's second child
    node (``contents[1]``).

    Args:
        article_main_object: Parsed page exposing ``findAll`` (a
            BeautifulSoup-style object).

    Returns:
        The page-number text of the last page, suitable for ``int()``.
        ``"1"`` is returned when the page has no pagination block or the
        block holds fewer than two page links, so callers treat the listing
        as a single page instead of failing with ``IndexError``.
    """
    pagination_blocks = article_main_object.findAll("div", class_="pagination")
    if not pagination_blocks:
        # No pagination block on the page: treat it as a single-page listing.
        return "1"
    page_links = pagination_blocks[0].findAll("a", class_="page-numbers")
    if len(page_links) < 2:
        # Not enough links to identify a "last page" entry.
        return "1"
    return page_links[-2].contents[1]


def get_archive_links(baseurl, daterange):
    """Gather article URLs from every page of each dated archive listing.

    Args:
        baseurl: Site root that each date path is appended to.
        daterange: Iterable of date path fragments (e.g. ``"2020/06/"``).

    Returns:
        list: All links returned by ``get_page_links`` for the first page of
        each archive and for pages 2..last, in crawl order.
    """
    collected = []
    for date_path in daterange:
        archive_url = baseurl + date_path
        first_page = BeautifulSoup(requests.get(archive_url).text, "html.parser")
        collected.extend(get_page_links(first_page))
        page_total = int(get_lastpage(first_page))
        # Page 1 was fetched above; follow-up pages live under ".../page/<n>/".
        for page_number in range(2, page_total + 1):
            response = requests.get(archive_url + "page/" + str(page_number) + "/")
            next_page = BeautifulSoup(response.text, "html.parser")
            collected.extend(get_page_links(next_page))
            # Pause half a second after each follow-up page request.
            time.sleep(0.5)
    return collected


#### Functions for getting article content
def get_authors(mainarticle):
    """Build one author record per ``span`` with class ``"author vcard"``.

    The span text is split on ``", "``: the first piece is used as the
    author's name and the last piece as the affiliation name (so text without
    a comma yields the same string for both).

    Args:
        mainarticle: Parsed article exposing ``findAll`` (a
            BeautifulSoup-style object).

    Returns:
        list[dict]: Records with ``type``, ``name`` and ``affiliation`` keys;
        ``givenName`` / ``familyName`` are added only for names made of two
        or three space-separated words (the last word is the family name).
    """
    # NOTE(review): other records in this notebook use "@type": "<Thing>";
    # the "type": "@Person" spelling below is kept as-is -- confirm intent.
    people = []
    for byline in mainarticle.findAll("span", class_="author vcard"):
        pieces = byline.text.split(", ")
        full_name, organisation = pieces[0], pieces[-1]
        person = {
            "type": "@Person",
            "name": full_name,
            "affiliation": {"name": organisation},
        }
        words = full_name.split(" ")
        if len(words) in (2, 3):
            # Everything before the final word counts as the given name.
            person["givenName"] = " ".join(words[:-1])
            person["familyName"] = words[-1]
        people.append(person)
    return people


def get_keywords(mainarticle):
    """Return the ``content`` of every ``<meta property="article:tag">`` tag.

    Args:
        mainarticle: Parsed article exposing ``findAll`` (a
            BeautifulSoup-style object).

    Returns:
        list: One entry per matching meta tag, in document order.
    """
    tag_nodes = mainarticle.findAll("meta", {"property": "article:tag"})
    return [node.get("content") for node in tag_nodes]


def get_basic_info(mainarticle):
    """Extract the core metadata of an article from its ``<meta>`` tags.

    Args:
        mainarticle: Parsed article page (the caller in this file passes a
            ``BeautifulSoup`` object).

    Returns:
        dict: ``name`` (og:title), ``description`` (og:description),
        ``datePublished``, ``url`` (og:url), ``keywords``, ``author`` and
        ``dateModified``.

    Raises:
        AttributeError: If a required tag (og:title, og:description, og:url)
            is missing, or if a date tag and its ``og:updated_time``
            fallback are both missing.
    """

    def meta_content(prop):
        # find() gives None when the tag is absent, so .get() then raises
        # AttributeError -- the fallbacks below rely on exactly that.
        return mainarticle.find("meta", {"property": prop}).get("content")

    article_title = meta_content("og:title")
    article_description = meta_content("og:description")
    # Only a missing tag should trigger the fallback; a bare ``except`` would
    # also swallow KeyboardInterrupt and unrelated errors.
    try:
        datePublished = meta_content("article:published_time")
    except AttributeError:
        datePublished = meta_content("og:updated_time")
    try:
        dateModified = meta_content("article:modified_time")
    except AttributeError:
        dateModified = meta_content("og:updated_time")
    url = meta_content("og:url")
    authors = get_authors(mainarticle)
    keywords = get_keywords(mainarticle)
    return {
        "name": article_title,
        "description": article_description,
        "datePublished": datePublished,
        "url": url,
        "keywords": keywords,
        "author": authors,
        "dateModified": dateModified,
    }


def get_other_meta(mainarticle):
    """Return the article's og:type and the text of its body container.

    Args:
        mainarticle: Parsed article exposing ``find`` (a BeautifulSoup-style
            object).

    Returns:
        dict: ``articleType`` holds the ``content`` of the ``og:type`` meta
        tag; ``articleContent`` holds the ``.text`` of the ``div`` with class
        ``"entry-content"``.
    """
    extra = {}
    type_tag = mainarticle.find("meta", {"property": "og:type"})
    extra["articleType"] = type_tag.get("content")
    body_div = mainarticle.find("div", class_="entry-content")
    extra["articleContent"] = body_div.text
    return extra


## Functions for getting new posts
def get_update_urls():
    """Return the ``id`` of every entry currently in the news RSS feed.

    Returns:
        list: Entry ids in feed order, as parsed by ``feedparser`` from
        ``https://www.countynewscenter.com/news/rss``.
    """
    feed = feedparser.parse("https://www.countynewscenter.com/news/rss")
    return [entry["id"] for entry in feed.entries]


## Functions for parsing results
def parse_page(linklist):
    """Download each article, build its announcement record and save it as JSON.

    For every URL the page is fetched and parsed, the results of
    ``get_basic_info`` and ``get_other_meta`` are merged over a fixed
    JSON-LD context, the San Diego County location is attached, and the
    record is written to ``data/archive/<_id>.json``.

    The ``_id`` is the location id with ``-`` replaced by ``.``, followed by
    ``_``, the publication date without dashes, ``.``, and the publication
    time (the part between ``T`` and the first ``-``) with ``:`` replaced
    by ``.``.

    Args:
        linklist: Iterable of article URLs.

    Returns:
        list: URLs that could not be fetched, parsed or written.
    """
    failures = []
    location = {"name": "San Diego County", "_id": "USA_US-CA_06073"}
    context = {
        "@type": "SpecialAnnouncement",
        "@context": {
            "schema": "http://schema.org/",
            "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
            "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
            "bts": "http://schema.biothings.io/",
            "xsd": "http://www.w3.org/2001/XMLSchema#",
            "owl": "http://www.w3.org/2002/07/owl/",
            "niaid": "https://discovery.biothings.io/view/niaid/",
            "outbreak": "https://discovery.biothings.io/view/outbreak/",
        },
    }
    for eachlink in linklist:
        try:
            rawresult = requests.get(eachlink)
            mainarticle = BeautifulSoup(rawresult.text, "html.parser")
            # Shallow copy is enough: the nested "@context" is never mutated.
            basicdict = context.copy()
            basicdict.update(get_basic_info(mainarticle))
            basicdict.update(get_other_meta(mainarticle))
            basicdict["location"] = location
            timestring = basicdict["datePublished"].split("T")[1].split("-")[0]
            datestring = basicdict["datePublished"].split("T")[0]
            basicdict["_id"] = (
                basicdict["location"]["_id"].replace("-", ".")
                + "_"
                + datestring.replace("-", "")
                + "."
                + timestring.replace(":", ".")
            )
            with open("data/archive/" + basicdict["_id"] + ".json", "w") as json_file:
                json.dump(basicdict, json_file)
        except Exception:
            # Best-effort crawl: record the failing URL and move on.
            # ``Exception`` (rather than a bare ``except``) lets
            # KeyboardInterrupt / SystemExit stop a long-running crawl.
            failures.append(eachlink)
        # Pause between requests regardless of success.
        time.sleep(0.5)
    return failures


#### Functions for getting most recent 10 results
def run_update():
    """Process feed entries not seen before and extend the stored link list.

    Reads the current feed ids, loads the previously stored links from
    ``data/link_list.txt`` (a pickle file), parses the ids that are not yet
    stored, and rewrites the file with the new ids placed before the old
    ones.

    Returns:
        None
    """
    new_results = get_update_urls()  ## Get new urls
    ## Load previous urls; the context manager closes the file handle.
    ## The pickle file is written by this notebook -- never point this at
    ## data from an untrusted source.
    with open("data/link_list.txt", "rb") as link_file:
        archivedlinks = pickle.load(link_file)
    ## Check if they've been run before (set gives constant-time lookups)
    already_seen = set(archivedlinks)
    new_urls = [x for x in new_results if x not in already_seen]
    ## Process the new urls
    ## NOTE(review): the failures returned by parse_page are discarded, so
    ## URLs that failed are still stored below and are not retried later.
    parse_page(new_urls)
    ## update previous list
    all_links = new_urls + archivedlinks
    with open("data/link_list.txt", "wb") as dmpfile:
        pickle.dump(all_links, dmpfile)

In [3]:
#### Main script
# Archive months to crawl; each entry is appended to baseurl to form a listing URL.
# Full 2020 range, kept for reference:
# daterange = ['2020/01/','2020/02/','2020/03/','2020/04/','2020/05/','2020/06/','2020/07/','2020/08/']
daterange = ["2020/06/"]
baseurl = "https://www.countynewscenter.com/"
# Name spelled out in full ("County") to match the location used by parse_page.
location = {"name": "San Diego County", "_id": "USA_US-CA_06073"}

In [7]:
# Crawl the archive listing pages for every entry in daterange and collect
# the article URLs they link to.
archivedlinks = get_archive_links(baseurl, daterange)
# print(len(archivedlinks))
# Persist the collected URLs (pickled list) so later update runs can tell
# which links have already been processed.
with open("data/link_list.txt", "wb") as dmpfile:
    pickle.dump(archivedlinks, dmpfile)

In [3]:
# Parse every archived link; parse_page returns the URLs it could not process.
# Requires archivedlinks from the crawl cell to be defined in this session.
failures = parse_page(archivedlinks)

NameError: name 'archivedlinks' is not defined

## Run an update

In [3]:
## Get new urls
new_results = get_update_urls()

## Load previous urls (context manager so the file handle is closed)
with open("data/link_list.txt", "rb") as link_file:
    archivedlinks = pickle.load(link_file)

## Check if they've been run before
new_urls = [x for x in new_results if x not in archivedlinks]

## Process the new urls
parse_page(new_urls)

## update previous list
all_links = new_urls + archivedlinks
with open("data/link_list.txt", "wb") as dmpfile:
    pickle.dump(all_links, dmpfile)

In [3]:
# Same steps as the manual update cell, wrapped in a single function call.
run_update()

### Unit tests / draft code

In [None]:
# Draft: fetch the news RSS feed and inspect the fields of its second entry.
newsfeed = feedparser.parse("https://www.countynewscenter.com/news/rss")
entry = newsfeed.entries[1]

# Show which keys the entry offers, then the ones of interest.
print(entry.keys())
print(entry["published"])
print(entry["id"])
print(entry["tags"])
print(entry["summary"])

In [None]:
# Draft: fetch the first archive listing and pull out the pieces that
# get_page_links (articles) and get_lastpage (pagination) work with.
rawresult = requests.get(baseurl + daterange[0])
archivemain = BeautifulSoup(rawresult.text, "html.parser")
articlelist = archivemain.findAll("article")
pageobject = archivemain.findAll("div", class_="pagination")

In [None]:
# Draft: fetch a single article and merge its basic metadata into basicdict.
r = requests.get(
    "https://www.countynewscenter.com/housing-assistance-for-those-in-need/"
)
mainarticle = BeautifulSoup(r.text, "html.parser")
# datePublished = mainarticle.find("meta",{"property":"article:published_time"}).get("content")
# NOTE(review): basicdict is not assigned in any visible cell (it only exists
# as a local inside parse_page), so this cell needs it defined beforehand.
basicdict.update(get_basic_info(mainarticle))
print(basicdict)

In [None]:
# Draft: show the article body text that get_other_meta stores as articleContent.
print(mainarticle.find("div", class_="entry-content").text)

In [24]:
# Draft: take the first listing entry for interactive inspection.
# NOTE(review): newslist is not assigned at module level in any visible cell
# (it is a local of get_mayor_urls) -- it must exist in the session already.
eachitem = newslist[0]

/mayor/news/releases/san-diego-secures-28m-state-project-homekey-funds-create-new-homeless-housing


In [32]:
def get_mayor_urls(year):
    """List the news releases shown on the City of San Diego mayor's page.

    Args:
        year: Value whose ``str()`` is appended to the listing URL.

    Returns:
        list[dict]: One record per ``div`` with class ``"twelve columns"``
        inside ``<main>``, holding ``name`` (link text), ``datePublished``
        (text of the ``p`` with class ``"date"``) and ``url`` (site root plus
        the link's ``href``).
    """
    site_root = "https://www.sandiego.gov"
    # NOTE(review): no "/" is inserted between "releases" and the year --
    # confirm this is the intended listing URL.
    listing = requests.get("https://www.sandiego.gov/mayor/news/releases" + str(year))
    listing_soup = BeautifulSoup(listing.text, "html.parser")
    rows = listing_soup.find("main").findAll("div", class_="twelve columns")
    releases = []
    for row in rows:
        href = row.find("a").get("href")
        published = row.find("p", class_="date").text
        headline = row.find("a").text
        record = {"name": headline, "datePublished": published, "url": site_root + href}
        releases.append(record)
    return releases

In [33]:
# Fetch the mayor's news-release listing for 2020 and show the records.
linklist = get_mayor_urls(2020)
print(linklist)

[{'name': 'San Diego Secures $28M in State “Project Homekey” Funds to Create New Homeless Housing', 'datePublished': '08/26/2020', 'url': 'https://www.sandiego.gov/mayor/news/releases/san-diego-secures-28m-state-project-homekey-funds-create-new-homeless-housing'}, {'name': 'San Diego Recognizing 100th Anniversary of Women’s Right to Vote by Illuminating Local Landmarks', 'datePublished': '08/23/2020', 'url': 'https://www.sandiego.gov/mayor/news/releases/san-diego-recognizing-100th-anniversary-women’s-right-vote-illuminating-local-landmarks'}, {'name': 'Mayors Faulconer And Liccardo Issue Joint Statement Urging Californians To Avoid Exodus Of Ride-Share Companies', 'datePublished': '08/19/2020', 'url': 'https://www.sandiego.gov/mayor/news/releases/mayors-faulconer-and-liccardo-issue-joint-statement-urging-californians-avoid-exodus-ride'}, {'name': 'City Opens “Cool Zones” As Temperatures Soar Across Region', 'datePublished': '08/18/2020', 'url': 'https://www.sandiego.gov/mayor/news/rele

In [48]:
# Draft: fetch the first listed release and sketch the fields of its record.
rawresult = requests.get(linklist[0]["url"])
archivemain = BeautifulSoup(rawresult.text, "html.parser")
description = archivemain.find("meta", attrs={"name": "description"}).get("content")
# The <title> text is split on " | "; per the recorded output, part 1 is the
# mayor's name and part 2 the site name.
title = archivemain.find("title").text.split(" | ")
author = {
    "@type": "Organization",
    "name": "Office of " + title[1],
    "affiliation": {"name": title[2]},
}
contentmain = archivemain.find("main")
# articleType and articleContent are assigned here but not used further in
# this cell.
articleType = "press release"
articleContent = contentmain.text
print(author)
print(description)
print("======================")

# Dump the raw <main> element to inspect its markup.
print(contentmain)

{'@type': 'Organization', 'name': 'Office of Mayor Kevin L. Faulconer', 'affiliation': {'name': 'City of San Diego Official Website'}}
Wednesday, August 26, 2020 - NEWS RELEASE San Diego - With the goal of creating additional housing units for San Diegans experiencing homelessness, the City of San Diego and San Diego Housing Commission (SDHC) were notified Tuesday that the State of California has reserved $27.7 million in Project Homekey Program funds for purchasing a hotel to transform it into permanent
<main>
<div class="nine columns no-gutters">
<div class="entry__content l-padding-desktop-am l-padding-mobile-bd">
<h1>San Diego Secures $28M in State “Project Homekey” Funds to Create New Homeless Housing</h1> <h2><div class="field field-name-field-subtitle field-type-text field-label-hidden"><div class="field-items"><div class="field-item even">City, Housing Commission to Transform Hotel into Permanent Housing Units Accompanied by County-Funded Services</div></div></div></h2> <div cl