In [2]:
import urllib.request
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from tqdm import tqdm
import psycopg2
from sqlalchemy import create_engine
import sys
from dbWrapper import dbWrapper

In [3]:
# You will need to store your db credentials in ./db.cred, written as a Python
# assignment to a dictionary named "creds".
# WARNING: exec() runs whatever code that file contains, so only point this at
# a db.cred file you trust.
# The file is read inside a `with` block so its handle is closed right away.
with open('./db.cred') as cred_file:
    exec(cred_file.read())

In [4]:
# Fill the URL template from the `creds` dictionary.  Without .format() the
# literal text "{user}", "{password}", ... is passed to SQLAlchemy as-is.
# NOTE(review): assumes creds has the keys user, password, host and dbname --
# confirm against your db.cred file.
engine = create_engine(
    'postgresql://{user}:{password}@{host}/{dbname}'.format(**creds),
    echo=False,
)

In [5]:
# Instantiate the project-local dbWrapper (imported from the dbWrapper module);
# a later cell calls db.execute(...) on it to run a SQL query.
db = dbWrapper()

In [3]:
#local_news['file_name'] = local_news['name'].map(lambda x: x.lower().replace(' ', '_') + '.html')

In [4]:
def getRss(file_name):
    """Return the RSS feed URL advertised by a saved front page, if any.

    Parses ./front_pages/<file_name> and looks up the first
    <link type="application/rss+xml"> element.

    Args:
        file_name: name of an HTML file inside ./front_pages.

    Returns:
        The href of that element, or None when the file cannot be read or
        the page has no such link / the link has no href.
    """
    try:
        with open('./front_pages/%s' % file_name) as f:
            bs = BeautifulSoup(f, 'html.parser')
    except (OSError, UnicodeDecodeError):
        # Best-effort lookup: a missing or unreadable page simply has no feed.
        return None

    rss_tag = bs.find('link', {'type': 'application/rss+xml'})
    if rss_tag is None:
        return None
    return rss_tag.get('href')

In [10]:
# Load one row per publication name (DISTINCT ON (name)) from the local_news
# table, skipping rows whose rss_link is the literal string 'None'.
# NOTE(review): db.execute presumably returns a pandas DataFrame (later cells
# call .apply / .iloc on the result) -- confirm in dbWrapper.
local_news = db.execute("SELECT DISTINCT ON (name) * FROM local_news WHERE rss_link != 'None'")

In [11]:
# Make all hashes positive (and reproducible between runs)
def posHash(x):
    """Return a stable, non-negative 63-bit integer hash of ``x``.

    The built-in hash() of a str is salted per interpreter process, so ids
    derived from it change from run to run.  This uses SHA-256 instead, so the
    same input always yields the same id.

    Args:
        x: value to hash.  bytes are hashed as-is; anything else is hashed via
           the UTF-8 encoding of str(x).

    Returns:
        An int in the range [0, 2**63 - 1], i.e. it fits a signed 64-bit
        integer column.
    """
    import hashlib  # local import keeps this cell self-contained

    data = x if isinstance(x, bytes) else str(x).encode('utf-8')
    digest = hashlib.sha256(data).digest()
    # Keep the low 63 bits of the first 8 digest bytes so the result is never negative.
    return int.from_bytes(digest[:8], 'big') & 0x7FFFFFFFFFFFFFFF

def getArticles(row):
    """Download one publication's RSS feed and return its articles.

    Args:
        row: mapping / Series providing 'rss_link', 'name' and 'id'.

    Returns:
        DataFrame with the columns headline, pub_date, description, link,
        pub_name, pub_id and id, holding one row per distinct id.

    Raises:
        requests.RequestException: the request failed, timed out, or the
            server answered with an HTTP error status.
        AttributeError: an <item> lacks a title, pubdate, description or
            link element.
    """
    # this is a bug many sites seem to have--including this line prevents any news articles from being included
    rss_link = row['rss_link'].replace('&k[]=%23topstory', '')
    # A timeout keeps one dead server from hanging the whole pull.
    r = requests.get(rss_link, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page as an empty feed.
    r.raise_for_status()
    bs = BeautifulSoup(r.text, 'html5lib')

    records = [
        (
            item.find('title').text,
            item.find('pubdate').text,
            item.find('description').text,
            # The URL is read from the node that follows <link> rather than
            # from inside it -- presumably because the HTML parser treats
            # <link> as an empty element.
            item.find('link').next_element.strip(),
        )
        for item in bs.find_all('item')
    ]
    articles = pd.DataFrame(records, columns=['headline', 'pub_date', 'description', 'link'])

    articles['pub_date'] = pd.to_datetime(articles['pub_date'])
    articles['pub_name'] = row['name']
    articles['pub_id'] = row['id']

    # the primary id for the rss_articles table is the positive
    # hash of the concatenated pub_name and headline
    articles['id'] = (articles['pub_name'] + articles['headline']).map(posHash)

    # drop duplicates, keeping the first copy of each article
    # (keep=False would discard every copy of a repeated article)
    return articles.drop_duplicates('id', keep='first')

In [12]:
# Some of the rss links don't include the base url
def addBaseUrl(row):
    """Return an absolute feed URL for a publication row.

    Args:
        row: mapping / Series providing 'rss_link' and 'url' (the site's
             base URL).

    Returns:
        row['rss_link'] unchanged when it already starts with http:// or
        https://, otherwise the link resolved against row['url'].
    """
    from urllib.parse import urljoin  # local import keeps this cell self-contained

    rss_link = row['rss_link']
    # Test the scheme prefix: a relative link may contain "http" elsewhere,
    # e.g. inside a query string.
    if rss_link.lower().startswith(('http://', 'https://')):
        return rss_link
    # urljoin inserts or collapses the slash between base and path as needed.
    return urljoin(row['url'], rss_link)

# Overwrite rss_link with the value addBaseUrl returns for each row (axis=1 = row-wise).
local_news['rss_link'] = local_news.apply(addBaseUrl, axis=1)

In [423]:
# Open the progress and error logs in 'w' mode, which truncates existing files.
# The pull loop in the next cell writes to and flushes these handles; no
# visible cell closes them.
logfile = open('pull_rss.log', 'w')
errorfile = open('pull_rss.error', 'w')

In [431]:
# Rebuild the rss_articles table from scratch, one publication at a time.
# NOTE(review): `cur`, `conn` and `rss_news` are not defined in any visible
# cell -- they must exist in the kernel before this cell runs; confirm where
# they come from (rss_news may be what other cells call local_news).
cur.execute("DROP TABLE IF EXISTS rss_articles")
conn.commit()

# Track table creation explicitly: keying on "first row" would skip the
# primary key whenever the first feed fails.
table_created = False

for _, pub in tqdm(rss_news.iterrows(), total=len(rss_news)):
    try:
        articles = getArticles(pub)
        if not table_created:
            articles.to_sql('rss_articles', engine, if_exists='replace', index=False)
            cur.execute("ALTER TABLE rss_articles ADD PRIMARY KEY (id)")
            conn.commit()
            table_created = True
        else:
            articles.to_sql('rss_articles', engine, if_exists='append', index=False)
        logfile.write("Pulled %d articles for %s\n" % (len(articles), pub['name']))
        logfile.flush()
    except Exception as err:
        # `except Exception` lets Ctrl-C (KeyboardInterrupt) stop the loop.
        # Roll back so a failed statement does not leave the connection in an
        # aborted transaction for the following feeds.
        conn.rollback()
        # Record why the feed failed, not only which one.
        errorfile.write("Error pulling articles from %s: %r\n" % (pub['rss_link'], err))
        errorfile.flush()

176it [00:59,  2.94it/s]


In [24]:
# Strip the '&k[]=%23topstory' query fragment from every rss_link
# (the same replacement getArticles applies to each row's link).
local_news['rss_link'] = local_news['rss_link'].map(lambda x: x.replace('&k[]=%23topstory', ''))

In [26]:
# Inspect the rss_link of the first row.
local_news.iloc[0].rss_link

'http://www.aikenstandard.com/search/?f=rss&t=article&l=50&s=start_time&sd=desc'

In [14]:
import re

In [18]:
# Scratch value: a feed URL ending in "/feed", used to try out slicing.
x = "https://www.ohio.com/feed"
# Bare string literal kept for reference; it is not assigned to any name.
"http://www.aikenstandard.com/search/?f=rss&t=article&l=50&s=start_time&sd=desc&k[]=%23topstory"

In [21]:
# Drop the last four characters of x (the trailing "feed").
x[:-4]

'https://www.ohio.com/'

In [32]:
# Count the rows whose rss_link getOpinionLink maps to null (no rule matched).
# getOpinionLink is defined in a later cell of this notebook.
len(local_news[local_news['rss_link'].map(getOpinionLink).isnull()])

79

In [46]:
# Same count of rows getOpinionLink cannot map, re-run in a separate cell.
len(local_news[local_news['rss_link'].map(getOpinionLink).isnull()])

39

In [86]:
# Same count of rows getOpinionLink cannot map, re-run in a separate cell.
len(local_news[local_news['rss_link'].map(getOpinionLink).isnull()])

12

In [85]:
def getOpinionLink(rss_link):
    """Guess the opinion-section feed URL for a publication's main RSS URL.

    The rules are tried in order and the first match wins:
      * ends with '/feed' or '/feed/'  -> '.../opinion/feed'
      * contains '?f=rss'              -> append '&c=opinion'
      * contains 'rss2.0.xml'          -> 'opinion/rss2.0.xml' in its place
      * contains '?feed='              -> text before it + '?feed=opinion'
      * contains '?widgetName'         -> insert 'opinion/' before the query
      * contains '/rss/headlines'      -> cut after it and add '/opinion'
      * contains 'thetimes-tribune'    -> one fixed URL

    Args:
        rss_link: the publication's main RSS feed URL (str).

    Returns:
        The guessed opinion feed URL, or None when no rule matches.
    """
    if rss_link.endswith("/feed"):
        return rss_link[:-len("feed")] + "opinion/feed"
    if rss_link.endswith("/feed/"):
        # Strip only "feed/" so the preceding slash survives; cutting all six
        # characters produced e.g. "https://site.comopinion/feed".
        return rss_link[:-len("feed/")] + "opinion/feed"
    if '?f=rss' in rss_link:
        return rss_link + "&c=opinion"
    if 'rss2.0.xml' in rss_link:
        return rss_link.replace('rss2.0.xml', 'opinion/rss2.0.xml')
    if '?feed=' in rss_link:
        ix = rss_link.index('?feed=')
        return rss_link[:ix] + '?feed=opinion'
    if '?widgetName' in rss_link:
        ix = rss_link.index('?widgetName')
        return rss_link[:ix] + 'opinion/' + rss_link[ix:]
    if '/rss/headlines' in rss_link:
        ix = rss_link.index('/rss/headlines')
        return rss_link[:ix] + '/rss/headlines/opinion'
    if 'thetimes-tribune' in rss_link:
        return 'http://thetimes-tribune.com/cmlink/opinion-from-thetimes-tribune-com-1.8278'
    return None

In [93]:
# Show the rss_link of the last row that getOpinionLink maps to null.
local_news[local_news['rss_link'].map(getOpinionLink).isnull()].iloc[-1]['rss_link']

'http://www.thetimes-tribune.com/cmlink/adam-ripppon-1.2291384'

In [60]:
# Keep the rows getOpinionLink maps to null in the scratch name x.
# Other cells of this notebook rebind x to plain URL strings.
x = local_news[local_news['rss_link'].map(getOpinionLink).isnull()]

In [68]:
# Scratch value: a feed URL with a '?widgetName' query, used to test slicing.
x = 'http://www.charlotteobserver.com/?widgetName=rssfeed&widgetContentId=8167599&getXmlFeed=true'

In [71]:
# Everything from index 33 onward (33 is the value x.index('?widgetName') returns in a neighbouring cell).
x[33:]

'?widgetName=rssfeed&widgetContentId=8167599&getXmlFeed=true'

In [69]:
# Position at which the '?widgetName' query starts in x.
x.index('?widgetName')

33

In [None]:
# Python strings have no indexOf(); str.find is the closest equivalent and
# returns -1 (instead of raising like str.index) when the text is absent.
# NOTE(review): the search text is a guess taken from the neighbouring cells -- confirm.
x.find('?widgetName')

In [57]:
# Extract the value of the ?feed= query parameter from x.
# Raw string: "\?" and "\d" are invalid escapes in a normal string literal
# and trigger a warning on current Python versions.
re.findall(r'\?feed=([A-Za-z\d_-]*)', x)

['news_top5']