In [1]:
import pickle
import pandas
from collections import Counter, defaultdict
import operator
import utils
from geopy.geocoders import Nominatim
from newspaper import Article
import requests
from urllib.parse import urlencode, quote_plus
import time

In [2]:
import classes

In [3]:
# (pickle path, gunviolencearchive.org report URL) pairs, one per category.
# Despite the variable name, each tuple holds the path first and the URL second.
urls_and_paths = [
    ('frames/children_killed',
     'http://www.gunviolencearchive.org/children-killed'),
    ('frames/children_injured',
     'http://www.gunviolencearchive.org/children-injured'),
    ('frames/teens_killed',
     'http://www.gunviolencearchive.org/teens-killed'),
    ('frames/teens_injured',
     'http://www.gunviolencearchive.org/teens-injured'),
    ('frames/accidental_deaths',
     'http://www.gunviolencearchive.org/accidental-deaths'),
    ('frames/accidental_injuries',
     'http://www.gunviolencearchive.org/accidental-injuries'),
    ('frames/accidental_deaths_children',
     'http://www.gunviolencearchive.org/accidental-child-deaths'),
    ('frames/accidental_injuries_children',
     'http://www.gunviolencearchive.org/accidental-child-injuries'),
    ('frames/accidental_deaths_teens',
     'http://www.gunviolencearchive.org/accidental-teen-deaths'),
    ('frames/accidental_injuries_teens',
     'http://www.gunviolencearchive.org/accidental-teen-injuries'),
    ('frames/officer_involved_shootings',
     'http://www.gunviolencearchive.org/officer-involved-shootings'),
    ('frames/mass_shootings_2013',
     'http://www.gunviolencearchive.org/reports/mass-shootings/2013'),
    ('frames/mass_shootings_2014',
     'http://www.gunviolencearchive.org/reports/mass-shootings/2014'),
    ('frames/mass_shootings_2015',
     'http://www.gunviolencearchive.org/reports/mass-shootings/2015'),
    ('frames/mass_shootings',
     'http://www.gunviolencearchive.org/mass-shooting'),
]

# Corpus identifier; not referenced by the cells visible in this notebook.
CORPUS_NAME = 'the_violent_corpus'

In [4]:
# Unpickle every per-category frame listed in urls_and_paths and stack them
# into one dataframe. Only the path half of each pair is used here.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load frames that come from a trusted source.
frames = []
for df_path, url in urls_and_paths:
    with open(df_path, 'rb') as infile:
        frames.append(pickle.load(infile))
df = pandas.concat(frames)
# Last expression of the cell: displays the number of rows after concatenation.
len(df)

5485

In [5]:
def get_sources(dataframe):
    """
    Gather the distinct sources referenced by the rows of a dataframe.

    For every row, the value of 'source_url' and each element obtained by
    iterating over 'incident_sources' are added to one set.

    :param dataframe: pandas dataframe with 'source_url' and
        'incident_sources' columns
    :return: set containing every collected source, without duplicates
    """
    collected = set()
    for _, record in dataframe.iterrows():
        collected |= {record['source_url'], *record['incident_sources']}
    return collected

In [6]:
def generate_archive_uri(article_uri, timeout=30):
    """
    Look up the closest archive.org (Wayback Machine) snapshot of a URL.

    Builds a query against the availability API, prints the request URI and
    inspects the 'closest' snapshot in the JSON reply.

    :param article_uri: URL of the original article
    :param timeout: seconds passed to requests.get so that a stalled
        connection cannot hang the notebook forever
    :return: the snapshot URL when the closest snapshot is flagged available
        with status '200'; otherwise '' (always a string, never None)
    :raises requests.RequestException: on connection failure or timeout
    :raises ValueError: if the reply body is not valid JSON
    """
    archive_api = 'http://archive.org/wayback/available?'
    encoded_uri = archive_api + urlencode({'url': article_uri})
    print(encoded_uri)

    response = requests.get(encoded_uri, timeout=timeout)
    payload = response.json()

    try:
        closest = payload['archived_snapshots']['closest']
        if closest['available'] and closest['status'] == '200':
            return closest['url']
    except (KeyError, TypeError):
        # The reply has no 'closest' snapshot, or not the expected nesting:
        # treat it as "nothing archived" instead of hiding every error.
        pass
    # Single falsy result for every "no usable snapshot" case.
    return ''

# Try the lookup on a single article URL; the result is reused by the next cell.
archive_uri = generate_archive_uri(
    'http://www.wdam.com/story/27682998/1-dead-4-injured-in-nightclub-shooting'
)

http://archive.org/wayback/available?url=http%3A%2F%2Fwww.wdam.com%2Fstory%2F27682998%2F1-dead-4-injured-in-nightclub-shooting


In [7]:
def download_and_parse_uri(url, language='en'):
    """
    Download an article with newspaper and wrap it in a classes.NewsItem.

    :param url: address of the page to download and parse
    :param language: language code handed to newspaper's Article
        (defaults to 'en', the value that was previously hard-coded)
    :return: classes.NewsItem built from the article's title, its text
        (as content) and, as dct, the publish_date or, when that is falsy,
        the 'date' entry of the page's meta data
    """
    # The language has to be passed by keyword: the second positional
    # parameter of Article is `title`, so Article(url, language) stored 'en'
    # as the title and never set the language.
    article = Article(url, language=language)
    article.download()
    article.parse()

    return classes.NewsItem(
        title=article.title,
        content=article.text,
        dct=article.publish_date or article.meta_data['date'],
    )

# Download and parse the snapshot located for the sample article.
item = download_and_parse_uri(archive_uri)

In [None]:
# Show the dct stored on the sample NewsItem: the article's publish_date, or
# the 'date' meta value when no publish_date was extracted.
print(item.dct)

2014/12/21


In [None]:
# Walk over every distinct source, look up its archive.org snapshot and
# download it, pausing after each printed title for manual confirmation.
sources = get_sources(df)
for source in sources:
    # Skip anything that is not a non-empty string (e.g. '', None or NaN), so
    # that no meaningless lookups are sent to the API.
    if not isinstance(source, str) or not source:
        continue

    archive_uri = generate_archive_uri(source)
    if archive_uri:
        try:
            item = download_and_parse_uri(archive_uri)
        except Exception as error:
            # One unreachable or unparsable page must not abort the crawl;
            # report it and move on to the next source.
            print("Could not download or parse {}: {}".format(archive_uri, error))
        else:
            print(item.title)
            input('continue?')
    else:
        print("No archive.org versions found!")

    # Wait between requests on every iteration, including misses and
    # failures, so consecutive lookups never hit the API back-to-back.
    time.sleep(10)

http://archive.org/wayback/available?url=http%3A%2F%2Fpix11.com%2F2016%2F10%2F30%2F2-dead-in-newburgh-shooting-gunman-at-large-police%2F
2 dead, 5 injured in Newburgh Halloween party shooting
continue?
http://archive.org/wayback/available?url=http%3A%2F%2Fwww.khou.com%2Fnews%2Fcrime%2Frobbery-suspect-shot-at-raising-canes-in-the-woodlands%2F355202865
Deputy kills Raising Cane's robbery suspect; 2 customers hurt
continue?
http://archive.org/wayback/available?url=http%3A%2F%2Fwww.ksla.com%2Fstory%2F29517923%2Favoyelles-parish-man-kills-7-year-old-daughter-himself
Avoyelles Parish man kills 7-year-old daughter, himself - KSLA News 12 Shreveport, Louisiana News Weather & Sports
