# Scrape Article Data from the ProQuest Historical Newspapers™ Archive Using Scrapy

#### Scrapy Structure/Lingo:
**Spiders** extract data **items**, which Scrapy sends one by one to a configured **item pipeline** (if there is one) for post-processing.

## Import relevant packages...

In [3]:
import datetime
from datetime import timedelta
import csv
import numpy as np

import scrapy
import math
import logging
import json
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.item import Item, Field
from scrapy.selector import Selector

from scrapy.utils.response import open_in_browser
import itertools
import re
import os

datapath = ''

## Initial variables...

In [4]:
## add helpers to python path
    
# defines form of scraped items
class ArticleItem(scrapy.Item):
    """One search-result article, as populated by ``articleSpider.parse``."""
    
    # copied from the request meta built for the event row that triggered the search
    databaseindex = scrapy.Field()
    originalquery = scrapy.Field()
    originalstart = scrapy.Field()
    originalend = scrapy.Field()
    location = scrapy.Field()
    
    # describe the search that actually produced this result (may be a follow-up search)
    resultscount = scrapy.Field()
    query = scrapy.Field()
    querystart = scrapy.Field()
    queryend = scrapy.Field()
    parents = scrapy.Field()
    
    # scraped from the result entry itself
    searchindex = scrapy.Field()
    title = scrapy.Field()
    info = scrapy.Field()
    link  = scrapy.Field()
    
    # derived from those above; the spider stores str() of the date parsed out of ``info``
    daysFrom = scrapy.Field()
    
# appends every scraped item, one JSON object per line, to articles/articles.jl
class JsonWriterPipeline(object):
    """Item pipeline that appends each scraped item to a JSON-lines archive.

    The archive lives at ``<datapath>/articles/articles.jl`` and is opened in
    append mode, so items from earlier runs are kept.
    """

    # operations performed when spider starts
    def open_spider(self, spider):
        """Open the archive for appending, creating its directory if needed."""
        target = os.path.join(datapath, 'articles/articles.jl')
        # append mode creates a missing file but not a missing parent directory,
        # so a first run would otherwise fail before anything is scraped
        os.makedirs(os.path.dirname(target), exist_ok=True)
        self.file = open(target, 'a')

    # when the spider finishes
    def close_spider(self, spider):
        """Close the archive file opened by ``open_spider``."""
        self.file.close()

    # when the spider yields an item
    def process_item(self, item, spider):
        """Write ``item`` as one JSON line and pass it on unchanged.

        Returns the same ``item`` object so later pipeline stages still see it.
        """
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

# load the archive written by earlier runs (if any) and set it up for easy scanning
try:
    articles = []
    with open(os.path.join(datapath, 'articles/articles.jl')) as archive:
        articles.extend(json.loads(record) for record in archive)
    # order by event row first, then by position within that event's search results
    articles.sort(key=lambda entry: (int(entry['databaseindex']), int(entry['searchindex'])))
    # parallel array of event indices, used to mask ``articles`` per event
    databaseindices = np.array([entry['databaseindex'] for entry in articles])
    articles = np.array(articles)
except FileNotFoundError:
    # no archive yet
    articles = None

# number of keywords/phrases to require in a search result
x = 3
# distance used in the NEAR/n operators of the generated queries
proximityparam = 200

## Helper functions...

In [5]:
def xof(x, options):
    """Build a query clause that matches when any ``x`` of ``options`` co-occur.

    Empty option strings are dropped and the remaining ones are parenthesised.
    Every combination of ``x`` options (or of all of them, if fewer than ``x``
    remain) is joined with ``NEAR/<proximityparam>``, and the combinations are
    OR-ed together inside an outer pair of parentheses.
    """
    groups = []
    for option in options:
        if len(option) > 0:
            groups.append('(' + option + ')')

    needed = min(len(groups), x)
    combos = itertools.combinations(groups, needed)
    connector = ' NEAR/{} '.format(proximityparam)

    alternatives = []
    for combo in combos:
        alternatives.append(connector.join(combo))
    return '((' + ') OR ('.join(alternatives) + '))'

# define a set of attack-type keywords, OR-ed together so that the presence of any one of them matches
def attackkeywords(attacktype1):
    """Return an OR-group of search terms for the attack-category code.

    ``attacktype1`` is compared as a string ('1' .. '8'). Any other value
    yields an empty string.
    """
    terms_by_category = {
        '1': ['assassin*'],
        '2': ['assault*', 'armed'],
        '3': ['bomb*', 'explo*'],
        '4': ['hijack*'],
        '5': ['hostage', 'barricade*'],
        '6': ['hostage*', 'kidnap*'],
        '7': ['facility', 'infrastructure', 'sabotage'],
        '8': ['assault*', 'unarmed'],
    }

    terms = terms_by_category.get(attacktype1, [])
    if not terms:
        return ''
    return '(' + ') OR ('.join(terms) + ')'
        
# search terms implied by each target-type code handled by targetkeywords
_TARGET_TYPE_TERMS = {
    '1': ('business',),
    '2': ('government', 'political'),
    '22': ('government', 'political'),
    '3': ('police',),
    '4': ('military',),
    '5': ('abortion',),
    '6': ('airport', 'aircraft'),
    '7': ('government', 'embass*', 'consul*'),
    '8': ('school', '"educational institution"', 'university', 'teach*', 'professor'),
    '9': ('supplies',),
    '10': ('journalist', 'reporter', 'media'),
    '11': ('maritime', 'fishing', '"oil tanker"', 'ferr*', 'yacht'),
    '12': ('NGO', '"non-governmental organization"'),
    '15': ('religious', 'church', 'mosque', 'synagogue', 'imam', 'priest', 'bishop'),
    '16': ('telecom*', 'transmitter', 'tower'),
    '18': ('tourist', '"tour bus*"', 'tour'),
    '19': ('"public transport*"',),
    '21': ('utilit*', '"power line"', 'pipeline', 'transformer', '"high tension line"',
           'substation', 'lamppost', '"street light"'),
}

# substring rewrites applied, in this order, to the target-subtype text
_TARGET_SUBTYPE_REWRITES = (
    ('/Other Personnel', ''),
    ('/Facility', ''),
    ('/Ethnicity Identified', ''),
    ('Religion Identified', 'Religious'),
)

# whole-value replacements for subtype texts that are too generic ('' drops them)
# or that need extra context to be useful as a search phrase
_TARGET_SUBTYPE_RENAMES = {
    'Labor Union Related': 'Labor Union/Union',
    'Affiliated Institution': '',
    'Named Citizen': '',
    'Other (including online news agencies)': '',
    'Other Personnel': '',
    'Clinics': 'Abortion Clinics',
    'Personnel': 'Abortion Personnel',
}


def targetkeywords(targtype1, targsubtype1_txt, corp1, target1):
    """Return an OR-group of search terms describing the target of an event.

    Terms come from four places: the target-type code, the target-subtype
    text (split on '/' into quoted phrases), the quoted ``corp1`` value and
    the quoted ``target1`` value. Returns '' when nothing usable is left.

    Behaviour corrected relative to the earlier version:
    * the subtype substring rewrites are now applied (``str.replace`` returns
      a new string; the results used to be discarded);
    * an empty subtype no longer contributes an empty quoted phrase ``""``;
    * a parenthesised qualifier is cut at the first '(' without raising on
      unbalanced parentheses;
    * repeated terms are emitted only once, keeping first-seen order.
    """
    keywords = list(_TARGET_TYPE_TERMS.get(targtype1, ()))

    subtype = targsubtype1_txt
    for old, new in _TARGET_SUBTYPE_REWRITES:
        subtype = subtype.replace(old, new)
    subtype = _TARGET_SUBTYPE_RENAMES.get(subtype, subtype)

    # drop a trailing "(...)" qualifier, e.g. "Police Building (headquarters, ...)"
    if '(' in subtype:
        subtype = subtype.partition('(')[0]

    for part in subtype.split('/'):
        part = part.strip()
        if part:
            keywords.append('"' + part + '"')

    if len(corp1) > 0:
        keywords.append('"' + corp1 + '"')

    if len(target1) > 0:
        keywords.append('"' + target1 + '"')

    # the same phrase often appears as both subtype and corp/target name
    unique = []
    for keyword in keywords:
        if keyword not in unique:
            unique.append(keyword)

    if not unique:
        return ''
    return '(' + ') OR ('.join(unique) + ')'
    
    
def perpkeywords(gname):
    """Return the perpetrator group name as a quoted, parenthesised phrase.

    Names of two characters or fewer, and the placeholder 'Unknown', give ''.
    """
    usable = len(gname) > 2 and gname != 'Unknown'
    if not usable:
        return ''
    return '("' + gname + '")'

def weaponkeywords(suicide, attacktype1, weaptype1, weaptype2):
    """Return an OR-group of search terms describing the weapon used.

    ``weaptype1`` (weapon type) and ``weaptype2`` (weapon sub-type) are string
    codes; each contributes at most one group of terms. Some groups are
    suppressed depending on the other arguments (see the ``*_suppressed``
    tables below). Returns '' when no term applies.
    """
    firearm_terms = ['firearm', 'gun']
    explosive_terms = ['bomb*', 'explo*']
    fire_terms = ['incendiary', 'arson', 'combustible', 'flammable', 'inflammable', 'fire']
    vehicle_terms = ['vehicle', 'car', 'bus', 'truck', 'van', 'automobile']

    terms_by_type = {
        '1': ['biological'],
        '2': ['chemical'],
        '3': ['radiological', 'radioactive', 'radiation'],
        '4': ['nuclear'],
        '5': firearm_terms,
        '6': explosive_terms,
        '7': ['fake'],
        '8': fire_terms,
        '9': ['melee'],
        '10': vehicle_terms,
        '11': ['sabotage'],
    }
    # weapon types whose terms are skipped for particular attack types
    type_suppressed = {
        '6': attacktype1 == '3',
        '11': attacktype1 == '7',
    }

    terms_by_subtype = {
        '1': ['poison*'],
        '2': ['automatic', 'semi-automatic'],
        '3': ['handgun'],
        '4': ['rifle', 'shotgun'],
        '5': firearm_terms,
        '6': firearm_terms,
        '7': ['grenade'],
        '8': ['mine'],
        '9': ['"parcel bomb"', '"mail bomb"', '"package bomb"', '"note bomb"', '"message bomb"',
              '"gift bomb"', '"present bomb"', '"delivery bomb"', '"surprise bomb"', '"postal bomb"',
              '"post bomb"'],
        '10': ['"pressure trigger"'],
        '11': ['projectile', 'rocket', 'mortar', 'RPG', 'missile'],
        '12': ['"remote device"', 'trigger', 'detonate'],
        '13': ['suicide'],
        '14': ['"time fuse"'],
        '15': vehicle_terms,
        '16': explosive_terms,
        '17': explosive_terms,
        '18': fire_terms,
        '19': ['molotov', '"petrol bomb"'],
        '20': ['gasoline', 'alcohol'],
        '21': ['blunt'],
        '22': ['fist', 'punch*', 'beat*', 'kick*'],
        '23': ['knife', 'sword', 'stab'],
        '24': ['rope', 'strangl*'],
        '26': ['suffocat*'],
        '28': ['dynamite', 'tnt'],
        '29': ['"sticky bomb"'],
        '30': ['explo*'],
    }
    # sub-types whose terms are skipped given the weapon type, attack type or suicide flag
    explosive_covered = weaptype1 == '6' or attacktype1 == '3'
    subtype_suppressed = {
        '5': weaptype1 == '5',
        '6': weaptype1 == '5',
        '13': suicide == '1',
        '15': weaptype1 == '10',
        '16': explosive_covered,
        '17': explosive_covered,
        '18': weaptype1 == '8',
    }

    terms = []
    if not type_suppressed.get(weaptype1, False):
        terms.extend(terms_by_type.get(weaptype1, []))
    if not subtype_suppressed.get(weaptype2, False):
        terms.extend(terms_by_subtype.get(weaptype2, []))

    if not terms:
        return ''
    return '(' + ') OR ('.join(terms) + ')'

def misckeywords(attacktype1, ishostkid, ransom, suicide):
    """Return an OR-group of terms for the suicide, ransom and hostage flags.

    Each flag counts when it equals the string '1'. The hostage/kidnap terms
    are left out when ``attacktype1`` is '5' or '6'. Returns '' when no flag
    contributes a term.
    """
    hostage_attack = attacktype1 in ('5', '6')
    candidates = [
        (suicide == '1', ['suicide']),
        (ransom == '1', ['ransom']),
        (ishostkid == '1' and not hostage_attack, ['hostage*', 'kidnap*']),
    ]

    terms = []
    for applies, group in candidates:
        if applies:
            terms.extend(group)

    if not terms:
        return ''
    return '(' + ') OR ('.join(terms) + ')'
    
def testing():
    """Dry run: print the search query built for early rows of the events CSV.

    Reads ``<datapath>/events/terrorism.csv``, stops after row 34 (the header
    counts as row 1), skips events outside the United States and prints the
    generated query followed by the row number. No request is sent.

    The query built here requires only one keyword group (``xof(1, ...)``) and
    uses ``FT(city, state)``, so it is not the same query the spider sends.
    """
    # the with-block closes the CSV even when a row raises; newline='' is what
    # the csv module asks for so that quoted embedded newlines parse correctly
    with open(os.path.join(datapath, 'events/terrorism.csv'), newline='') as f:
        header = None

        def field(row, name):
            # look a value up by its column name in the header row
            return row[header.index(name)]

        # scanning through each row; counter is the 1-based row number
        for counter, line in enumerate(csv.reader(f), start=1):

            # store the header
            if counter == 1:
                header = line
                continue

            if counter > 34:
                break

            # skip this line if the attack wasn't in the US
            if field(line, 'country_txt') != 'United States':
                continue

            # location and date
            location = field(line, 'city') + ', ' + field(line, 'provstate')
            # a day of '0' falls back to the first of the month
            # NOTE(review): the original author flagged this fallback as wrong
            day = field(line, 'iday')
            d0 = datetime.datetime(int(field(line, 'iyear')),
                                   int(field(line, 'imonth')),
                                   int(day) if day != str(0) else 1)
            d1 = d0 + timedelta(days=50)

            query = ('FT(' + location + ') AND PD(' + d0.strftime('%Y%m%d') + '-' +
                     d1.strftime('%Y%m%d') + ') AND ' +
                     xof(1, [
                         attackkeywords(field(line, 'attacktype1')),
                         targetkeywords(field(line, 'targtype1'),
                                        field(line, 'targsubtype1_txt'),
                                        field(line, 'corp1'),
                                        field(line, 'target1')),
                         perpkeywords(field(line, 'gname')),
                         weaponkeywords(field(line, 'suicide'),
                                        field(line, 'attacktype1'),
                                        field(line, 'weaptype1'),
                                        field(line, 'weapsubtype1')),
                         misckeywords(field(line, 'attacktype1'),
                                      field(line, 'ishostkid'),
                                      field(line, 'ransom'),
                                      field(line, 'suicide')),
                         '(terroris*)'
                     ]))

            print(query)
            print(counter)

## Define spider...

In [6]:
# number of events for which a search has been started in this session
totalprocessed = 0


class articleSpider(scrapy.Spider):
    """Search ProQuest for articles about each US event in the events CSV.

    Request flow per event:
    ``start_requests`` -> ``startform`` -> ``query`` -> ``parsePages`` ->
    ``parse`` (one call per result page). ``parse`` yields ``ArticleItem``
    objects and, when the site reports its result cap, starts a follow-up
    search whose ``parents`` counter is one higher.

    The request ``meta`` carries the search description through the chain.
    ``meta['missing']`` is either the string 'All' or a set of result indices
    that are not in the archive yet.
    """
    name = 'terrorism'
    custom_settings = {'HTTPERROR_ALLOWED_CODES': [500],
                       'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1},
                       'LOG_LEVEL': 'WARNING'}

    def start_requests(self):
        """Yield one initial request per US event that still has missing results.

        Reads ``<datapath>/events/terrorism.csv``. ``databaseindex`` is the
        1-based row number in that file, with the header as row 1. Rows whose
        date fields cannot form a valid date are skipped with a warning rather
        than aborting the whole generator.
        """
        global totalprocessed

        # the with-block closes the CSV once the generator is exhausted or discarded
        with open(os.path.join(datapath, 'events/terrorism.csv'), newline='') as f:
            header = None

            def field(row, name):
                # look a value up by its column name in the header row
                return row[header.index(name)]

            # scanning through each row; counter is the 1-based row number
            for counter, line in enumerate(csv.reader(f), start=1):

                # store the header
                if counter == 1:
                    header = line
                    continue

                # skip this line if the attack wasn't in the US
                if field(line, 'country_txt') != 'United States':
                    continue

                # location and date
                location = '"' + field(line, 'city') + '" OR "' + field(line, 'provstate') + '"'
                raw_year = field(line, 'iyear')
                raw_month = field(line, 'imonth')
                raw_day = field(line, 'iday')
                try:
                    # a day of '0' falls back to the first of the month
                    # NOTE(review): the original author flagged this fallback as wrong
                    d0 = datetime.datetime(int(raw_year), int(raw_month),
                                           int(raw_day) if raw_day != str(0) else 1)
                except ValueError:
                    # one bad row must not stop request generation for all later rows
                    logging.warning('skipping row %s: cannot build a date from %r-%r-%r',
                                    counter, raw_year, raw_month, raw_day)
                    continue
                d1 = d0 + timedelta(days=50)

                query = ('PD(' + d0.strftime('%Y%m%d') + '-' + d1.strftime('%Y%m%d') +
                         ') AND (' + location + ') NEAR/{} '.format(proximityparam) +
                         xof(x, [
                             attackkeywords(field(line, 'attacktype1')),
                             targetkeywords(field(line, 'targtype1'),
                                            field(line, 'targsubtype1_txt'),
                                            field(line, 'corp1'),
                                            field(line, 'target1')),
                             perpkeywords(field(line, 'gname')),
                             weaponkeywords(field(line, 'suicide'),
                                            field(line, 'attacktype1'),
                                            field(line, 'weaptype1'),
                                            field(line, 'weapsubtype1')),
                             misckeywords(field(line, 'attacktype1'),
                                          field(line, 'ishostkid'),
                                          field(line, 'ransom'),
                                          field(line, 'suicide')),
                             'terroris*'
                         ]))

                # work out which result indices are not archived yet for this event
                missing = 'All'
                if articles is not None:
                    known = articles[databaseindices == counter]
                    # result counts recorded by first-pass (parents == 0) searches
                    first_pass_counts = [int(a['resultscount']) for a in known if a['parents'] == 0]
                    # with nothing archived, or only follow-up records, redo the whole search
                    if np.size(known) != 0 and first_pass_counts:
                        count = min(first_pass_counts)
                        missing = set(range(1, count + 1)) - set(int(s['searchindex']) for s in known)

                # don't do any search if no results are missing for this event
                if not missing:
                    continue

                totalprocessed += 1
                logging.warning(totalprocessed)
                logging.warning(missing)
                logging.warning(counter)
                yield scrapy.Request('https://search.proquest.com/advanced.showresultpageoptions?site=news',
                                     callback=self.startform, dont_filter=True,
                                     meta={'originalquery': query, 'query': query, 'databaseindex': counter,
                                           'originalstart': d0, 'originalend': d1, 'location': location,
                                           'querystart': d0, 'queryend': d1, 'parents': 0, 'missing': missing}
                                     )

    # starts the form that must be filled out to search w/ our query
    def startform(self, response):
        """Request the advanced-search form, passing the search meta along."""
        yield scrapy.Request('https://search.proquest.com/news/advanced?accountid=13314',
                             callback=self.query, dont_filter=True, meta=response.meta)

    # fills out form and initiates search
    def query(self, response):
        """Fill the search form with ``meta['query']`` and submit it."""
        yield scrapy.FormRequest.from_response(response, dont_filter=True, formid='searchForm',
                                               formdata={'queryTermField': response.meta['query'],'fullTextLimit':'on',
                                                         'sortType':'DateAsc', 'includeDuplicate':'on'},
                                               callback=self.parsePages, clickdata={'id': 'searchToResultPage'},
                                               meta=response.meta)

    @staticmethod
    def _results_count(sel):
        """Return the hit count shown in the results header, or None if absent or unparsable."""
        header = sel.xpath("//h1[@id='pqResultsCount']/text()").extract()
        if not header:
            return None
        words = header[0].split()
        if not words:
            return None
        try:
            return int(words[0].replace(',', ''))
        except ValueError:
            return None

    @staticmethod
    def _parse_date(text):
        """Try the known date layouts of a result's info line; return a datetime or None."""
        after_bracket = text.find(']') + 1
        attempts = (
            (text[after_bracket:text.find(': ')], '%d %b %Y'),
            (text[after_bracket:], '%d %B %Y.'),
            (text[after_bracket:text.rfind(':')], '%d %B %Y'),
            (text[text.rfind('(') + 1:text.rfind(')')], '%b %d, %Y'),
        )
        for candidate, layout in attempts:
            try:
                return datetime.datetime.strptime(candidate, layout)
            except ValueError:
                continue
        return None

    # sets up inspection of each page of results generated by search
    def parsePages(self, response):
        """Yield a request for every result page that may hold missing results.

        The code assumes 100 results per page and never requests more than
        40 pages. With ``missing == 'All'`` every page is requested; otherwise
        only pages whose index range overlaps ``missing`` are requested.
        """
        sel = Selector(response)

        resultscount = self._results_count(sel)
        if resultscount is None:
            # e.g. an error page (HTTP 500 responses are allowed through to callbacks)
            logging.warning('no result count found at %s; skipping', response.url)
            return

        # ceiling division, so an exact multiple of 100 does not add an empty extra page
        pagecount = min(40, (resultscount + 99) // 100)
        urlparts = [response.url[:response.url.find('/1')+1], response.url[response.url.find('1?')+1:]]

        missing = response.meta['missing']
        # results of follow-up searches are numbered after the 4000 of each earlier search
        offset = response.meta['parents'] * 4000

        # what i do next depends on what's missing
        for i in range(pagecount):
            if missing == 'All':
                wanted = True
            else:
                first = i * 100 + 1 + offset
                last = min((i + 1) * 100, resultscount) + offset
                wanted = (not missing.isdisjoint(range(first, last + 1))
                          # the last page must be parsed for the follow-up search to be triggered
                          or (i + 1 == 40 and any(m > 4000 for m in missing)))

            if wanted:
                yield scrapy.Request(str(i+1).join(urlparts), callback=self.parse,
                                     dont_filter=True, meta=response.meta)

    def parse(self, response):
        """Yield an ``ArticleItem`` per wanted result on this page.

        When the page shows the "maximum number of search results" message,
        also yield a request that starts a follow-up search beginning at the
        last date parsed on this page.

        Raises:
            ValueError: if the per-result lists scraped from the page differ
                in length, which would mis-pair titles, links and indices.
        """
        sel = Selector(response)

        resultscount = self._results_count(sel)
        if resultscount is None:
            logging.warning('no result count found at %s; skipping', response.url)
            return

        indices = sel.xpath("//li[@class='resultItem ltr']/div/span[@class='indexing ']/text()").extract()
        titles = sel.xpath("//div[@class='results_list_copy  ']/h3/a/@title").extract()
        links = sel.xpath("//div[@class='results_list_copy  ']/h3/a/@href").extract()
        info = [(' '.join(path.xpath(".//span[@class='titleAuthorETC small']//text()").extract())).replace('\n', '')
                for path in sel.xpath("//div[@class='results_list_copy  ']")]

        # try to extract date from each article's info
        dates = [self._parse_date(each) for each in info]

        # every list must line up one-to-one; comparing summed lengths could never fail
        if not (len(titles) == len(links) == len(info) == len(indices)):
            raise ValueError('result lists differ in length at {}: indices={}, titles={}, links={}, info={}'.format(
                response.url, len(indices), len(titles), len(links), len(info)))

        missing = response.meta['missing']
        offset = response.meta['parents'] * 4000

        # now populate an ArticleItem() for each result
        for index, title, link, details, date in zip(indices, titles, links, info, dates):
            searchindex = int(index) + offset

            # but skip if missing parameter suggests that the articleitem has already been processed
            if missing != 'All' and searchindex not in missing:
                continue

            article = ArticleItem()

            # defined by database
            article['databaseindex'] = response.meta['databaseindex']
            article['originalquery'] = response.meta['originalquery']
            article['originalstart'] = str(response.meta['originalstart'])
            article['originalend'] = str(response.meta['originalend'])
            article['location'] = response.meta['location']

            # defined prior to or at start of search
            article['resultscount'] = resultscount + offset
            article['query'] = response.meta['query']
            article['querystart'] = str(response.meta['querystart'])
            article['queryend'] = str(response.meta['queryend'])
            article['parents'] = int(response.meta['parents'])

            # defined by item itself
            article['searchindex'] = searchindex
            article['title'] = title
            article['info'] = details
            article['link'] = link

            # derived from those above
            article['daysFrom'] = str(date)

            yield article

        # set up successive searches for when there are more than 4000 results
        limitstring = 'You have reached the maximum number of search results that are displayed.'
        limit = sel.xpath("//p[@class='errorMessageHeaderText']/text()")
        if limit and limitstring in limit.extract()[0]:
            parsed_dates = [d for d in dates if d is not None]
            if not parsed_dates:
                # without a date there is no starting point for the follow-up search
                logging.warning('result cap reached at %s but no date could be parsed; '
                                'not starting a follow-up search', response.url)
                return

            # work on a copy so this response's meta is left untouched
            meta = dict(response.meta)
            meta['parents'] += 1
            meta['querystart'] = parsed_dates[-1]
            # NOTE(review): this follow-up query uses a fixed keyword list rather than
            # the keyword groups of meta['originalquery'] — confirm this is intended
            meta['query'] = ('FT(' + meta['location'] +
                             ') AND (FT(shooting) OR FT(bombing) OR ' +
                             'FT(bomb) OR FT(violence) OR FT(murder) OR FT(terrorism)) ' +
                             'AND PD(' + meta['querystart'].strftime('%Y%m%d') + '-' +
                             meta['queryend'].strftime('%Y%m%d') + ')')
            yield scrapy.Request('https://search.proquest.com/advanced.showresultpageoptions?site=news',
                                 callback=self.startform, dont_filter=True, meta=meta)

In [7]:
# crawl with articleSpider, identifying as an old desktop browser
crawler_settings = {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
process = CrawlerProcess(crawler_settings)
process.crawl(articleSpider)
process.start()

2018-11-16 07:25:48 [scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: scrapybot)
2018-11-16 07:25:48 [scrapy.utils.log] INFO: Overridden settings: {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
2018-11-16 07:26:02 [scrapy.core.scraper] ERROR: Spider error processing <GET https://search.proquest.com/news/advanced/BA910B92F5F541B5PQ?accountid=13314> (referer: https://search.proquest.com/news/advanced?accountid=13314)
Traceback (most recent call last):
  File "/anaconda/lib/python3.6/site-packages/scrapy/utils/defer.py", line 102, in iter_errback
    yield next(it)
  File "/anaconda/lib/python3.6/site-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
    for x in result:
  File "/anaconda/lib/python3.6/site-packages/scrapy/spidermiddlewares/referer.py", line 339, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "/anaconda/lib/python3.6/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
  

In [None]:
PD(19751027-19751216) AND FT(Chicago, Illinois) NEAR/150 (((((bomb*) OR (explo*)) NEAR/150 ((business) OR ("Bank") OR ("Commerce") OR ("Bank")) NEAR/150 (("Fuerzas Armadas de Liberacion Nacional (FALN)"))) OR (((bomb*) OR (explo*)) NEAR/150 ((business) OR ("Bank") OR ("Commerce") OR ("Bank")) NEAR/150 (terroris*)) OR (((bomb*) OR (explo*)) NEAR/150 (("Fuerzas Armadas de Liberacion Nacional (FALN)")) NEAR/150 (terroris*)) OR (((business) OR ("Bank") OR ("Commerce") OR ("Bank")) NEAR/150 (("Fuerzas Armadas de Liberacion Nacional (FALN)")) NEAR/150 (terroris*))))

PD(19751027-19751216) AND FT(Chicago, Illinois) NEAR/150 (((bomb*) OR (explo*)) NEAR/150 ((business) OR ("Bank") OR ("Commerce") OR ("Bank")) NEAR/150 (("Fuerzas Armadas de Liberacion Nacional (FALN)")))

In [None]:
PD(19751027-19751216) AND FT(Chicago, Illinois) NEAR/150 (((bomb) OR (explosion)) NEAR/150 ((business) OR ("Bank") OR ("Commerce") OR ("Bank")) NEAR/150 (("Fuerzas Armadas de Liberacion Nacional (FALN)")))