## Challenge¶
Do a little scraping or API-calling of your own. Pick a new website and see what you can get out of it. Expect that you'll run into bugs and blind alleys, and rely on your mentor to help you get through.

Formally, your goal is to write a scraper that will:

1. Return specific pieces of information (rather than just downloading a whole page)
2. Iterate over multiple pages/queries
3. Save the data to your computer

Once you have your data, compute some statistical summaries and/or visualizations that give you some new insights into your scraping topic of interest. Write up a report from scraping code to summary and share it with your mentor.

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess


class WikiSpider(scrapy.Spider):
    name = "WS"
  
    
    
    # Here is where we insert our API call.
    start_urls = [
        'https://en.wikipedia.org/w/api.php?action=query&format=xml&prop=linkshere&titles=Monty_Python&lhprop=title%7Credirect'
        ]

    # Identifying the information we want from the query response and extracting it using xpath.
    def parse(self, response):
        for item in response.xpath('//lh'):
            # The ns code identifies the type of page the link comes from.  '0' means it is a Wikipedia entry.
            # Other codes indicate links from 'Talk' pages, etc.  Since we are only interested in entries, we filter:
            if item.xpath('@ns').extract_first() == '0':
                yield {
                    'title': item.xpath('@title').extract_first() 
                    }
        # Getting the information needed to continue to the next ten entries.
        next_page = response.xpath('continue/@lhcontinue').extract_first()
        
        # Recursively calling the spider to process the next ten entries, if they exist.
        if next_page is not None:
            next_page = '{}&lhcontinue={}'.format(self.start_urls[0],next_page)
            yield scrapy.Request(next_page, callback=self.parse)
            
    
process = CrawlerProcess({
    'FEED_FORMAT': 'json',
    'FEED_URI': 'PythonLinks.json',
    # Note that because we are doing API queries, the robots.txt file doesn't apply to us.
    'ROBOTSTXT_OBEY': False,
    'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True,
    'LOG_ENABLED': False,
    # We use CLOSESPIDER_PAGECOUNT to limit our scraper to the first 100 links.    
    'CLOSESPIDER_PAGECOUNT' : 10
})
                                         

# Starting the crawler with our spider.
process.crawl(WikiSpider)
process.start()
print('First 100 links extracted!')

In [1]:
import scrapy
import json

from scrapy.crawler import CrawlerProcess

class NWSSpider(scrapy.Spider):
    name = "NWS"

    # Here is where we insert our API call.
    start_urls = ['https://api.weather.gov/gridpoints/TOP/31,80/forecast']

    # Didn't understand xpath, but I found a Youtube video where the guy used this method to
    # extract the fields he wanted, and I DO understand this.
    def parse(self, response):
        jsonresponse = json.loads(response.text)
        for idx,feature in enumerate(jsonresponse['features']):
            yield {
                #'timestamp': feature['properties']['timestamp'],
                #'textDescription': feature['properties']['textDescription'],
                #'temperatureValue': feature['properties']['temperature']['value'],
                #'temperatureUnitCode': feature['properties']['temperature']['unitCode'],
                #'relativeHumidityValue': feature['properties']['relativeHumidity']['value'],
            }


process = CrawlerProcess({
    'FEED_FORMAT': 'json',
    'FEED_URI': 'weather.json',
    # Note that because we are doing API queries, the robots.txt file doesn't apply to us.
    'ROBOTSTXT_OBEY': False,
    'USER_AGENT': 'Data science class project (ecboyes@gmail.com)',
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True,
    'LOG_ENABLED': False,
})
                                         

# Starting the crawler with our spider.
process.crawl(NWSSpider)
process.start()
print('Sucessful API call!')

Sucessful API call!


In [2]:
import pandas as pd

# Checking whether we got data 

nws= pd.read_json('weather.json', lines=True)

#nws= pd.read_json('weather.json', lines=True, orient='records')

#nws= pd.read_json('weather.json', orient=str)

print(nws.shape)
print(nws.tail())

(0, 0)
Empty DataFrame
Columns: []
Index: []


In [10]:
import requests

weather = requests.get('https://api.weather.gov/gridpoints/TOP/31,80/forecast')
#print(weather.content)
jsonresponse = json.loads(weather.content)
jsonresponse.keys()
jsonresponse['properties'].keys()

dict_keys(['updated', 'units', 'forecastGenerator', 'generatedAt', 'updateTime', 'validTimes', 'elevation', 'periods'])