# Accessing Data via Web APIs: Solutions

In [None]:
# Import required libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datetime import datetime
from pynytimes import NYTAPI

In [None]:
# Put your API key here (paste the key string between the quotes)
api_key = ""

In [None]:
# Or, read your key from a local file so it never appears in the notebook.
# Set path_to_key to the location of a text file containing only the key.
path_to_key = ""
with open(path_to_key, "r", encoding="utf-8") as f:
    # Strip surrounding whitespace: text files usually end with a newline,
    # which would otherwise be sent to the API as part of the key.
    api_key = f.read().strip()

In [None]:
# Create the NYT API client that every request below goes through.
# NOTE(review): parse_dates=True presumably makes pynytimes return parsed
# date objects instead of strings — confirm against the pynytimes docs.
nyt = NYTAPI(api_key, parse_dates=True)

### Challenge 1: Find the top stories for a section

- Choose 2 sections. Grab their top stories and store them in two separate lists.
- How many stories are in each section?
- What is the title of the first story in each list?

In [None]:
# Sports: fetch the top stories for the section and report how many came back
section = "sports"
top_sports_stories = nyt.top_stories(section=section)
print(f"There are {len(top_sports_stories)} {section} stories.")

In [None]:
# Grab the first story in the sports list and pull out its title
top_sport_story = top_sports_stories[0]
top_sport_story_title = top_sport_story["title"]
# Bare expression so the notebook shows the title as the cell output
top_sport_story_title

In [None]:
# Technology: fetch the top stories for the section and report how many came back
section = "technology"
top_tech_stories = nyt.top_stories(section=section)
print(f"There are {len(top_tech_stories)} {section} stories")

In [None]:
# Grab the first story in the technology list and pull out its title
top_tech_story = top_tech_stories[0]
top_tech_story_title = top_tech_story["title"]
# Bare expression so the notebook shows the title as the cell output
top_tech_story_title

### Challenge 2: Most Shared Stories

The `most_shared` method is similar to `most_viewed`, except that it has an argument called `method`, which selects whether to show the articles most shared via `'email'` or `'facebook'`.

- Grab the most shared articles for both methods for the past month.
- How many articles show up in both lists? (Hint: use the `uri` key)
- Bonus: Use the [Shared Article](https://developer.nytimes.com/docs/most-popular-product/1/types/SharedArticle) schema table to help you answer a question you may have about the data.

| Attribute      | Data Type | Definition      |
| ----------- | ----------- | ----------- |
| url      | string       | Article's URL.       |
| adx_keywords   | string        | Semicolon separated list of keywords.        |
| subsection   | string        | Article's subsection (e.g. Politics). Can be empty |
| column   | string        | Deprecated. Set to null.        |
| eta_id   | integer        | Deprecated. Set to 0.|
| section   | string        | Article's section (e.g. Sports).        |
| id   | integer        | Asset ID number (e.g. 100000007772696).        |
| asset_id   | integer        | Asset ID number (e.g. 100000007772696).        |
| nytdsection   | string        | Article's section|
| byline   | string        | Article's byline (e.g. By Thomas L. Friedman).        |
| type   | string        | Asset type (e.g. Article, Interactive, ...).        |
| title   | string        | Article's headline (e.g. When the Cellos Play, the Cows Come Home).        |
| abstract   | string        | Brief summary of the article.|
| published_date   | string        | When the article was published on the web (e.g. 2021-04-19).        |
| source   | string        | Publisher (e.g. New York Times).        |
| updated   | string        | When the article was last updated (e.g. 2021-05-12 06:32:03).|
| des_facet   | array        | Array of description facets (e.g. Quarantine (Life and Culture)).        |
| org_facet   | array        | Array of organization facets (e.g. Sullivan Street Bakery).        |
| per_facet   | array        | Array of person facets (e.g. Bittman, Mark).        |
| geo_facet   | array        | Array of geographic facets (e.g. Canada).        |
| media   | array        | Array of images.        |
| media.type   | string        | Asset type (e.g. image).        |
| media.subtype   | string        | Asset subtype (e.g. photo).        |
| media.caption   | string        | Media caption        |
| media.copyright   | string        | Media credit        |
| media.approved_for_syndication   | boolean        | Whether media is approved for syndication.        |
| media.media-metadata   | array        | Media metadata (url, width, height, ...).        |
| media.media-metadata.url   | string        | Image's URL.        |
| media.media-metadata.format   | string        | Image's crop name     |
| media.media-metadata.height   | integer        | Image's height |
| media.media-metadata.width   | integer        | Image's width      |
| uri   | string        | An article's globally unique identifier.      |

In [None]:
# Most shared articles over the past 30 days, one list per sharing method
email = nyt.most_shared(days=30, method='email')
facebook = nyt.most_shared(days=30, method='facebook')

In [None]:
# Collect each article's `uri` so the two lists can be compared by identifier
email_ids = [article["uri"] for article in email]
facebook_ids = [article["uri"] for article in facebook]

In [None]:
# Number of articles whose identifier appears in both lists
len(set(email_ids) & set(facebook_ids))

### Challenge 3: Article Searching

- Retrieve a set of articles for a query of your choice.
- Use a relevant time interval in constructing your `dates` dictionary
- Use `type_of_material` and `section_name` as keys in your `options` dictionary.
    - For `type_of_material` values refer to this [list](https://github.com/michadenheijer/pynytimes/blob/main/VALID_SEARCH_OPTIONS.md#type-of-material-values).
    - For `section_name` values refer to this [list](https://github.com/michadenheijer/pynytimes/blob/main/VALID_SEARCH_OPTIONS.md#section-name-values).

In [None]:
# Example: search for COVID coverage during the first 6 months of the pandemic
query = "COVID-19"

# Time window for the search, passed to the API as a begin/end pair
begin = datetime(year=2020, month=2, day=15)
end = datetime(year=2020, month=8, day=15)
date_dict = {"begin": begin, "end": end}

# Search filters and ordering.
# NOTE(review): the challenge text also asks for a `section_name` key; this
# example filters on `sources` and `type_of_material` only.
options_dict = {
    "sort": "oldest",
    "sources": ["New York Times", "AP"],
    "type_of_material": [
        "News Analysis",
        "News",
        "Article",
        "Column",
        "Editorial",
        "Front Page",
    ],
}

# Request up to 100 matching articles
articles = nyt.article_search(
    query=query,
    results=100,
    dates=date_dict,
    options=options_dict,
)

In [None]:
# Report how many articles the search returned for the query
print(f"{len(articles)} articles about {query}.")
# Print the main headline of the first article in the results
print(articles[0]['headline']['main'])

### Challenge 4: Additional Data Analysis

- What are the 3 most positive and negative texts?
- Using the VADER thresholds for positive, neutral, and negative, how many articles qualify for each of those labels?

In [None]:
from pyprojroot import here
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Load the saved election articles from the project's data folder
# (pickle files should only ever be loaded from a trusted source)
df = pd.read_pickle(here("data/election2020_articles.pkl"))

# One analyzer instance is reused for every article
analyzer = SentimentIntensityAnalyzer()

# Score each lead paragraph and keep only the "compound" value in df
df["sentiment"] = df["lead_paragraph"].apply(
    lambda text: analyzer.polarity_scores(text)["compound"]
)

In [None]:
# Headlines of the three texts with the highest sentiment score
df.sort_values("sentiment", ascending=False)["headline.main"].head(3).tolist()

In [None]:
# Headlines of the three texts with the lowest sentiment score
df.sort_values("sentiment", ascending=True)["headline.main"].head(3).tolist()

In [None]:
# Counts of positive, negative, and neutral texts
def bin_func(x):
    """Label a VADER compound score as positive, negative, or neutral.

    Follows the thresholds given in the VADER documentation: a compound
    score of 0.05 or more is positive, -0.05 or less is negative, and
    anything strictly in between is neutral. Both bounds are inclusive, so
    a score of exactly 0.05 or -0.05 is not counted as neutral.

    Args:
        x: Compound sentiment score (a number).

    Returns:
        One of the strings "positive", "negative", or "neutral".
    """
    if x >= 0.05:
        return "positive"
    if x <= -0.05:
        return "negative"
    return "neutral"
# Tally how many texts fall under each sentiment label
df["sentiment"].apply(bin_func).value_counts()