### Import Required Libraries and Set Up Environment Variables

In [69]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json

In [71]:
# Set environment variables from the .env in the local environment
load_dotenv('.env')

nyt_api_key = os.getenv('NYT')
tmdb_api_key = os.getenv('TMDB')


### Access the New York Times API

In [73]:
# Set the base URL
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# Filter for movie reviews with "love" in the headline
# section_name should be "Movies"
# type_of_material should be "Review"
filter_query = 'section_name:"Movies" AND type_of_material:"Review" AND headline:"love"'

# Use a sort filter, sort by newest
sort = "newest"

# Select the following fields to return:
# headline, web_url, snippet, source, keywords, pub_date, byline, word_count
field_list = "headline,web_url,snippet,source,keywords,pub_date,byline,word_count"

# Search for reviews published between a begin and end date
begin_date = "20130101"
end_date = "20230531"

# Build URL
query_url = (
    f"{url}api-key={nyt_api_key}&begin_date={begin_date}&end_date={end_date}"
    + f'&fq={filter_query}&sort={sort}&fl={field_list}'
)

In [77]:
# Print `response_data variable`
response = requests.get(query_url)
response

#checking to see what response I am getting

<Response [200]>

In [49]:
# Create an empty list to store the reviews
results_list = []

# loop through pages 0-19
for page_number in range(20):   

    # Set up the base query URL
    query_url = f"{url}api-key={nyt_api_key}&begin_date={begin_date}&end_date={end_date}"
   
    # create query with a page number
    query_url_with_page = f'{query_url}&page={page_number}'
       
    # API results show 10 articles at a time
    print(f"Checked page: {page_number}")
    
    # Make a "GET" request and retrieve the JSON
    response = requests.get(query_url_with_page).json()
    
    # Add a twelve second interval between queries to stay within API query limits
    time.sleep(12)
   
    if not results["response"]["docs"]:
        print(f"No results on page {page_number}")
        break  # Break from the loop if no results
    try:
        # If there are results, loop through and save each review
        for review in results["response"]["docs"]:
            results_list.append(review)
    except: 
        # Print the reviews added from the current page
        print(f"Page {page_number} reviews added to list.")

Checked page: 0
Checked page: 1
Checked page: 2
Checked page: 3
Checked page: 4
Checked page: 5
Checked page: 6
Checked page: 7
Checked page: 8
Checked page: 9
Checked page: 10
Checked page: 11
Checked page: 12
Checked page: 13
Checked page: 14
Checked page: 15
Checked page: 16
Checked page: 17
Checked page: 18
Checked page: 19


In [35]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data

for review in results_list[:5]:
    print(json.dumps(results, indent=4))  


{
    "status": "OK",
    "copyright": "Copyright (c) 2024 The New York Times Company. All Rights Reserved.",
    "response": {
        "docs": [
            {
                "abstract": "The gunman used a semiautomatic handgun that is essentially a shortened version of an AR-15-style rifle. He had purchased it legally six days before the attack.",
                "web_url": "https://www.nytimes.com/2021/03/26/us/boulder-shooter-motive.html",
                "snippet": "The gunman used a semiautomatic handgun that is essentially a shortened version of an AR-15-style rifle. He had purchased it legally six days before the attack.",
                "lead_paragraph": "BOULDER, Colo. \u2014 Investigators searching for answers after the mass shooting in Boulder, Colo., this week still do not know why a gunman shot and killed 10 people at a crowded grocery store, the police chief said on Friday.",
                "print_section": "A",
                "print_page": "18",
                "sour

In [79]:
# Convert reviews_list to a Pandas DataFrame using json_normalize()

import pandas as pd
from pandas import json_normalize

# Convert results_list to a Pandas DataFrame
reviews_df = json_normalize(results_list)

# Display the DataFrame
reviews_df.head()




Unnamed: 0,abstract,web_url,snippet,lead_paragraph,print_section,print_page,source,multimedia,keywords,pub_date,...,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization,subsection_name
0,The gunman used a semiautomatic handgun that i...,https://www.nytimes.com/2021/03/26/us/boulder-...,The gunman used a semiautomatic handgun that i...,"BOULDER, Colo. — Investigators searching for a...",A,18.0,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","[{'name': 'subject', 'value': 'Boulder, Colo, ...",2021-03-26T17:39:13+0000,...,,,Motive in Boulder Shooting of 10 People Is Sti...,,,,By Jack Healy and Nicholas Bogel-Burroughs,"[{'firstname': 'Jack', 'middlename': None, 'la...",,
1,The nonpartisan budget office also said that i...,https://www.nytimes.com/2023/02/15/us/politics...,The nonpartisan budget office also said that i...,WASHINGTON — The Treasury Department’s ability...,,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","[{'name': 'subject', 'value': 'National Debt (...",2023-02-15T19:00:19+0000,...,,,,,,,By Alan Rappeport,"[{'firstname': 'Alan', 'middlename': None, 'la...",,Politics
2,Retailers used to absorb much of the cost of g...,https://www.nytimes.com/2021/04/29/business/co...,Retailers used to absorb much of the cost of g...,Procter & Gamble is raising prices on items li...,B,1.0,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","[{'name': 'subject', 'value': 'United States E...",2021-04-29T09:00:30+0000,...,,,"The Price Is Right, And Rising",,,,By Gillian Friedman,"[{'firstname': 'Gillian', 'middlename': None, ...",,
3,"The pullout, under the terms of a deal that gi...",https://www.nytimes.com/2014/05/08/world/middl...,"The pullout, under the terms of a deal that gi...","BEIRUT, Lebanon — Syria’s third-largest city, ...",A,1.0,The New York Times,"[{'rank': 0, 'subtype': 'wide', 'caption': Non...","[{'name': 'subject', 'value': 'Middle East and...",2014-05-07T11:11:40+0000,...,,,Syrian Rebels Agree to Leave a Stronghold,,,,By Anne Barnard,"[{'firstname': 'Anne', 'middlename': None, 'la...",,Middle East
4,Contestants in this contest win by revealing h...,https://parenting.blogs.nytimes.com/2015/11/11...,Contestants in this contest win by revealing h...,"“Please,” the woman says, staring directly int...",,,The New York Times,"[{'rank': 0, 'subtype': 'watch308', 'caption':...","[{'name': 'subject', 'value': 'Contests and Pr...",2015-11-11T18:14:21+0000,...,Motherlode,,,,,,By Kj Dell'Antonia,"[{'firstname': 'Kj', 'middlename': None, 'last...",,


In [25]:
# Extract the title from the "headline.main" column and
# save it to a new column "title"
# Title is between unicode characters \u2018 and \u2019. 
# End string should include " Review" to avoid cutting title early


In [None]:
# Extract 'name' and 'value' from items in "keywords" column
def extract_keywords(keyword_list):
    extracted_keywords = ""
    for item in keyword_list:
        # Extract 'name' and 'value'
        keyword = f"{item['name']}: {item['value']};" 
        # Append the keyword item to the extracted_keywords list
        extracted_keywords += keyword
    return extracted_keywords

# Fix the "keywords" column by converting cells from a list to a string


In [None]:
# Create a list from the "title" column using to_list()
# These titles will be used in the query for The Movie Database


### Access The Movie Database API

In [None]:
# Prepare The Movie Database query
url = "https://api.themoviedb.org/3/search/movie?query="
tmdb_key_string = "&api_key=" + tmdb_api_key

In [None]:
# Create an empty list to store the results


# Create a request counter to sleep the requests after a multiple
# of 50 requests


# Loop through the titles

    # Check if we need to sleep before making a request


    # Add 1 to the request counter

    
    # Perform a "GET" request for The Movie Database


    # Include a try clause to search for the full movie details.
    # Use the except clause to print out a statement if a movie
    # is not found.
    try:
        # Get movie id
        movie_ID = response_data['results'][0]['id']

        # Make a request for a the full movie details
        response _data = requests.get(url + titles + tmdb_key_string).json()

        # Execute "GET" request with url

        
        # Extract the genre names into a list


        # Extract the spoken_languages' English name into a list


        # Extract the production_countries' name into a list


        # Add the relevant data to a dictionary and
        # append it to the tmdb_movies_list list

        
        # Print out the title that was found
        except:
            print(titles+"not found")


In [None]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data


In [None]:
# Convert the results to a DataFrame


### Merge and Clean the Data for Export

In [None]:
# Merge the New York Times reviews and TMDB DataFrames on title


In [None]:
# Remove list brackets and quotation marks on the columns containing lists
# Create a list of the columns that need fixing


# Create a list of characters to remove


# Loop through the list of columns to fix

    # Convert the column to type 'str'


    # Loop through characters to remove


# Display the fixed DataFrame


In [None]:
# Drop "byline.person" column


In [None]:
# Delete duplicate rows and reset index


In [None]:
# Export data to CSV without the index
