### Import Required Libraries and Set Up Environment Variables

In [30]:
!pip install python-dotenv
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json



In [31]:
# Set environment variables from the .env in the local environment
load_dotenv()

nyt_api_key = os.getenv("NYT_API_KEY")
tmdb_api_key = os.getenv("TMDB_API_KEY")

### Access the New York Times API

In [53]:
# Set the base URL
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# Filter for movie reviews with "love" in the headline
# section_name should be "Movies"
# type_of_material should be "Review"
filter_query = 'section_name:"Movies" AND type_of_material:"Review" AND headline:"love"'

# Use a sort filter, sort by newest
sort = "newest"

# Select the following fields to return:
# headline, web_url, snippet, source, keywords, pub_date, byline, word_count
field_list = "headline,web_url,snippet,source,keywords,pub_date,byline,word_count"

# Search for reviews published between a begin and end date
begin_date = "20130101"
end_date = "20230531"

# Build URL
# query_url = url + "api-key=" + nyt_api_key + "&q=" + filter_query
# query_url = f"{url}api-key={nyt_api_key}&q={filter_query}&begin_date={begin_date}&end_date={end_date}"
query_url = (
    f"{url}api-key={nyt_api_key}&begin_date={begin_date}&end_date={end_date}"
    + f'&fq={filter_query}&sort={sort}&fl={field_list}'
)

In [80]:
# Create an empty list to store the reviews
reviews_list = []

# loop through pages 0-19
# create query with a page number
# API results show 10 articles at a time
for page in range(0, 20):
    
    # extend query with page number
    query_url_page = f"{query_url}&page={str(page)}"
     
    # Make a "GET" request and retrieve the JSON
    reviews = requests.get(query_url_page).json()
    # print(reviews)
    # Add a twelve second interval between queries to stay within API query limits
    time.sleep(12)
    
    # # Try and save the reviews to the reviews_list
    # # loop through the reviews["response"]["docs"] and append each review to the list
    for review in reviews["response"]["docs"]:
        try:
            reviews_list.append(reviews)
            print(f"{review['headline']['print_headline']} found! Appending review. Page {page}")
        
    # # Handle exceptions for a reviews that are not available in the NYT API
        except:
            # Print exception note
            print(f"No reviews found on page {page}.")
            # pass
            break

    
        
        
    #     # Print the page that was just retrieved      
    #     print(review['snippet'])
    #     print('---------------------------')

    #     # Print the page number that had no results then break from the loop


The Attachment Diaries found! Appending review. Page 0
What’s Love Got to Do With It? found! Appending review. Page 0
You Can Live Forever found! Appending review. Page 0
A Tourist’s  Guide to Love found! Appending review. Page 0
Intoxicating Love With a Sobering Turn found! Appending review. Page 0
One True Loves found! Appending review. Page 0
The Lost Weekend:  A Love Story found! Appending review. Page 0
An Unbending Will Meets a Shifting City found! Appending review. Page 0
They Have a Humdrum Kind of Love found! Appending review. Page 0
Love in the Time Of Fentanyl found! Appending review. Page 0
Pamela, a Love Story found! Appending review. Page 1
In From the Side found! Appending review. Page 1
After Love found! Appending review. Page 1
Alcarràs found! Appending review. Page 1
Nelly &amp; Nadine found! Appending review. Page 1
A Love Whose Name Is Often Spoken found! Appending review. Page 1
The Sound  Of Christmas found! Appending review. Page 1
A Few Good Men, Some With Secre

In [81]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data
json_string = json.dumps(reviews_list, indent=4)
sliced_json_string = json_string[:json_string.index("One True Loves")]
print(sliced_json_string)

[
    {
        "status": "OK",
        "copyright": "Copyright (c) 2024 The New York Times Company. All Rights Reserved.",
        "response": {
            "docs": [
                {
                    "web_url": "https://www.nytimes.com/2023/05/25/movies/the-attachment-diaries-review.html",
                    "snippet": "A gynecologist and her patient form a horrifyingly twisted connection in this batty, bloody Argentine melodrama.",
                    "source": "The New York Times",
                    "headline": {
                        "main": "\u2018The Attachment Diaries\u2019 Review: Love, Sick",
                        "kicker": null,
                        "content_kicker": null,
                        "print_headline": "The Attachment Diaries",
                        "name": null,
                        "seo": null,
                        "sub": null
                    },
                    "keywords": [
                        {
                            "na

In [89]:
# Convert reviews_list to a Pandas DataFrame using json_normalize()
results_df = pd.json_normalize(json_string)
results_df

NotImplementedError: 

In [85]:
# Extract the title from the "headline.main" column and
# save it to a new column "title"
# Title is between unicode characters \u2018 and \u2019. 
# End string should include " Review" to avoid cutting title early
results_df['title'] = results_df.headline.main.apply(lambda st: st[st.find("\u2018")+1:st.find("\u2019 Review")])

AttributeError: 'DataFrame' object has no attribute 'headline'

In [8]:
# Extract 'name' and 'value' from items in "keywords" column
def extract_keywords(keyword_list):
    extracted_keywords = ""
    for item in keyword_list:
        # Extract 'name' and 'value'
        keyword = f"{item['name']}: {item['value']};" 
        # Append the keyword item to the extracted_keywords list
        extracted_keywords += keyword
    return extracted_keywords

# Fix the "keywords" column by converting cells from a list to a string


In [9]:
# Create a list from the "title" column using to_list()
# These titles will be used in the query for The Movie Database


### Access The Movie Database API

In [10]:
# Prepare The Movie Database query
url = "https://api.themoviedb.org/3/search/movie?query="
tmdb_key_string = "&api_key=" + tmdb_api_key


# # Documentation: https://developer.themoviedb.org/docs/search-and-query-for-details
# query_url = "https://api.themoviedb.org/3/search/movie?query="

# title = "Everything Everywhere All at Once"

# # Execute "GET" request with url
# response_data = requests.get(query_url + title + "&api_key=" + api_key)

In [11]:
# Create an empty list to store the results


# Create a request counter to sleep the requests after a multiple
# of 50 requests


# Loop through the titles

    # Check if we need to sleep before making a request


    # Add 1 to the request counter

    
    # Perform a "GET" request for The Movie Database


    # Include a try clause to search for the full movie details.
    # Use the except clause to print out a statement if a movie
    # is not found.

        # Get movie id


        # Make a request for a the full movie details


        # Execute "GET" request with url

        
        # Extract the genre names into a list


        # Extract the spoken_languages' English name into a list


        # Extract the production_countries' name into a list


        # Add the relevant data to a dictionary and
        # append it to the tmdb_movies_list list

        
        # Print out the title that was found



In [12]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data


In [13]:
# Convert the results to a DataFrame


### Merge and Clean the Data for Export

In [14]:
# Merge the New York Times reviews and TMDB DataFrames on title


In [15]:
# Remove list brackets and quotation marks on the columns containing lists
# Create a list of the columns that need fixing


# Create a list of characters to remove


# Loop through the list of columns to fix

    # Convert the column to type 'str'


    # Loop through characters to remove


# Display the fixed DataFrame


In [16]:
# Drop "byline.person" column


In [17]:
# Delete duplicate rows and reset index


In [18]:
# Export data to CSV without the index
