### Import Required Libraries and Set Up Environment Variables

In [6]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json

In [7]:
# Set environment variables from the .env in the local environment
load_dotenv()

nyt_api_key = os.getenv("NYT_API_KEY")
tmdb_api_key = os.getenv("TMDB_API_KEY")

### Access the New York Times API

In [9]:
# Set the base URL
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# Filter for movie reviews with "love" in the headline
# section_name should be "Movies"
# type_of_material should be "Review"
filter_query = 'section_name:"Movies" AND type_of_material:"Review" AND headline:"love"'

# Use a sort filter, sort by newest
sort = "newest"

# Select the following fields to return:
# headline, web_url, snippet, source, keywords, pub_date, byline, word_count
field_list = "headline,web_url,snippet,source,keywords,pub_date,byline,word_count"

# Search for reviews published between a begin and end date
begin_date = "20130101"
end_date = "20230531"

# Build URL
query_url = f"{url}api-key={nyt_api_key}&q={filter_query}&fq={filter_query}&sort={sort}&begin_date={begin_date}&end_date={end_date}&fl={field_list}"


In [10]:
# Create an empty list to store the reviews
reviews_list = []

# loop through pages 0-19
for page in range(20):
    # create query with a page number
    # API results show 10 articles at a time
    query_url = f"{url}api-key={nyt_api_key}&q={filter_query}&fq={filter_query}&sort={sort}&begin_date={begin_date}&end_date={end_date}&fl={field_list}&page={page}"

    # Make a "GET" request and retrieve the JSON
    response = requests.get(query_url).json()

    # Add a twelve second interval between queries to stay within API query limits
    time.sleep(12)
    # Check if there are results
    print(f"Processing page {page} of results")

    # If no results are found, print the page number and break from the loop
    if not response["response"]["docs"]:
        print(f"No results found for page {page}")
        break

    # Try and save the reviews to the reviews_list
    try:
        # loop through the reviews["response"]["docs"] and append each review to the list
        for review in response["response"]["docs"]:
            reviews_list.append(review)
    except KeyError:
        # Print the page number that had no results then break from the loop
        print(f"No results found for page {page}")
        break
for review in response_json["response"]["docs"]:
            reviews_list.append(review)

# Print the page number that had no results then break from the loop
print(f"No results found for page {page}")


    

Processing page 0 of results
No results found for page 0


NameError: name 'response_json' is not defined

In [11]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data
from pandas import json_normalize

preview = json.dumps(reviews_list[:5], indent=4)
print(preview)


[]


In [12]:
# Convert reviews_list to a Pandas DataFrame using json_normalize()
df = json_normalize(reviews_list)

In [13]:
# Extract the title from the "headline.main" column and
# save it to a new column "title"
# Title is between unicode characters \u2018 and \u2019. 
# End string should include " Review" to avoid cutting title early

df['title'] = df['headline.main'].apply(lambda st: st[st.find("\u2018")+1:st.find("\u2019 Review")])



KeyError: 'headline.main'

In [14]:
# Extract 'name' and 'value' from items in "keywords" column

        # Extract 'name' and 'value'
def extract_keywords(keywords_list):
    extracted_keywords = []
    for item in keywords_list:
        name = item.get('name', '')
        value = item.get('value', '')
        keyword_item = f"{name}: {value}"
        extracted_keywords.append(keyword_item)
    return extracted_keywords   
        # Append the keyword item to the extracted_keywords list
df['extracted_keywords'] = df['keywords'].apply(extract_keywords)



# Fix the "keywords" column by converting cells from a list to a string
df['keywords'] = df['extracted_keywords'].apply(', '.join)
df.drop(columns=['extracted_keywords'], inplace=True)

KeyError: 'keywords'

In [15]:
# Create a list from the "title" column using to_list()
# These titles will be used in the query for The Movie Database

titles = df['title'].to_list()

KeyError: 'title'

### Access The Movie Database API

In [16]:
# Prepare The Movie Database query
url = "https://api.themoviedb.org/3/search/movie?query="
tmdb_key_string = "&api_key=" + tmdb_api_key

movie_title = "The Matrix"
query_url = f"https://api.themoviedb.org/3/search/movie?query={movie_title}{tmdb_key_string}"


In [19]:
# Create an empty list to store the results
tmdb_movies_list = []

# Create a request counter to sleep the requests after a multiple
# of 50 requests
request_counter = 1

# Loop through the titles
for title in titles:
    # Check if we need to sleep before making a request
    if request_counter % 50 == 0:
        print("Application is sleeping...")
        time.sleep(1)
    # Add 1 to the request counter
    request_counter += 1
    
    # Perform a "GET" request for The Movie Database
    query_url = f"https://api.themoviedb.org/3/search/movie?query={title}&api_key={tmdb_api_key}"
    response = requests.get(query_url)
    movie_data = response.json()
    

    # Include a try clause to search for the full movie details.

    # Use the except clause to print out a statement if a movie
    # is not found.
    try:
   
        # Get movie id
        movie_id = movie_data["results"][0]["id"]

        # Make a request for a the full movie details
        movie_details_url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={tmdb_api_key}"
        response_details = requests.get(movie_details_url)
        movie_details = response_details.json()

        # Execute "GET" request with url
        title = movie_details['title']
        
        # Extract the genre names into a list
        genres = [genre['name'] for genre in movie_details['genres']]

        # Extract the spoken_languages' English name into a list
        spoken_languages = [lang['english_name'] for lang in movie_details['spoken_languages']]
    

        # Extract the production_countries' name into a list
        production_countries = [country['name'] for country in movie_details['production_countries']]

        # Add the relevant data to a dictionary and
        # append it to the tmdb_movies_list list
        movie_info = {
            'title': title,
            'genres': genres,
            'spoken_languages': spoken_languages,
            'production_countries': production_countries
        }
        
        tmdb_movies_list.append(movie_info)
        
        # Print out the title that was found
        print(f"Title '{title}' found.")
    except (IndexError, KeyError):
        print(f"Title '{title}' not found.")

NameError: name 'titles' is not defined

In [20]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data
from pandas import json_normalize

preview = json.dumps(reviews_list[:5], indent=4)
print(preview)

[]


In [21]:
# Convert the results to a DataFrame
df = json_normalize(reviews_list)

### Merge and Clean the Data for Export

In [24]:
# Merge the New York Times reviews and TMDB DataFrames on title
merged_df = nyt_reviews_df.merge(tmdb_df, on='title', how='inner')



NameError: name 'nyt_reviews_df' is not defined

In [23]:
# Remove list brackets and quotation marks on the columns containing lists
# Create a list of the columns that need fixing
columns_to_fix = ["genres", "production_countries", "spoken_languages"]



# Create a list of characters to remove
characters_to_remove = ["[", "]", "'"]


# Loop through the list of columns to fix
for column in columns_to_fix:
    # Loop through the list of characters to remove
    for character in characters_to_remove:
        # Replace the character with an empty string
        merged_df[column] = merged_df[column].str.replace(character, "")


    # Convert the column to type 'str'
    merged_df[column] = merged_df[column].astype('str')
    

    # Loop through characters to remove
    for character in characters_to_remove:
        # Replace the character with an empty string
        merged_df[column] = merged_df[column].str.replace(character, "")
        

# Display the fixed DataFrame
merged_df.head()



NameError: name 'merged_df' is not defined

In [25]:
# Drop "byline.person" column
merged_df.drop(columns=['byline.person'], inplace=True)



NameError: name 'merged_df' is not defined

In [26]:
# Delete duplicate rows and reset index
merged_df.drop_duplicates(subset=['title'], inplace=True)
merged_df.reset_index(drop=True, inplace=True)



NameError: name 'merged_df' is not defined

In [19]:
# Export data to CSV without the index
merged_df.to_csv('output/nyt_movie_reviews.csv', index=False)

