### Import Required Libraries and Set Up Environment Variables

In [150]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json
import re

In [151]:
# Set environment variables from the .env in the local environment
load_dotenv()

nyt_api_key = os.getenv("NYT_API_KEY")
tmdb_api_key = os.getenv("TMDB_API_KEY")

### Access the New York Times API

In [152]:
# Set the base URL
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# Filter for movie reviews with "love" in the headline
# section_name should be "Movies"
# type_of_material should be "Review"
filter_query = 'section_name:"Movies" AND type_of_material:"Review" AND headline:"love"'

# Use a sort filter, sort by newest
sort = "newest"

# Select the following fields to return:
# headline, web_url, snippet, source, keywords, pub_date, byline, word_count
field_list = "headline,web_url,snippet,source,keywords,pub_date,byline,word_count"

# Search for reviews published between a begin and end date
begin_date = "20130101"
end_date = "20230531"

# Build the NYT URL
# query_url = f"{url}api-key={nyt_api_key}&fq={filter_query}&sort={sort}&fl={field_list}&begin_date={begin_date}&end_date={end_date}"
base_url = f"{url}api-key={nyt_api_key}&fq={filter_query}&fl={field_list}"


In [153]:
# Create an empty list to store the reviews
reviews_list = []

# loop through pages 0-19
for page in range(0, 5):

    # create query with a page number
    # API results show 10 articles at a time
    query_url = f"{base_url}&page={str(page)}"    
    
    # Make a "GET" request and retrieve the JSON
    reviews = requests.get(query_url).json()
    print(reviews)

    # Add a twelve second interval between queries to stay within API query limits
    time.sleep(6)

    # Try and save the reviews to the reviews_list
    try:
        # loop through the reviews["response"]["docs"] and append each review to the list
        for review in reviews["response"]["docs"]:
            reviews_list.append(review)

        # Print the page that was just retrieved
        print(f'Checked page {page}')

    except:
        # Print the page number that had no results then break from the loop
        print(f"No results for page number {page}")
        break

print(f"Found {len(reviews_list)} matching reviews")

{'status': 'OK', 'copyright': 'Copyright (c) 2024 The New York Times Company. All Rights Reserved.', 'response': {'docs': [{'web_url': 'https://www.nytimes.com/2023/06/20/movies/sublime-review.html', 'snippet': 'A teenager dreams of pop songs, and his best friend, in Mariano Biasin’s tender gay coming-of-age drama.', 'source': 'The New York Times', 'headline': {'main': '‘Sublime’ Review: Two Boys, One in Love', 'kicker': None, 'content_kicker': None, 'print_headline': 'Sublime', 'name': None, 'seo': None, 'sub': None}, 'keywords': [{'name': 'subject', 'value': 'Movies', 'rank': 1, 'major': 'N'}, {'name': 'creative_works', 'value': 'Sublime (Movie)', 'rank': 2, 'major': 'N'}, {'name': 'persons', 'value': 'Biasin, Mariano', 'rank': 3, 'major': 'N'}], 'pub_date': '2023-06-20T17:48:07+0000', 'byline': {'original': 'By Erik Piepenburg', 'person': [{'firstname': 'Erik', 'middlename': None, 'lastname': 'Piepenburg', 'qualifier': None, 'title': None, 'role': 'reported', 'organization': '', 'ra

In [154]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data
print(json.dumps(reviews_list, indent=4))

[
    {
        "web_url": "https://www.nytimes.com/2023/06/20/movies/sublime-review.html",
        "snippet": "A teenager dreams of pop songs, and his best friend, in Mariano Biasin\u2019s tender gay coming-of-age drama.",
        "source": "The New York Times",
        "headline": {
            "main": "\u2018Sublime\u2019 Review: Two Boys, One in Love",
            "kicker": null,
            "content_kicker": null,
            "print_headline": "Sublime",
            "name": null,
            "seo": null,
            "sub": null
        },
        "keywords": [
            {
                "name": "subject",
                "value": "Movies",
                "rank": 1,
                "major": "N"
            },
            {
                "name": "creative_works",
                "value": "Sublime (Movie)",
                "rank": 2,
                "major": "N"
            },
            {
                "name": "persons",
                "value": "Biasin, Mariano",
         

In [155]:
# Convert reviews_list to a Pandas DataFrame using json_normalize()
nyt_reviews_df = pd.json_normalize(reviews_list)
nyt_reviews_df.head()


Unnamed: 0,web_url,snippet,source,keywords,pub_date,word_count,headline.main,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization
0,https://www.nytimes.com/2023/06/20/movies/subl...,"A teenager dreams of pop songs, and his best f...",The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-06-20T17:48:07+0000,266,"‘Sublime’ Review: Two Boys, One in Love",,,Sublime,,,,By Erik Piepenburg,"[{'firstname': 'Erik', 'middlename': None, 'la...",
1,https://www.nytimes.com/2018/01/18/movies/kang...,The documentary looks at the mass killings of ...,The New York Times,"[{'name': 'creative_works', 'value': 'Kangaroo...",2018-01-18T12:00:23+0000,263,Review: ‘Kangaroo: A Love-Hate Story’ Exposes ...,,,Kangaroo: A Love-Hate Story,,,,By Ken Jaworowski,"[{'firstname': 'Ken', 'middlename': None, 'las...",
2,https://www.nytimes.com/1991/10/11/movies/shor...,"Short-order cookery, dreams of love. Warm and ...",The New York Times,"[{'name': 'subject', 'value': 'MOTION PICTURES...",1991-10-11T05:00:00+0000,1117,Short-Order Cookery And Dreams of Love,,,Short-Order Cookery And Dreams of Love,,,,By Janet Maslin,"[{'firstname': 'Janet', 'middlename': None, 'l...",
3,https://www.nytimes.com/1989/07/12/movies/revi...,Rob Reiner’s take on romantically bruised New ...,The New York Times,"[{'name': 'subject', 'value': 'Reviews', 'rank...",1989-07-12T05:00:00+0000,942,Review/Film; It's Harry [ Loves ] Sally in a...,,,Review/Film; It's Harry [ Loves ] Sally in a...,,,,By Caryn James,"[{'firstname': 'Caryn', 'middlename': None, 'l...",
4,https://www.nytimes.com/2013/09/20/movies/jewt...,"In “Jewtopia,” a young man asks a childhood fr...",The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2013-09-19T23:33:15+0000,272,Love’s Eternal Masquerade,Movie Review,,Jewtopia,,,,By David DeWitt,"[{'firstname': 'David', 'middlename': None, 'l...",


In [156]:
nyt_reviews_df.value_counts("headline.main")

headline.main
50 Years and 600 Women Later, True Love                                               1
Teenager Dies, Leaving Love, and a Life, Behind                                       1
Review: Two ‘Passengers’ Trapped on a Spaceship Find Love Amid Despair                1
Review: ‘A United Kingdom’ With Love That Tested Racial Tolerance                     1
Review: ‘Kangaroo: A Love-Hate Story’ Exposes a Wildlife Massacre                     1
Review: ‘Prem Ratan Dhan Payo,’ a Bollywood Tale of a Prince-and-Plebe Double         1
Review: ‘Sophie and the Rising Sun’ Has Forbidden Love in a Foreboding Time           1
Review: ‘They’ll Love Me When I’m Dead’ Documents Orson Welles’s Last Film            1
SCREEN: LOVE IN BELFAST                                                               1
Serenading Love and Life, 30 Years After                                              1
Sexism and Rothbart As Obstacles to Love                                              1
Short-Order Cooker

In [163]:
# Extract the title from the "headline.main" column and
# save it to a new column "title"
# Title is between unicode characters \u2018 and \u2019. 
# End string should include " Review" to avoid cutting title early


# Define the refined regex pattern
pattern = r'\u2018([^"]*)\u2019\s*Review'

# Apply the regex to each cell to extract the movie title
nyt_reviews_df['title'] = nyt_reviews_df['headline.main'].apply(lambda x: re.search(pattern, x).group(0) if re.search(pattern, x) else x)
nyt_reviews_df.head(30)


# pattern = r'\u2018([^"]*)\u2019\s*Review'

# Apply the adjusted regex to each row to extract the title
#nyt_reviews_df['title'] = nyt_reviews_df['headline.main'].apply(lambda x: re.search(pattern, x).group(1) if re.search(pattern, x) else None)

# Display the updated DataFrame
#print(nyt_reviews_df['title'])



# nyt_reviews_df['title'] = nyt_reviews_df['headline.main'].str.extract(r'(?:Review:\s*)?\u2018(.*?)\u2019(?=\s*Review)')
# print(nyt_reviews_df['title'])

Unnamed: 0,web_url,snippet,source,keywords,pub_date,word_count,headline.main,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization,title
0,https://www.nytimes.com/2023/06/20/movies/subl...,"A teenager dreams of pop songs, and his best f...",The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-06-20T17:48:07+0000,266,"‘Sublime’ Review: Two Boys, One in Love",,,Sublime,,,,By Erik Piepenburg,"[{'firstname': 'Erik', 'middlename': None, 'la...",,‘Sublime’ Review
1,https://www.nytimes.com/2018/01/18/movies/kang...,The documentary looks at the mass killings of ...,The New York Times,"[{'name': 'creative_works', 'value': 'Kangaroo...",2018-01-18T12:00:23+0000,263,Review: ‘Kangaroo: A Love-Hate Story’ Exposes ...,,,Kangaroo: A Love-Hate Story,,,,By Ken Jaworowski,"[{'firstname': 'Ken', 'middlename': None, 'las...",,Review: ‘Kangaroo: A Love-Hate Story’ Exposes ...
2,https://www.nytimes.com/1991/10/11/movies/shor...,"Short-order cookery, dreams of love. Warm and ...",The New York Times,"[{'name': 'subject', 'value': 'MOTION PICTURES...",1991-10-11T05:00:00+0000,1117,Short-Order Cookery And Dreams of Love,,,Short-Order Cookery And Dreams of Love,,,,By Janet Maslin,"[{'firstname': 'Janet', 'middlename': None, 'l...",,Short-Order Cookery And Dreams of Love
3,https://www.nytimes.com/1989/07/12/movies/revi...,Rob Reiner’s take on romantically bruised New ...,The New York Times,"[{'name': 'subject', 'value': 'Reviews', 'rank...",1989-07-12T05:00:00+0000,942,Review/Film; It's Harry [ Loves ] Sally in a...,,,Review/Film; It's Harry [ Loves ] Sally in a...,,,,By Caryn James,"[{'firstname': 'Caryn', 'middlename': None, 'l...",,Review/Film; It's Harry [ Loves ] Sally in a...
4,https://www.nytimes.com/2013/09/20/movies/jewt...,"In “Jewtopia,” a young man asks a childhood fr...",The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2013-09-19T23:33:15+0000,272,Love’s Eternal Masquerade,Movie Review,,Jewtopia,,,,By David DeWitt,"[{'firstname': 'David', 'middlename': None, 'l...",,Love’s Eternal Masquerade
5,https://www.nytimes.com/2019/05/21/movies/the-...,Blythe Danner and John Lithgow strain to eleva...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2019-05-21T11:00:01+0000,280,‘The Tomorrow Man’ Review: Love Among the Neur...,,,"They Whirl, They Twirl, They Tango",,,,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",,‘The Tomorrow Man’ Review
6,https://www.nytimes.com/1996/01/12/movies/film...,Housewife pretends to fall for lesbian. Risque...,The New York Times,"[{'name': 'subject', 'value': 'MOTION PICTURES...",1996-01-12T05:00:00+0000,943,FILM REVIEW;Equal Opportunity in Games of Love,FILM REVIEW,,FILM REVIEW;Equal Opportunity in Games of Love,,,,By Janet Maslin,"[{'firstname': 'Janet', 'middlename': None, 'l...",,FILM REVIEW;Equal Opportunity in Games of Love
7,https://www.nytimes.com/2019/11/05/movies/marr...,Adam Driver and Scarlett Johansson self-consci...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2019-11-05T19:01:02+0000,1244,‘Marriage Story’ Review: Dance Me to the End o...,critic’s pick,,"Friendly Split, Shattering Break",,,,By A.O. Scott,"[{'firstname': 'A.', 'middlename': 'O.', 'last...",,‘Marriage Story’ Review
8,https://www.nytimes.com/2022/11/17/movies/bone...,Luca Guadagnino’s latest stars Timothée Chalam...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2022-11-17T12:00:06+0000,710,‘Bones and All’ Review: You Eat What You Are,,,"It’s Eat, Prey, Love on a Journey of Self-Disc...",,,,By A.O. Scott,"[{'firstname': 'A.', 'middlename': 'O.', 'last...",,‘Bones and All’ Review
9,https://www.nytimes.com/2019/08/08/movies/love...,"Anton Yelchin, who played Chekov in the J.J. A...",The New York Times,"[{'name': 'subject', 'value': 'Documentary Fil...",2019-08-08T09:00:10+0000,319,"‘Love, Antosha’ Review: A Heartbreaking Look a...",Critic’s Pick,,"Love, Antosha",,,,By Glenn Kenny,"[{'firstname': 'Glenn', 'middlename': None, 'l...",,"‘Love, Antosha’ Review"


In [None]:
# Extract 'name' and 'value' from items in "keywords" column
def extract_keywords(keyword_list):
    extracted_keywords = ""
    for item in keyword_list:
        # Extract 'name' and 'value'
        keyword = f"{item['name']}: {item['value']};" 
        # Append the keyword item to the extracted_keywords list
        extracted_keywords += keyword
    return extracted_keywords

# Fix the "keywords" column by converting cells from a list to a string
nyt_reviews_df['keywords'] = nyt_reviews_df['keywords'].apply(extract_keywords)


In [None]:
nyt_reviews_df.head()

In [None]:
# Create a list from the "title" column using to_list()
# These titles will be used in the query for The Movie Database
title_list = nyt_reviews_df['title'].to_list()
title_list

### Access The Movie Database API

In [None]:
# Prepare The Movie Database query
tmdb_url = "https://api.themoviedb.org/3/search/movie?query="
tmdb_key_string = f"&api_key={tmdb_api_key}"

In [None]:
# Create an empty list to store the results
tmdb_list = []

# Create a request counter to sleep the requests after a multiple
# of 50 requests
request_counter = 0

# Loop through the titles
for title in title_list:

    # Check if we need to sleep before making a request
    if request_counter % 50 == 0 and request_counter != 0:
        time.sleep(6)

    # Add 1 to the request counter
    request_counter += 1
    
    # Perform a "GET" request for The Movie Database
    
    tmdb_query_url = f"{tmdb_url}{title}{tmdb_key_string}"
    tmdb_titles = requests.get(tmdb_query_url).json()
    # print(json.dumps(tmdb_titles, indent=4))

    # Include a try clause to search for the full movie details.
    # Use the except clause to print out a statement if a movie
    # is not found.

    try:
        for movie in tmdb_titles['results']:
            title = movie['title']

        print(f"Found movie with title: {title}")

    except:
        print(f"Movie with title: {title} not found")
        break

        # Get movie id
        # Extract the movie id from the first result

        

        # Make a request for a the full movie details


        # Execute "GET" request with url

        
        # Extract the genre names into a list


        # Extract the spoken_languages' English name into a list


        # Extract the production_countries' name into a list


        # Add the relevant data to a dictionary and
        # append it to the tmdb_movies_list list

        
        # Print out the title that was found



In [None]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data


In [None]:
# Convert the results to a DataFrame


### Merge and Clean the Data for Export

In [None]:
# Merge the New York Times reviews and TMDB DataFrames on title


In [None]:
# Remove list brackets and quotation marks on the columns containing lists
# Create a list of the columns that need fixing


# Create a list of characters to remove


# Loop through the list of columns to fix

    # Convert the column to type 'str'


    # Loop through characters to remove


# Display the fixed DataFrame


In [None]:
# Drop "byline.person" column


In [None]:
# Delete duplicate rows and reset index


In [None]:
# Export data to CSV without the index
