### Import Required Libraries and Set Up Environment Variables

In [23]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json

In [25]:
# Set environment variables from the .env in the local environment
load_dotenv('.env')

nyt_api_key = os.getenv('NYT')
tmdb_api_key = os.getenv('TMDB')

# I really struggled to get the API's to work. 
#I used AskBCS where they were finally able to help me get this figured out. 
# print(nyt_api_key, tmdb_api_key)

### Access the New York Times API

In [28]:
# Set the base URL
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# Filter for movie reviews with "love" in the headline
# section_name should be "Movies"
# type_of_material should be "Review"
filter_query = 'section_name:"Movies" AND type_of_material:"Review" AND headline:"love"'

# Use a sort filter, sort by newest
sort = "newest"

# Select the following fields to return:
# headline, web_url, snippet, source, keywords, pub_date, byline, word_count
field_list = "headline,web_url,snippet,source,keywords,pub_date,byline,word_count"

# Search for reviews published between a begin and end date
begin_date = "20130101"
end_date = "20230531"

# Build URL
query_url = (
    f"{url}api-key={nyt_api_key}&begin_date={begin_date}&end_date={end_date}"
    + f'&fq={filter_query}&sort={sort}&fl={field_list}'
)

In [29]:
# Create an empty list to store the reviews
reviews_list = []

# loop through pages 0-19
for page_number in range(20):   

    # create query with a page number
    query_url_with_page = f'{query_url}&page={page_number}'
       
    # API results show 10 articles at a time
    print(f"Checked page: {page_number}")
    
    # Make a "GET" request and retrieve the JSON
    reviews = requests.get(query_url_with_page).json()
            
    # Add a twelve second interval between queries to stay within API query limits
    time.sleep(12)
   
    try:
        # If there are results, loop through and save each review
        for review in reviews["response"]["docs"]:
            reviews_list.append(reviews)
    
        # Print the reviews added from the current page
        print(f"Page {page_number} reviews added to list.")
    except: 
        print(f"No results. Ended at page {page}.")
        break 
# I used Xpert and ChatGPT and looking at homework to generate this code

Checked page: 0
Page 0 reviews added to list.
Checked page: 1
Page 1 reviews added to list.
Checked page: 2
Page 2 reviews added to list.
Checked page: 3
Page 3 reviews added to list.
Checked page: 4
Page 4 reviews added to list.
Checked page: 5
Page 5 reviews added to list.
Checked page: 6
Page 6 reviews added to list.
Checked page: 7
Page 7 reviews added to list.
Checked page: 8
Page 8 reviews added to list.
Checked page: 9
Page 9 reviews added to list.
Checked page: 10
Page 10 reviews added to list.
Checked page: 11
Page 11 reviews added to list.
Checked page: 12
Page 12 reviews added to list.
Checked page: 13
Page 13 reviews added to list.
Checked page: 14
Page 14 reviews added to list.
Checked page: 15
Page 15 reviews added to list.
Checked page: 16
Page 16 reviews added to list.
Checked page: 17
Page 17 reviews added to list.
Checked page: 18
Page 18 reviews added to list.
Checked page: 19
Page 19 reviews added to list.


In [127]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data

reviews_df=json.loads(json.dumps(reviews_list[:5], indent=4))[0]["response"]["docs"]
reviews_df

[{'web_url': 'https://www.nytimes.com/2023/05/25/movies/the-attachment-diaries-review.html',
  'snippet': 'A gynecologist and her patient form a horrifyingly twisted connection in this batty, bloody Argentine melodrama.',
  'source': 'The New York Times',
  'headline': {'main': '‘The Attachment Diaries’ Review: Love, Sick',
   'kicker': None,
   'content_kicker': None,
   'print_headline': 'The Attachment Diaries',
   'name': None,
   'seo': None,
   'sub': None},
  'keywords': [{'name': 'subject', 'value': 'Movies', 'rank': 1, 'major': 'N'},
   {'name': 'creative_works',
    'value': 'The Attachment Diaries (Movie)',
    'rank': 2,
    'major': 'N'},
   {'name': 'persons',
    'value': 'Diment, Valentin Javier',
    'rank': 3,
    'major': 'N'}],
  'pub_date': '2023-05-25T11:00:03+0000',
  'byline': {'original': 'By Jeannette Catsoulis',
   'person': [{'firstname': 'Jeannette',
     'middlename': None,
     'lastname': 'Catsoulis',
     'qualifier': None,
     'title': None,
     'rol

In [129]:
# Convert reviews_list to a Pandas DataFrame using json_normalize()
reviews_df = pd.json_normalize(reviews_df)

# Display the DataFrame
reviews_df

Unnamed: 0,web_url,snippet,source,keywords,pub_date,word_count,headline.main,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization
0,https://www.nytimes.com/2023/05/25/movies/the-...,A gynecologist and her patient form a horrifyi...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-05-25T11:00:03+0000,295,"‘The Attachment Diaries’ Review: Love, Sick",,,The Attachment Diaries,,,,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",
1,https://www.nytimes.com/2023/05/04/movies/what...,Two childhood friends navigate cultural differ...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-05-04T17:16:45+0000,287,Review: ‘What’s Love Got to Do With It?’ Proba...,,,What’s Love Got to Do With It?,,,,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",
2,https://www.nytimes.com/2023/05/04/movies/you-...,Religion comes between two girls falling in lo...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-05-04T11:00:08+0000,294,‘You Can Live Forever’ Review: Do You Love Me ...,,,You Can Live Forever,,,,By Elisabeth Vincentelli,"[{'firstname': 'Elisabeth', 'middlename': None...",
3,https://www.nytimes.com/2023/04/21/movies/a-to...,Rachael Leigh Cook stars in this bland rom-com...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-04-21T07:03:25+0000,276,‘A Tourist’s Guide to Love’ Review: A Wearying...,,,A Tourist’s Guide to Love,,,,By Elisabeth Vincentelli,"[{'firstname': 'Elisabeth', 'middlename': None...",
4,https://www.nytimes.com/2023/04/20/movies/othe...,A radiant Virginie Efira stars as a Parisian t...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-04-20T15:35:13+0000,801,‘Other People’s Children’ Review: True Romance,Critic’s pick,,Intoxicating Love With a Sobering Turn,,,,By Manohla Dargis,"[{'firstname': 'Manohla', 'middlename': None, ...",
5,https://www.nytimes.com/2023/04/13/movies/one-...,A film adaptation of Taylor Jenkins Reid’s nov...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-04-13T11:00:06+0000,320,‘One True Loves’ Review: A Romance Lost at Sea,,,One True Loves,,,,By Brandon Yu,"[{'firstname': 'Brandon', 'middlename': None, ...",
6,https://www.nytimes.com/2023/04/13/movies/the-...,There’s not much Lennon music heard in this do...,The New York Times,"[{'name': 'subject', 'value': 'Documentary Fil...",2023-04-13T11:00:03+0000,327,‘The Lost Weekend: A Love Story’ Review: When ...,,,The Lost Weekend: A Love Story,,,,By Glenn Kenny,"[{'firstname': 'Glenn', 'middlename': None, 'l...",
7,https://www.nytimes.com/2023/03/30/movies/a-th...,A mesmerizing Teyana Taylor stars in A.V. Rock...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-03-30T18:53:42+0000,971,‘A Thousand and One’ Review: A New York Love S...,Critic’s Pick,,An Unbending Will Meets a Shifting City,,,,By Manohla Dargis,"[{'firstname': 'Manohla', 'middlename': None, ...",
8,https://www.nytimes.com/2023/02/09/movies/your...,This humdrum Netflix romantic comedy features ...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-02-10T00:00:05+0000,569,‘Your Place or Mine’ Review: Try Neither,,,They Have a Humdrum Kind of Love,,,,By Amy Nicholson,"[{'firstname': 'Amy', 'middlename': None, 'las...",
9,https://www.nytimes.com/2023/02/02/movies/love...,"To combat the overdose crisis, a group that in...",The New York Times,"[{'name': 'subject', 'value': 'Documentary Fil...",2023-02-02T12:00:11+0000,306,‘Love in the Time of Fentanyl’ Review: Heartbr...,,,Love in the Time Of Fentanyl,,,,By Concepción de León,"[{'firstname': 'Concepción', 'middlename': Non...",


In [131]:
# Trying to see how many rows and columns the data has
rows, columns = reviews_df.shape
print(f"The DataFrame has {rows} rows and {columns} columns.")



The DataFrame has 10 rows and 16 columns.


In [137]:
# Convert reviews_list to a Pandas DataFrame using json_normalize()

from pandas import json_normalize

# Convert reviews_list to a Pandas DataFrame
reviews_df= json_normalize(reviews_list)

# Display the DataFrame
reviews_df

Unnamed: 0,status,copyright,response.docs,response.meta.hits,response.meta.offset,response.meta.time
0,OK,Copyright (c) 2024 The New York Times Company....,[{'web_url': 'https://www.nytimes.com/2023/05/...,344,0,22
1,OK,Copyright (c) 2024 The New York Times Company....,[{'web_url': 'https://www.nytimes.com/2023/05/...,344,0,22
2,OK,Copyright (c) 2024 The New York Times Company....,[{'web_url': 'https://www.nytimes.com/2023/05/...,344,0,22
3,OK,Copyright (c) 2024 The New York Times Company....,[{'web_url': 'https://www.nytimes.com/2023/05/...,344,0,22
4,OK,Copyright (c) 2024 The New York Times Company....,[{'web_url': 'https://www.nytimes.com/2023/05/...,344,0,22
...,...,...,...,...,...,...
195,OK,Copyright (c) 2024 The New York Times Company....,[{'web_url': 'https://www.nytimes.com/2017/05/...,344,190,18
196,OK,Copyright (c) 2024 The New York Times Company....,[{'web_url': 'https://www.nytimes.com/2017/05/...,344,190,18
197,OK,Copyright (c) 2024 The New York Times Company....,[{'web_url': 'https://www.nytimes.com/2017/05/...,344,190,18
198,OK,Copyright (c) 2024 The New York Times Company....,[{'web_url': 'https://www.nytimes.com/2017/05/...,344,190,18


In [48]:
# Convert reviews_list to a Pandas DataFrame using json_normalize()

import pandas as pd
from pandas import json_normalize

# Convert results_list to a Pandas DataFrame
reviews_df = json_normalize(reviews_list)

# Display the DataFrame
reviews_df





Unnamed: 0,status,copyright,response.docs,response.meta.hits,response.meta.offset,response.meta.time
0,OK,Copyright (c) 2024 The New York Times Company....,[{'web_url': 'https://www.nytimes.com/2023/05/...,344,0,22
1,OK,Copyright (c) 2024 The New York Times Company....,[{'web_url': 'https://www.nytimes.com/2023/05/...,344,0,22
2,OK,Copyright (c) 2024 The New York Times Company....,[{'web_url': 'https://www.nytimes.com/2023/05/...,344,0,22
3,OK,Copyright (c) 2024 The New York Times Company....,[{'web_url': 'https://www.nytimes.com/2023/05/...,344,0,22
4,OK,Copyright (c) 2024 The New York Times Company....,[{'web_url': 'https://www.nytimes.com/2023/05/...,344,0,22
...,...,...,...,...,...,...
195,OK,Copyright (c) 2024 The New York Times Company....,[{'web_url': 'https://www.nytimes.com/2017/05/...,344,190,18
196,OK,Copyright (c) 2024 The New York Times Company....,[{'web_url': 'https://www.nytimes.com/2017/05/...,344,190,18
197,OK,Copyright (c) 2024 The New York Times Company....,[{'web_url': 'https://www.nytimes.com/2017/05/...,344,190,18
198,OK,Copyright (c) 2024 The New York Times Company....,[{'web_url': 'https://www.nytimes.com/2017/05/...,344,190,18


In [98]:
# Extract the title from the "headline.main" column and
# save it to a new column "title"
# Title is between unicode characters \u2018 and \u2019. 
# End string should include " Review" to avoid cutting title early
reviews_df["title"] = reviews_df["headline.main"].apply(lambda st: st[st.find("\u2018")+1:st.find("\u2019 Review")])
reviews_df


KeyError: 'headline.main'

In [100]:
# Extract 'name' and 'value' from items in "keywords" column
def extract_keywords(keyword_list):
    extracted_keywords = ""
    for item in keyword_list:
        # Extract 'name' and 'value'
        keyword = f"{item['name']}: {item['value']};" 
        # Append the keyword item to the extracted_keywords list
        extracted_keywords += keyword
    return extracted_keywords

# Fix the "keywords" column by converting cells from a list to a string
reviews_df['keywords'] = reviews_df['keywords'].apply(extract_keywords)

reviews_df

KeyError: 'keywords'

In [105]:
# Create a list from the "title" column using to_list()
# These titles will be used in the query for The Movie Database
title_list = reviews_df['title'].to_list()
title_list

KeyError: 'title'

### Access The Movie Database API

In [107]:
# Prepare The Movie Database query
nyt_api_key = os.getenv('NYT')
tmdb_api_key = os.getenv('TMDB')

url = "https://api.themoviedb.org/3/search/movie?query="
tmdb_key_string = "&api_key=" + tmdb_api_key


In [79]:
    # Perform a "GET" request for The Movie Database
reviews = requests.get(author_query_url + author)

    # Include a try clause to search for the full movie details.
if not reviews["response"]["docs"]:
        print(f"No results on page {page_number}")
        break  # Break from the loop if no results
    try:
        # If there are results, loop through and save each review
        for review in reviews["response"]["docs"]:
            reviews_list.append(reviews)
  
       # Use the except clause to print out a statement if a movie
    # is not found.
  except: 
      print(f"{movie_ID} reviews added to list.")

# Create an empty list to store the results
tmdb_movies_list = []

# Create a request counter to sleep the requests after a multiple
# of 50 requests
request_counter = 1

# Loop through the titles
for title in title_list:
    # Check if we need to sleep before making a request
    if request_counter % 50 == 0 and request_counter != 0:
        time.sleep(1)  # Sleep for 1 second after every 50 requests

    # Increment the request counter
    request_counter += 1

    # Perform a "GET" request to The Movie Database
    results = requests.get(query_url_with_page).json()
    response = requests.get(f"{url}?api_key={tmdb_key_string}&query={title}")
    response_data = response.json()

    try:
        # Get movie ID
        movie_ID = response_data['results'][0]['id']
        
        # Make a request for the full movie details
        details_url = f"https://api.themoviedb.org/3/movie/{movie_ID}?api_key={tmdb_key_string}"
        full_response = requests.get(details_url)
        full_data = full_response.json()

        # Extract relevant details
        genres = [genre['name'] for genre in full_data.get('genres', [])]
        spoken_languages = [lang['english_name'] for lang in full_data.get('spoken_languages', [])]
        production_countries = [country['name'] for country in full_data.get('production_countries', [])]

        # Store the relevant data in a dictionary
        movie_info = {
            "title": title,
            "original_title": original_title,
            "budget": budget
            "genres": genres,
            "spoken_languages": spoken_languages,
            "production_countries": production_countries
        }

        # Append the dictionary to the results list
        mov_results_list.append(movie_info)

        # Print out the title that was found
        print(f"Found {title}")

    except (IndexError, KeyError):
        # Handle the case where the movie is not found or there is an issue with the response
        print(f"{title} not found or there was an error retrieving the data.")

# The mov_results_list now contains the extracted movie data


IndentationError: expected an indented block after 'for' statement on line 17 (3893703901.py, line 19)

In [None]:
# Create an empty list to store the results
mov_results_list = []

# Create a request counter to sleep the requests after a multiple
# of 50 requests

request_counter = 0

# Loop for making requests
for i in range(100):  # Assuming 100 requests need to be made
    # Make the API request here
    request_counter += 1
    
    if request_counter % 50 == 0:
        time.sleep(1)  # Sleep for 1 second after every 50 requests

# Loop through the titles; title_list
  

    # Check if we need to sleep before making a request


    # Add 1 to the request counter

    
    # Perform a "GET" request for The Movie Database


    # Include a try clause to search for the full movie details.
    # Use the except clause to print out a statement if a movie
    # is not found.
    try:
        # Get movie id
        movie_ID = response_data['results'][0]['id']

        # Make a request for a the full movie details
        response _data = requests.get(url + titles + tmdb_key_string).json()

        # Execute "GET" request with url

        
        # Extract the genre names into a list


        # Extract the spoken_languages' English name into a list


        # Extract the production_countries' name into a list


        # Add the relevant data to a dictionary and
        # append it to the tmdb_movies_list list

        
        # Print out the title that was found
        except:
            print(titles+"not found")


In [None]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data


# for review in results_list[:5]:
 #   print(json.dumps(results, indent=4))  


In [None]:
# Convert the results to a DataFrame


### Merge and Clean the Data for Export

In [None]:
# Merge the New York Times reviews and TMDB DataFrames on title


In [None]:
# Remove list brackets and quotation marks on the columns containing lists
# Create a list of the columns that need fixing


# Create a list of characters to remove


# Loop through the list of columns to fix

    # Convert the column to type 'str'


    # Loop through characters to remove


# Display the fixed DataFrame


In [None]:
# Drop "byline.person" column


In [None]:
# Delete duplicate rows and reset index


In [None]:
# Export data to CSV without the index
