### Import Required Libraries and Set Up Environment Variables

In [1]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json
import re
from my_toolkit import key_check

In [11]:
# Set environment variables from the .env in the local environment.
# Raw strings keep every Windows backslash literal instead of relying on
# Python passing unrecognised escapes such as "\s" and "\d" through unchanged
# (newer interpreters warn about those). The resulting values are unchanged.
my_env = r'C:\src\ai\data-sourcing-challenge\data-sourcing_challenge.env'
# NOTE(review): not referenced in this section; presumably a deliberately
# wrong path kept for exercising the failure branch of the key check — confirm.
my_wrong_env = r'C:\src\ai\.envt'
# Validate the API keys before any request is made.
# An earlier inline draft of this check (check_keys) used to sit here inside a
# bare string literal. It loaded the .env with override=True, asserted that
# NYT_API_KEY and TMDB_API_KEY were set, and sent one authenticated request to
# each API. That dead code was dropped in favour of the imported helper.
# NOTE(review): key_check comes from my_toolkit and is not visible here;
# presumably it performs the same steps and returns a truthy value on
# success — confirm against my_toolkit.
if key_check(my_env):
    # Read the keys from the process environment once the check has passed
    nyt_api_key = os.getenv("NYT_API_KEY")
    tmdb_api_key = os.getenv("TMDB_API_KEY")
else:
    # The key variables stay undefined on this path, so the cells below
    # cannot run until the keys are fixed and this cell is executed again.
    print ('fix Keys and rerun')

All keys laoded correctly


### Access the New York Times API

In [None]:
# Reference request copied from the NYT web page:
# https://api.nytimes.com/svc/search/v2/articlesearch.json?q=new+york+times&page=2&sort=oldest&api-key=your-api-key

# Article Search endpoint; the trailing "?" lets parameters be appended directly
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# Restrict results to reviews in the Movies section whose headline has "love"
filter_query = 'section_name:"Movies" AND type_of_material:"Review" AND headline:"love"'

# Newest articles come first
sort = "newest"

# Only these fields are requested for each article
field_list = "headline,web_url,snippet,source,keywords,pub_date,byline,word_count"

# Publication window for the search
begin_date = "20130101"
end_date = "20230531"

# Assemble the shared query; the page number and API key are appended later,
# once per request
query_url = url + "&".join(
    (
        "fq=" + filter_query,
        "begin_date=" + begin_date,
        "end_date=" + end_date,
        "fl=" + field_list,
        "sort=" + sort,
    )
)
print(query_url)

In [None]:
# Accumulator for every review document fetched from the NYT API
love_movie_review = []

# NOTE(review): this single-page run sends the literal placeholder key "null"
# instead of nyt_api_key — presumably a deliberate dry run of the error
# handling; confirm before relying on it to fetch data.
nyt_apai_key = "null"

for page in range(1):
    # Append the page number and key to the shared query
    page_url = query_url + f'&page={page}&api-key={nyt_apai_key}'

    try:
        # Fetch the page and decode its JSON body
        articles = requests.get(page_url).json()
        print(f'Page {page}:', articles)

        # Pause right after the request to stay within API query limits
        time.sleep(12)

        # A page without documents means there is nothing more to collect
        if not articles["response"]["docs"]:
            print(f'Page {page} had no results')
            break

        # Keep every article returned on this page
        love_movie_review.extend(articles["response"]["docs"])

    except Exception as e:
        # Covers connection errors, JSON decoding errors, and payloads that
        # lack the expected "response"/"docs" keys
        print(f'An error occurred on page {page}: {e}')
        break

In [None]:

# Fetch up to ten result pages and add their articles to love_movie_review.
for page in range(0, 10):
    # Construct the query URL for the current page
    page_url = f'{query_url}&page={page}&api-key={nyt_api_key}'

    try:
        # A timeout keeps a stalled connection from hanging the loop forever
        http_response = requests.get(page_url, timeout=30)

        # A non-200 reply does not carry the expected "response" payload;
        # report the status code itself rather than failing later with an
        # opaque KeyError. The URL is not echoed because it contains the key.
        if http_response.status_code != 200:
            print(f'Page {page} request failed with HTTP status {http_response.status_code}, stopping.')
            break

        # Parse the JSON body; `response` keeps holding the parsed payload
        response = http_response.json()
        print(f'Page {page}:', response)

        # Check if the "docs" list is empty; if so, print a message and exit the loop
        page_docs = response["response"]["docs"]
        if not page_docs:
            print(f'No results on page {page}, stopping.')
            break

        # Otherwise, keep every article in "docs"
        love_movie_review.extend(page_docs)

        # Add a twelve second pause between requests to adhere to API query limits
        time.sleep(12)

    except (requests.exceptions.RequestException, ValueError, KeyError, TypeError) as e:
        # Connection problems, undecodable JSON, or an unexpected payload shape
        print(f'An error occurred: {e}')
        break

In [None]:
# Show the first five collected reviews as JSON, indented by four spaces
first_five_reviews = love_movie_review[:5]
print(json.dumps(first_five_reviews, indent=4))

In [None]:
# Build a DataFrame from the collected review dictionaries with json_normalize()
love_movie_review_df = pd.json_normalize(data=love_movie_review)
# Last expression of the cell: displays the first five rows in the notebook
love_movie_review_df.head(n=5)

In [None]:
# Extract the movie title from the "headline.main" column and save it to a
# new column "title".
# The title sits between the curly quotes \u2018 and \u2019. The opening quote
# must be at the start of the headline or follow a space, and the closing
# quote must be followed by a colon, a space, or the end of the headline, so
# an apostrophe inside a title (as in "What\u2019s") does not end the match early.
# Anchoring the end of the match on " Review" was tried and does not work:
# not every headline puts the title first, e.g.
# "Review: \u2018What\u2019s Love Got to Do With It?\u2019 Proba...".
pattern = r"(?:\u0020|^)\u2018(.+?)\u2019(?:\u003A|\u0020|$)"

headlines = love_movie_review_df['headline.main']

# str.extract applies the pattern to each headline and yields the capture
# group, or NaN where the pattern does not match (or the cell is not text)
extracted_titles = headlines.str.extract(pattern, expand=False)

# Report every headline from which no title could be parsed
for idx, headline in headlines[extracted_titles.isna()].items():
    print (f'{idx}  Title not found')
    print (f"{idx}  {headline}\t\t Title: not found")

# Assign the whole column in one step so the report above never has to read
# a "title" column that does not exist yet; unmatched rows get a placeholder
love_movie_review_df['title'] = extracted_titles.fillna("not found")


In [None]:
# Extract 'name' and 'value' from items in "keywords" column
def extract_keywords(keyword_list):
    """Flatten a list of keyword dicts into a single "name: value;" string.

    Args:
        keyword_list: Iterable of mappings that each provide the keys
            'name' and 'value'.

    Returns:
        One "name: value;" fragment per entry, concatenated in input order
        with nothing between fragments; an empty string for empty input.
    """
    return "".join(
        f"{entry['name']}: {entry['value']};" for entry in keyword_list
    )

# Fix the "keywords" column by converting cells from a list to a string.
# Cells that are already strings are left untouched so this cell can be
# re-run without extract_keywords failing on its own earlier output.
love_movie_review_df['keywords'] = love_movie_review_df['keywords'].apply(
    lambda cell: cell if isinstance(cell, str) else extract_keywords(cell)
)

# Show the converted value for every row
for idx, keywords in love_movie_review_df['keywords'].items():
    print(f"{idx}   {keywords}")



In [None]:
# Collect the "title" column into a plain Python list;
# these titles will be used in the query for The Movie Database
titles_list = love_movie_review_df.loc[:, 'title'].tolist()
# Last expression of the cell: displays the list in the notebook
titles_list


### Access The Movie Database API

In [None]:
# Prepare The Movie Database query.
# Search endpoint; a movie title is expected right after "query="
url = "https://api.themoviedb.org/3/search/movie?query="
# The API key as a follow-on parameter ("&...") and as a first parameter ("?...")
tmdb_key_string = "".join(["&api_key=", tmdb_api_key])
tmdb_key_string1 = "".join(["?api_key=", tmdb_api_key])

In [None]:
# Create an empty list to store the results
# NOTE(review): nothing is appended to this list yet, and the TODO below
# refers to "tmdb_movies_list" — confirm which name the later cells expect.
empty_list=[]

# Counter used to pause periodically: once it exceeds 18 the loop sleeps for
# 15 seconds and starts counting again
req_counter = 0

# Endpoints without a query string; parameters are passed through `params`
# so that titles containing "&", "#", "?" or spaces are URL-encoded correctly
tmdb_search_url = "https://api.themoviedb.org/3/search/movie"
tmdb_movie_url = "https://api.themoviedb.org/3/movie"

# Loop through the titles
for title in titles_list:
    # "not found" is the placeholder stored when no title could be parsed
    # from a headline; searching for it would only return an unrelated movie
    if title == "not found":
        continue

    # Check if we need to sleep before making a request
    if req_counter > 18:
        time.sleep(15)
        req_counter = 0
    else:
        # Add 1 to the request counter
        req_counter += 1

    # The except clauses below report a movie that is not found and any
    # requests-related failure, so one bad title does not stop the loop.
    try:
        # Search for the title; the timeout keeps a stalled connection from
        # hanging the loop
        search_response = requests.get(
            tmdb_search_url,
            params={"query": title, "api_key": tmdb_api_key},
            timeout=30,
        )
        if search_response.status_code != 200:
            print ("next")
            continue

        # Get movie id of the first search hit (IndexError when there is none)
        movie_id = search_response.json()['results'][0]['id']
        print(f"Movie ID Found: {movie_id}")

        # Make a request for the full movie details
        detail_response = requests.get(
            f"{tmdb_movie_url}/{movie_id}",
            params={"api_key": tmdb_api_key},
            timeout=30,
        )
        if detail_response.status_code != 200:
            print ("next")
            continue

        # Despite its name this holds the parsed JSON payload, not a DataFrame
        movie_detail_df = detail_response.json()
        print (movie_detail_df)

        # TODO: Extract the genre names into a list
        # TODO: Extract the spoken_languages' English name into a list
        # TODO: Extract the production_countries' name into a list
        # TODO: Add the relevant data to a dictionary and
        #       append it to the tmdb_movies_list list

        # Print out the title that was found
        print (f'{req_counter}  {title}')

    except (IndexError, KeyError):
        # The search returned no usable result for this title
        print(f"{title} not found.")
    except requests.exceptions.RequestException as e:
        # This catches all exceptions that are requests-related,
        # including connection errors, timeouts, etc.
        print("A network error occurred. Please try again later.")
        print(e)


In [None]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data


In [None]:
# Convert the results to a DataFrame


### Merge and Clean the Data for Export

In [None]:
# Merge the New York Times reviews and TMDB DataFrames on title


In [None]:
# Remove list brackets and quotation marks on the columns containing lists
# Create a list of the columns that need fixing


# Create a list of characters to remove


# Loop through the list of columns to fix

    # Convert the column to type 'str'


    # Loop through characters to remove


# Display the fixed DataFrame


In [None]:
# Drop "byline.person" column


In [None]:
# Delete duplicate rows and reset index


In [None]:
# Export data to CSV without the index
