### Import Required Libraries and Set Up Environment Variables

In [1]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json
import re
from my_toolkit import key_check
from urllib.parse import quote_plus


In [2]:
# Set environment variables from the .env in the local environment    
my_env_path= 'C:\src\\ai\data-sourcing-challenge\.data-sourcing_challenge.env'
'''
def check_keys(key_path=None):
    try:
        load_dotenv(key_path,override=True)   
        nyt_api_key = os.getenv("NYT_API_KEY")
        tmdb_api_key = os.getenv("TMDB_API_KEY")
        assert nyt_api_key is not None, 'NYT_API_KEY not found in .env file'
        assert tmdb_api_key is not None, 'TMDB_API_KEY not found in .env file'
        responce=requests.get(f'https://api.nytimes.com/svc/mostpopular/v2/viewed/1.json?api-key={nyt_api_key}')
        assert responce.status_code == 200, f'The key provided failed to authenticate nyt_api_key {nyt_api_key} code {responce.status_code}'
        responce=requests.get(f'https://api.themoviedb.org/3/movie/11?api_key={tmdb_api_key}')
        assert responce.status_code == 200, f'The key provided failed to authenticate tmdb_api_key {tmdb_api_key} code {responce.status_code}'
    except Exception as e:
        # Handle potential errors in loading .env or missing API keys
        print(f'An error occurred: {e}')
    else:
        print('All keys laoded correctly')
'''
if key_check(my_env_path):
    nyt_api_key = os.getenv("NYT_API_KEY")
    tmdb_api_key = os.getenv("TMDB_API_KEY")
else:
    print ('fix Keys and rerun')

All keys loaded correctly


### Access the New York Times API

In [3]:
# Sample from web page
# https://api.nytimes.com/svc/search/v2/articlesearch.json?q=new+york+times&page=2&sort=oldest&api-key=your-api-key 

#  Set the base URL
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# Filter for movie reviews with "love" in the headline
# section_name should be "Movies"
# type_of_material should be "Review"
filter_query = quote_plus('section_name:"Movies" AND type_of_material:"Review" AND headline:"love"')

# Use a sort filter, sort by newest
sort = "newest"

# Select the following fields to return:
# headline, web_url, snippet, source, keywords, pub_date, byline, word_count
field_list = quote_plus("headline,web_url,snippet,source,keywords,pub_date,byline,word_count")

# Search for reviews published between a begin and end date
begin_date = "20130101"
end_date = "20230531"

# Build URL
query_url=(f'{url}fq={filter_query}&begin_date={begin_date}&end_date={end_date}&fl={field_list}&sort={sort}')
display (query_url)

'https://api.nytimes.com/svc/search/v2/articlesearch.json?fq=section_name%3A%22Movies%22+AND+type_of_material%3A%22Review%22+AND+headline%3A%22love%22&begin_date=20130101&end_date=20230531&fl=headline%2Cweb_url%2Csnippet%2Csource%2Ckeywords%2Cpub_date%2Cbyline%2Cword_count&sort=newest'

In [4]:
# Create an empty list to store the reviews
love_movie_review=[]
# loop through pages 0-19
for page in range(0,20):
    # Construct the query URL for the current page
    page_url = f'{query_url}&page={page}&api-key={nyt_api_key}'

    try:
        # Attempt to make a "GET" request and parse the JSON response
        response = requests.get(page_url).json()

        # Check if the "docs" list is empty; if so, print a message and exit the loop
        if not (response["response"]["docs"]):
                raise ValueError (f'No results on page {page}, stopping.')
        else: 
            print(f'Checked page  {page}')            
        # Otherwise, process each article in "docs"
        for article in response["response"]["docs"]:
            if article :
                love_movie_review.append(article)
            # diag print(f'article title {article["headline"]["main"]}')
        # Add a twelve second pause between requests to adhere to API query limits
        time.sleep(12)
    except ValueError as e:
        # Handle the case where no documents are found
        print(e)
        break
    except Exception as e:
        # Handle potential errors in the request or data processing
        print(f'An error occurred: {e}')
        break

Checked page  0
Checked page  1
Checked page  2
Checked page  3
Checked page  4
Checked page  5
Checked page  6
Checked page  7
Checked page  8
Checked page  9
Checked page  10
Checked page  11
Checked page  12
Checked page  13
Checked page  14
Checked page  15
Checked page  16
Checked page  17
Checked page  18
Checked page  19


In [5]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data
print (json.dumps(love_movie_review[:5], indent=4))

[
    {
        "web_url": "https://www.nytimes.com/2023/05/25/movies/the-attachment-diaries-review.html",
        "snippet": "A gynecologist and her patient form a horrifyingly twisted connection in this batty, bloody Argentine melodrama.",
        "source": "The New York Times",
        "headline": {
            "main": "\u2018The Attachment Diaries\u2019 Review: Love, Sick",
            "kicker": null,
            "content_kicker": null,
            "print_headline": "The Attachment Diaries",
            "name": null,
            "seo": null,
            "sub": null
        },
        "keywords": [
            {
                "name": "subject",
                "value": "Movies",
                "rank": 1,
                "major": "N"
            },
            {
                "name": "creative_works",
                "value": "The Attachment Diaries (Movie)",
                "rank": 2,
                "major": "N"
            },
            {
                "name": "persons",
 

In [6]:
# Convert reviews_list to a Pandas DataFrame using json_normalize()
love_movie_review_df=pd.json_normalize(love_movie_review)
love_movie_review_df.iloc[0:4]

Unnamed: 0,web_url,snippet,source,keywords,pub_date,word_count,headline.main,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization
0,https://www.nytimes.com/2023/05/25/movies/the-...,A gynecologist and her patient form a horrifyi...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-05-25T11:00:03+0000,295,"‘The Attachment Diaries’ Review: Love, Sick",,,The Attachment Diaries,,,,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",
1,https://www.nytimes.com/2023/05/04/movies/what...,Two childhood friends navigate cultural differ...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-05-04T17:16:45+0000,287,Review: ‘What’s Love Got to Do With It?’ Proba...,,,What’s Love Got to Do With It?,,,,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",
2,https://www.nytimes.com/2023/05/04/movies/you-...,Religion comes between two girls falling in lo...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-05-04T11:00:08+0000,294,‘You Can Live Forever’ Review: Do You Love Me ...,,,You Can Live Forever,,,,By Elisabeth Vincentelli,"[{'firstname': 'Elisabeth', 'middlename': None...",
3,https://www.nytimes.com/2023/04/21/movies/a-to...,Rachael Leigh Cook stars in this bland rom-com...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-04-21T07:03:25+0000,276,‘A Tourist’s Guide to Love’ Review: A Wearying...,,,A Tourist’s Guide to Love,,,,By Elisabeth Vincentelli,"[{'firstname': 'Elisabeth', 'middlename': None...",


In [7]:
# Extract the title from the "headline.main" column and
# 
# Regular expression to match text enclosed by ‘ and ’
# save it to a new column "title"
# Title is between unicode characters \u2018 and \u2019.
# In testing I found that some reviews did not have titles.
# -----End string should include " Review" to avoid cutting title early----
# "" Review doesnt work "Review": ‘What’s Love Got to Do With It?’  not all titles come first. opted for u0020 before and after. 
pattern = r"(?:\u0020|^)\u2018(.+?)\u2019(?:\u003A|\u0020|$)"

def extract_title(headline):
    title_match = re.search(pattern, headline)
    if title_match:
        return title_match.group(1)
    else:
        print (headline)
        return "not found"

# Apply this function to the 'headline.main' column and assign the result to the 'title' column
love_movie_review_df['title'] = love_movie_review_df['headline.main'].apply(extract_title)	
love_movie_review_df[['snippet','title']].head(3)


Review: Those Movies, Himself — Bertrand Tavernier’s Tour of French Cinema


Unnamed: 0,snippet,title
0,A gynecologist and her patient form a horrifyi...,The Attachment Diaries
1,Two childhood friends navigate cultural differ...,What’s Love Got to Do With It?
2,Religion comes between two girls falling in lo...,You Can Live Forever


In [8]:
# Extract the title from the "headline.main" column and
# save it to a new column "title"
# Title is between unicode characters \u2018 and \u2019.
# -----End string should include " Review" to avoid cutting title early----
# 
# In testing I found that some reviews did not have titles.
# I found that some responses did not have title. The revised code prints a message with the headline  page and a message saying title not found. 
# "" Review doesnt work "Review": ‘What’s Love Got to Do With It?’  not all titles come first. 
# opted for u0020 before and after. 

pattern = r"(?:\u0020|^)\u2018(.+?)\u2019(?:\u003A|\u0020|$)"

def extract_title(headline, idx):
    title_match = re.search(pattern, headline)
    if title_match:
        # print(idx,headline)
        return title_match.group(1)
    else:
        print (f'not found {idx, headline}')
    #    return "not found"

# Apply this function to the 'headline.main' column and assign the result to the 'title' column
# Direct iteration over the column to access both the index and value
love_movie_review_df['title'] = love_movie_review_df.apply(lambda row: extract_title(row['headline.main'], row.name), axis=1)
display (love_movie_review_df[['snippet','title']].head(3))

not found (185, 'Review: Those Movies, Himself — Bertrand Tavernier’s Tour of French Cinema')


Unnamed: 0,snippet,title
0,A gynecologist and her patient form a horrifyi...,The Attachment Diaries
1,Two childhood friends navigate cultural differ...,What’s Love Got to Do With It?
2,Religion comes between two girls falling in lo...,You Can Live Forever


In [9]:
# Extract 'name' and 'value' from items in "keywords" column
def extract_keywords(keyword_list):
    extracted_keywords = ""
    extracted_keywords = ";".join(f"{item['name']}: {item['value']}" for item in keyword_list)
    # extracted_keywords = extracted_keywords.rstrip(';')
    return extracted_keywords
# this copy make it possible for me to rerun this without having to start from scratch
lmr_keyword_df = love_movie_review_df.copy(deep=True)

# Fix the "keywords" column by converting cells from a list to a string
lmr_keyword_df['keywords'] = lmr_keyword_df['keywords'].apply(extract_keywords)
# display(lmr_keyword_df[['title','keywords']].head(3))
#
#play time with style
styled_subset_df=lmr_keyword_df.loc[:4,['title','keywords']]
styled_df = styled_subset_df.style.set_table_styles({
    'title': [{'selector': '',
                'props': [('width', '200px'), ('text-align', 'right')]}],
    'keywords': [{'selector': '',
                'props': [('width', '700px'), ('text-align', 'left')]}]
}, overwrite=False).hide(axis=0)
# display(styled_df)
display (lmr_keyword_df)
display (styled_df)
del styled_df

Unnamed: 0,web_url,snippet,source,keywords,pub_date,word_count,headline.main,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization,title
0,https://www.nytimes.com/2023/05/25/movies/the-...,A gynecologist and her patient form a horrifyi...,The New York Times,subject: Movies;creative_works: The Attachment...,2023-05-25T11:00:03+0000,295,"‘The Attachment Diaries’ Review: Love, Sick",,,The Attachment Diaries,,,,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",,The Attachment Diaries
1,https://www.nytimes.com/2023/05/04/movies/what...,Two childhood friends navigate cultural differ...,The New York Times,"subject: Movies;persons: Kapur, Shekhar;person...",2023-05-04T17:16:45+0000,287,Review: ‘What’s Love Got to Do With It?’ Proba...,,,What’s Love Got to Do With It?,,,,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",,What’s Love Got to Do With It?
2,https://www.nytimes.com/2023/05/04/movies/you-...,Religion comes between two girls falling in lo...,The New York Times,subject: Movies;creative_works: You Can Live F...,2023-05-04T11:00:08+0000,294,‘You Can Live Forever’ Review: Do You Love Me ...,,,You Can Live Forever,,,,By Elisabeth Vincentelli,"[{'firstname': 'Elisabeth', 'middlename': None...",,You Can Live Forever
3,https://www.nytimes.com/2023/04/21/movies/a-to...,Rachael Leigh Cook stars in this bland rom-com...,The New York Times,subject: Movies;creative_works: A Tourist's Gu...,2023-04-21T07:03:25+0000,276,‘A Tourist’s Guide to Love’ Review: A Wearying...,,,A Tourist’s Guide to Love,,,,By Elisabeth Vincentelli,"[{'firstname': 'Elisabeth', 'middlename': None...",,A Tourist’s Guide to Love
4,https://www.nytimes.com/2023/04/20/movies/othe...,A radiant Virginie Efira stars as a Parisian t...,The New York Times,"subject: Movies;persons: Zlotowski, Rebecca;cr...",2023-04-20T15:35:13+0000,801,‘Other People’s Children’ Review: True Romance,Critic’s pick,,Intoxicating Love With a Sobering Turn,,,,By Manohla Dargis,"[{'firstname': 'Manohla', 'middlename': None, ...",,Other People’s Children
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,https://www.nytimes.com/2017/03/09/movies/the-...,This moody romance stars Tatiana Maslany (“Orp...,The New York Times,subject: Movies;creative_works: The Other Half...,2017-03-09T21:54:58+0000,251,Review: A Combustible Pair Find Love in ‘The O...,,,Review: A Combustible Pair Find Love in ‘The O...,,,,By Andy Webster,"[{'firstname': 'Andy', 'middlename': None, 'la...",,The Other Half
196,https://www.nytimes.com/2017/03/09/movies/revi...,A nurse travels to the Ottoman Empire on the e...,The New York Times,subject: Movies;creative_works: The Ottoman Li...,2017-03-09T21:53:12+0000,267,"Review: Love as the World Wars, in ‘The Ottoma...",,,"Review: Love as the World Wars, in ‘The Ottoma...",,,,By Neil Genzlinger,"[{'firstname': 'Neil', 'middlename': None, 'la...",,The Ottoman Lieutenant
197,https://www.nytimes.com/2017/03/02/movies/love...,Josh Kornbluth runs afoul of the Internal Reve...,The New York Times,creative_works: Love & Taxes (Movie);persons: ...,2017-03-02T21:44:18+0000,246,Review: It’s All Mirth and Taxes in ‘Love & Ta...,,,"It’s Inevitable, Mirth and Taxes",,,,By Ken Jaworowski,"[{'firstname': 'Ken', 'middlename': None, 'las...",,Love & Taxes
198,https://www.nytimes.com/2017/02/16/movies/ever...,A messed-up heroine is asked to choose between...,The New York Times,subject: Movies;creative_works: Everybody Love...,2017-02-16T21:45:50+0000,256,"Review: ‘Everybody Loves Somebody,’ a Rom-Com ...",,,Everybody Loves Somebody,,,,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",,"Everybody Loves Somebody,"


title,keywords
The Attachment Diaries,"subject: Movies;creative_works: The Attachment Diaries (Movie);persons: Diment, Valentin Javier"
What’s Love Got to Do With It?,"subject: Movies;persons: Kapur, Shekhar;persons: James, Lily;persons: Azmi, Shabana;persons: Thompson, Emma"
You Can Live Forever,"subject: Movies;creative_works: You Can Live Forever (Movie);persons: Slutsky, Mark;persons: Watts, Sarah (Film Director);persons: O'Driscoll, Anwen;persons: Laporte, June (Actor)"
A Tourist’s Guide to Love,"subject: Movies;creative_works: A Tourist's Guide to Love (Movie);persons: Tsuchida, Steven;persons: Cook, Rachael Leigh"
Other People’s Children,"subject: Movies;persons: Zlotowski, Rebecca;creative_works: Other People's Children (Movie)"


In [10]:
# Create a list from the "title" column using to_list()
# These titles will be used in the query for The Movie Database
titles_list = love_movie_review_df['title'].to_list()
print("Top 5 Titles:\n" + json.dumps(titles_list[:5], indent= 4, ensure_ascii=False)[1:-1])

Top 5 Titles:

    "The Attachment Diaries",
    "What’s Love Got to Do With It?",
    "You Can Live Forever",
    "A Tourist’s Guide to Love",
    "Other People’s Children"



### Access The Movie Database API

In [11]:
# Prepare The Movie Database query
url_query   = "https://api.themoviedb.org/3/search/movie?query="
url_detail  = "https://api.themoviedb.org/3/movie/"

In [13]:
# Create an empty list to store the results
tmbd_movies_list=[]

# Create a request counter to sleep the requests after a multiple
# of 50 requests
req_counter = 0
'''
https://developer.themoviedb.org/docs/rate-limiting
While our legacy rate limits have been disabled for some time, 
we do still have some upper limits to help mitigate needlessly high bulk scraping. 
They sit somewhere in the 50 requests per second range. 
This limit could change at any time so be respectful of the service we have built and respect the 429 if you receive one.
'''
# Loop through the titles
for title in titles_list:
    # Check if we need to sleep before making a request
    if req_counter == 25:
        print ('naptime')
        time.sleep(1)
        req_counter=0
    else:
    # Add 1 to the request counter
        req_counter += 1
    # Perform a "GET" request for The Movie Database
    response=requests.get(f'{url_query}{title}"&api_key={tmdb_api_key}')

    # Include a try clause to search for the full movie details.
    # Use the except clause to print out a statement if a movie
    # is not found.
    try:
        # Get movie id    
        if response.status_code == 200:
        # Get movie id
            data = response.json()
            if data['results']:
                movie_id = data['results'][0]['id']
                # print(f"Movie ID Found: {movie_id}  {req_counter}  {title}")
        # Make a request for the full movie details
        # Execute "GET" request with url
                detail_response = requests.get(f"{url_detail}{movie_id}?api_key={tmdb_api_key}")
                if detail_response.status_code == 200:
                    movie_detail_df = detail_response.json()
                    # display (detail_response.json())
                    # print (json.dumps(movie_detail_df,indent=4))

        # Extract the genre names into a list
                    genres_list = [genre['name'] for genre in movie_detail_df['genres']]
                    # print (genres_list)
        # Extract the spoken_languages' English name into a list
                    spoken_languages = [spoken_language['english_name'] for spoken_language in movie_detail_df['spoken_languages']]
                    # print(spoken_languages)

        # Sample 'spoken_languages': [{'english_name': 'Spanish', 'iso_639_1': 'es', 'name': 'Español'}],
        # Extract the production_countries' name into a list
                    production_countries_name = [production_countries_name['name'] for production_countries_name in movie_detail_df['production_countries']]
                    
                    # 'production_countries': [{'iso_3166_1': 'AR', 'name': 'Argentina'}],
                    # print(production_countries_name)
        # Add the relevant data to a dictionary and
        
        # append it to the tmdb_movies_list list
                    tmbd_movies_list.append({
                    'movie_id'        : movie_id,
                    'movie_title'     : title,
                    'original_title'  : (movie_detail_df['original_title']),
                    'budget'          : (movie_detail_df['budget']),
                    'gendre_list'     : genres_list,
                    'original_language'        : (movie_detail_df['original_language']),
                    'spoken_languages': spoken_languages,
                    'homepage'        : (movie_detail_df['homepage']),
                    'overview'        : (movie_detail_df['overview']),
                    'popularity'      : (movie_detail_df['popularity']),
                    'runtime'         : (movie_detail_df['runtime']),
                    'revenue'         : (movie_detail_df['revenue']),
                    'release_date'    : (movie_detail_df['release_date']),
                    'vote_average'    : (movie_detail_df['vote_average']),
                    'vote_count'      : (movie_detail_df['vote_count']),
                    'production_counties_name': production_countries_name})
                    
                else:
                    raise ("put a exception here") 
        # Print out the title that was found
            print (f'Found  {title}')
        else:
            print(f"Failed to get movie ID, status code: {response.status_code}      {title}")
        
    except requests.exceptions.RequestException as e:
    # This catches all exceptions that are requests-related
    # Including connection errors, timeouts, etc.
        print("A network error occurred. Please try again later.")
        print(e)
display (tmbd_movies_list)


Found  The Attachment Diaries
Found  What’s Love Got to Do With It?
Found  You Can Live Forever
Found  A Tourist’s Guide to Love
Found  Other People’s Children
Found  One True Loves
Found  The Lost Weekend: A Love Story
Found  A Thousand and One
Found  Your Place or Mine
Found  Love in the Time of Fentanyl
Found  Pamela, a Love Story
Found  In From the Side
Found  After Love
Found  Alcarràs
Found  Nelly & Nadine
Found  Lady Chatterley’s Lover
Found  The Sound of Christmas
Found  The Inspection
Found  Bones and All
Found  My Policeman
Found  About Fate
Found  Waiting for Bojangles
Found  I Love My Dad
Found  A Love Song
Found  Alone Together
naptime
Found  Art of Love
Found  The Wheel
Found  Thor: Love and Thunder
Found  Both Sides of the Blade
Found  Fire of Love
Found  Love & Gelato
Found  Stay Prayed Up
Found  Benediction
Found  Dinner in America
Found  In a New York Minute
Found  Anaïs in Love
Found  I Love America
Found  See You Then
Found  La Mami
Found  Love After Love
Found  Dee

[{'movie_id': 743040,
  'movie_title': 'The Attachment Diaries',
  'original_title': 'El apego',
  'budget': 0,
  'gendre_list': ['Drama', 'Mystery', 'Thriller', 'Horror'],
  'original_language': 'es',
  'spoken_languages': ['Spanish'],
  'homepage': '',
  'overview': 'Argentina, 1970s. A desperate young woman goes to a clinic to have a clandestine abortion. As her pregnancy is already through the fourth month, the doctor refuses. Instead, she proposes to sell the baby to one of her clients and offers to provide shelter in her house until the child is born. Their disturbed personalities will become intertwined in a strange and dangerous relationship.',
  'popularity': 1.661,
  'runtime': 102,
  'revenue': 0,
  'release_date': '2021-10-07',
  'vote_average': 3.0,
  'vote_count': 3,
  'production_counties_name': ['Argentina']},
 {'movie_id': 800301,
  'movie_title': 'What’s Love Got to Do With It?',
  'original_title': "What's Love Got to Do with It?",
  'budget': 0,
  'gendre_list': ['R

In [14]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data
print (json.dumps(tmbd_movies_list[:5],indent=4))

[
    {
        "movie_id": 743040,
        "movie_title": "The Attachment Diaries",
        "original_title": "El apego",
        "budget": 0,
        "gendre_list": [
            "Drama",
            "Mystery",
            "Thriller",
            "Horror"
        ],
        "original_language": "es",
        "spoken_languages": [
            "Spanish"
        ],
        "homepage": "",
        "overview": "Argentina, 1970s. A desperate young woman goes to a clinic to have a clandestine abortion. As her pregnancy is already through the fourth month, the doctor refuses. Instead, she proposes to sell the baby to one of her clients and offers to provide shelter in her house until the child is born. Their disturbed personalities will become intertwined in a strange and dangerous relationship.",
        "popularity": 1.661,
        "runtime": 102,
        "revenue": 0,
        "release_date": "2021-10-07",
        "vote_average": 3.0,
        "vote_count": 3,
        "production_counties_n

In [15]:
# Convert the results to a DataFrame
tmdb_movies_df = pd.DataFrame(tmbd_movies_list)
display (tmdb_movies_df.head(3))
display (love_movie_review_df.head(3))


Unnamed: 0,movie_id,movie_title,original_title,budget,gendre_list,original_language,spoken_languages,homepage,overview,popularity,runtime,revenue,release_date,vote_average,vote_count,production_counties_name
0,743040,The Attachment Diaries,El apego,0,"[Drama, Mystery, Thriller, Horror]",es,[Spanish],,"Argentina, 1970s. A desperate young woman goes...",1.661,102,0,2021-10-07,3.0,3,[Argentina]
1,800301,What’s Love Got to Do With It?,What's Love Got to Do with It?,0,"[Romance, Comedy]",en,"[English, Portuguese, Urdu]",,Two childhood friends now in their thirties mu...,19.378,109,10898395,2023-01-26,6.1,160,"[France, United Kingdom]"
2,887580,You Can Live Forever,You Can Live Forever,0,"[Drama, Romance]",en,"[English, French]",https://gooddeedentertainment.com/you-can-live...,"When Jaime, a gay teenager, is sent to live in...",59.685,96,15055,2023-03-24,6.6,31,"[Canada, United States of America]"


Unnamed: 0,web_url,snippet,source,keywords,pub_date,word_count,headline.main,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization,title
0,https://www.nytimes.com/2023/05/25/movies/the-...,A gynecologist and her patient form a horrifyi...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-05-25T11:00:03+0000,295,"‘The Attachment Diaries’ Review: Love, Sick",,,The Attachment Diaries,,,,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",,The Attachment Diaries
1,https://www.nytimes.com/2023/05/04/movies/what...,Two childhood friends navigate cultural differ...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-05-04T17:16:45+0000,287,Review: ‘What’s Love Got to Do With It?’ Proba...,,,What’s Love Got to Do With It?,,,,By Jeannette Catsoulis,"[{'firstname': 'Jeannette', 'middlename': None...",,What’s Love Got to Do With It?
2,https://www.nytimes.com/2023/05/04/movies/you-...,Religion comes between two girls falling in lo...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-05-04T11:00:08+0000,294,‘You Can Live Forever’ Review: Do You Love Me ...,,,You Can Live Forever,,,,By Elisabeth Vincentelli,"[{'firstname': 'Elisabeth', 'middlename': None...",,You Can Live Forever


### Merge and Clean the Data for Export

In [23]:
# Merge the New York Times reviews and TMDB DataFrames on title
tmdb_movies_df.rename(columns={'movie_title':'title'}, inplace=True)
merged_df = pd.merge(tmdb_movies_df, love_movie_review_df, on='title')
nyt_tmdb_df = merged_df.copy()
print (merged_df.columns)
print (merged_df.head(2))

Index(['movie_id', 'title', 'original_title', 'budget', 'gendre_list',
       'original_language', 'spoken_languages', 'homepage', 'overview',
       'popularity', 'runtime', 'revenue', 'release_date', 'vote_average',
       'vote_count', 'production_counties_name', 'web_url', 'snippet',
       'source', 'keywords', 'pub_date', 'word_count', 'headline.main',
       'headline.kicker', 'headline.content_kicker', 'headline.print_headline',
       'headline.name', 'headline.seo', 'headline.sub', 'byline.original',
       'byline.person', 'byline.organization'],
      dtype='object')
   movie_id                           title                  original_title  \
0    743040          The Attachment Diaries                        El apego   
1    800301  What’s Love Got to Do With It?  What's Love Got to Do with It?   

   budget                         gendre_list original_language  \
0       0  [Drama, Mystery, Thriller, Horror]                es   
1       0                   [Romance, Come

In [26]:
# Remove list brackets and quotation marks on the columns containing lists
# Create a list of the columns that need fixing
def contains_list(series):
    return series.map(lambda x: isinstance(x, list)).any()

columns_with_lists = [col for col in nyt_tmdb_df.columns if contains_list(nyt_tmdb_df[col])]

print  (f'\n{columns_with_lists}\n' )
# Create a list of characters to remove
def clean_list_string(s):
    if isinstance(s, list):
        # Convert list to string and remove unwanted characters
        return str(s).strip('[]').replace('"', '').replace("'", "")
    return s

# Apply the cleaning directly, no need for preliminary list check
for col in columns_with_lists:
   nyt_tmdb_df[col] = nyt_tmdb_df[col].apply(clean_list_string)

# Display the fixed DataFrame
print (nyt_tmdb_df.head(2))
print (nyt_tmdb_df.columns)



[]

   movie_id                           title                  original_title  \
0    743040          The Attachment Diaries                        El apego   
1    800301  What’s Love Got to Do With It?  What's Love Got to Do with It?   

   budget                       gendre_list original_language  \
0       0  Drama, Mystery, Thriller, Horror                es   
1       0                   Romance, Comedy                en   

            spoken_languages homepage  \
0                    Spanish            
1  English, Portuguese, Urdu            

                                            overview  popularity  ...  \
0  Argentina, 1970s. A desperate young woman goes...       1.661  ...   
1  Two childhood friends now in their thirties mu...      19.378  ...   

                                       headline.main  headline.kicker  \
0        ‘The Attachment Diaries’ Review: Love, Sick             None   
1  Review: ‘What’s Love Got to Do With It?’ Proba...             None  

In [27]:
# Drop "byline.person" column
nyt_tmdb_df.drop(columns=['byline.person'], inplace=True)

In [29]:
# Delete duplicate rows and reset index
print(nyt_tmdb_df[nyt_tmdb_df.duplicated(keep=False)])
#duplicates = nyt_tmdb_df.duplicated()
#print(duplicates)

    movie_id       title original_title  budget gendre_list original_language  \
12    660002  After Love           第一炉香       0       Drama                zh   
13    660002  After Love           第一炉香       0       Drama                zh   
14    660002  After Love           第一炉香       0       Drama                zh   
15    660002  After Love           第一炉香       0       Drama                zh   

   spoken_languages homepage  \
12         Mandarin            
13         Mandarin            
14         Mandarin            
15         Mandarin            

                                             overview  popularity  ...  \
12  The film tells the story of a young girl who t...       5.443  ...   
13  The film tells the story of a young girl who t...       5.443  ...   
14  The film tells the story of a young girl who t...       5.443  ...   
15  The film tells the story of a young girl who t...       5.443  ...   

    word_count                                      headline.m

In [20]:
# Export data to CSV without the index
