In [4]:
# install dependencies
%pip install -r ../requirements.txt

Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Downloading pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl (11.3 MB)
[K     |████████████████████████████████| 11.3 MB 4.3 MB/s eta 0:00:01
[?25hCollecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
[K     |████████████████████████████████| 294 kB 17.1 MB/s eta 0:00:01
Collecting tzdata>=2022.7
  Downloading tzdata-2024.2-py2.py3-none-any.whl (346 kB)
[K     |████████████████████████████████| 346 kB 34.2 MB/s eta 0:00:01
[?25hCollecting numpy>=1.22.4
  Downloading numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 80.2 MB/s eta 0:00:01
[?25hCollecting pytz>=2020.1
  Downloading pytz-2024.2-py2.py3-none-any.whl (508 kB)
[K     |████████████████████████████████| 508 kB 56.4 MB/s eta 0:00:01
Collecting matplotlib!=3.6.1,>=3.4
  Downloading matplotlib-3.9.2-cp39-cp39-macosx_11_0_arm64.whl (7.8 MB)
[K     |███████████

In [27]:
# setup
import os
import pandas as pd
import tmdbsimple as tmdb
from typing import Dict
from IPython.display import display


tmdb.API_KEY = os.environ["TMDB_API_KEY"]

In [19]:
# read file
movie_list = pd.read_csv('../data/movie_list.csv')

watched_movies = movie_list.loc[movie_list.Completed.notnull()]

display(watched_movies[["Name", "imdb"]])

print(watched_movies.dtypes)

Unnamed: 0,Name,imdb
0,Us,https://www.imdb.com/title/tt6857112/
1,Annabelle,https://www.imdb.com/title/tt3322940/
2,Up in the Air,https://www.imdb.com/title/tt1193138/
3,The Conjuring,https://www.imdb.com/title/tt1457767/
4,A Quiet Place,https://imdb.com/title/tt6644200/
...,...,...
104,Once Upon A Time in Hollywood,https://www.imdb.com/title/tt7131622/
105,The Girl With the Dragon Tattoo,https://www.imdb.com/title/tt1568346/
106,Nimona,https://www.imdb.com/title/tt19500164/
107,Spy Kids,https://www.imdb.com/title/tt0227538/


Name                   object
Status                 object
Completed              object
Collin Rating         float64
Valerie Rating        float64
Genre                  object
Source                 object
Previously Seen By     object
Chosen By              object
imdb                   object
Release Date           object
dtype: object


In [25]:
# helper func, get imdb id from link
def imdb_link_to_id(imdb_link: str) -> str:
    return imdb_link.split("/")[-2]

print(imdb_link_to_id("https://imdb.com/title/tt6644200/"))

tt6644200


In [26]:
watched_movies["imdb_id"] = watched_movies["imdb"].apply(imdb_link_to_id)

display(watched_movies[["Name", "imdb_id"]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  watched_movies["imdb_id"] = watched_movies["imdb"].apply(imdb_link_to_id)


Unnamed: 0,Name,imdb_id
0,Us,tt6857112
1,Annabelle,tt3322940
2,Up in the Air,tt1193138
3,The Conjuring,tt1457767
4,A Quiet Place,tt6644200
...,...,...
104,Once Upon A Time in Hollywood,tt7131622
105,The Girl With the Dragon Tattoo,tt1568346
106,Nimona,tt19500164
107,Spy Kids,tt0227538


In [43]:
# get extra info from tmdb
def get_movie_info(imdb_id: str) -> Dict:
    found_data: Dict = tmdb.Find(imdb_id).info(external_source="imdb_id")["movie_results"][0]
    return {
        "tmdb_id": found_data.get("id", -1),
        "summary": found_data.get("overview", ""),
        "adult": found_data.get("adult", False),
        "genre_ids": found_data.get("genre_ids", []),
        "popularity": found_data.get("popularity"),
        "vote_average": found_data.get("vote_average"),
        "vote_count": found_data.get("vote_count"),
        "budget": found_data.get("budget"),
        "runtime": found_data.get("runtime")
    }

print(get_movie_info("tt3322940"))

{'tmdb_id': 250546, 'summary': 'A couple begins to experience terrifying supernatural occurrences involving a vintage doll shortly after their home is invaded by satanic cultists.', 'adult': False, 'genre_ids': [27], 'popularity': 64.794, 'vote_average': 5.77, 'vote_count': 6044, 'budget': None, 'runtime': None}


In [44]:
# enhance with extra info from tmdb
watched_movies["extra_info"] = watched_movies["imdb_id"].map(get_movie_info)

display(watched_movies[["Name", "extra_info"]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  watched_movies["extra_info"] = watched_movies["imdb_id"].map(get_movie_info)


Unnamed: 0,Name,extra_info
0,Us,"{'tmdb_id': 458723, 'summary': 'Husband and wi..."
1,Annabelle,"{'tmdb_id': 250546, 'summary': 'A couple begin..."
2,Up in the Air,"{'tmdb_id': 22947, 'summary': 'Corporate downs..."
3,The Conjuring,"{'tmdb_id': 138843, 'summary': 'Paranormal inv..."
4,A Quiet Place,"{'tmdb_id': 447332, 'summary': 'A family is fo..."
...,...,...
104,Once Upon A Time in Hollywood,"{'tmdb_id': 466272, 'summary': 'Los Angeles, 1..."
105,The Girl With the Dragon Tattoo,"{'tmdb_id': 65754, 'summary': 'Disgraced journ..."
106,Nimona,"{'tmdb_id': 961323, 'summary': 'A knight frame..."
107,Spy Kids,"{'tmdb_id': 10054, 'summary': 'Carmen and Juni..."


In [45]:
# explode extra_info out into unique columns
def getColumn(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    df[column_name] = df["extra_info"].map(lambda x: x[column_name])
    return df
watched_movies = getColumn(watched_movies, "tmdb_id")
watched_movies = getColumn(watched_movies, "summary")
watched_movies = getColumn(watched_movies, "adult")
watched_movies = getColumn(watched_movies, "genre_ids")
watched_movies = getColumn(watched_movies, "popularity")
watched_movies = getColumn(watched_movies, "vote_average")
watched_movies = getColumn(watched_movies, "vote_count")
watched_movies = getColumn(watched_movies, "budget")
watched_movies = getColumn(watched_movies, "runtime")

display(watched_movies[["Name", "tmdb_id", "summary", "adult", "genre_ids", "popularity", "vote_average", "vote_count", "budget", "runtime"]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name] = df["extra_info"].map(lambda x: x[column_name])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name] = df["extra_info"].map(lambda x: x[column_name])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name] = df["extra_info"].map(lambda x: x[column_name])


Unnamed: 0,Name,tmdb_id,summary,adult,genre_ids,popularity,vote_average,vote_count,budget,runtime
0,Us,458723,Husband and wife Gabe and Adelaide Wilson take...,False,"[27, 53]",35.044,7.000,7257,,
1,Annabelle,250546,A couple begins to experience terrifying super...,False,[27],64.794,5.770,6044,,
2,Up in the Air,22947,Corporate downsizing expert Ryan Bingham spend...,False,"[18, 10749]",19.648,6.900,3575,,
3,The Conjuring,138843,Paranormal investigators Ed and Lorraine Warre...,False,"[27, 53]",143.517,7.500,11437,,
4,A Quiet Place,447332,A family is forced to live in silence while hi...,False,"[27, 18, 878]",96.357,7.400,14058,,
...,...,...,...,...,...,...,...,...,...,...
104,Once Upon A Time in Hollywood,466272,"Los Angeles, 1969. TV star Rick Dalton, a stru...",False,"[35, 18, 53]",58.632,7.439,13319,,
105,The Girl With the Dragon Tattoo,65754,Disgraced journalist Mikael Blomkvist investig...,False,"[53, 80, 9648]",28.766,7.374,6909,,
106,Nimona,961323,A knight framed for a tragic crime teams with ...,False,"[12, 16, 14, 10751, 28, 878]",34.169,7.907,1061,,
107,Spy Kids,10054,Carmen and Juni think their parents are boring...,False,"[28, 35, 10751, 12]",27.216,5.796,3394,,


In [42]:
# write out modified data to use elsewhere
watched_movies.to_csv("../data/enhanced_movie_list.csv", index=False)