Loading the watchlist into a DataFrame

In [1]:
from pathlib import Path
import pandas as pd

# Location of the exported watchlist CSV, relative to the working directory.
watchlist_path = Path("data") / "watchlist.csv"
# Load the whole watchlist; later cells read and extend this frame.
watchlist_df = pd.read_csv(filepath_or_buffer=watchlist_path)

Showing the dataframe

In [2]:
# Print the frame as plain text; to_string() renders every row and column without truncation.
print(watchlist_df.to_string())


           Date                                             Name  Year        Letterboxd URI
0    2024-03-15                          The Wolf of Wall Street  2013  https://boxd.it/3C1m
1    2024-03-17                                    The Holdovers  2023  https://boxd.it/vHza
2    2024-06-10                            Bad Boys: Ride or Die  2024  https://boxd.it/l3Ki
3    2024-06-10                               Godzilla Minus One  2023  https://boxd.it/zu4c
4    2024-06-10                               Planet of the Apes  1968  https://boxd.it/29p8
5    2024-06-10                                        Inception  2010  https://boxd.it/1skk
6    2024-06-10                                      Oppenheimer  2023  https://boxd.it/wUow
7    2024-06-10                                      Glass Onion  2022  https://boxd.it/oCkK
8    2024-06-10                      Ghostbusters: Frozen Empire  2024  https://boxd.it/AxYu
9    2024-06-10                                 American Fiction  2023

Rename incompatible movie names

In [4]:
# Watchlist titles that need a different spelling than the exported one
# (presumably so they match the titles returned by the movie lookup — TODO confirm).
# Keys are compared against the WHOLE title, not as substrings, so re-running this
# cell after the CSV has been rewritten cannot turn "F1: The Movie" into
# "F1: The Movie: The Movie", and titles that merely contain "F1" are left untouched.
TITLE_FIXES = {
    "Fake Blonde": "Falsa Loura",
    "Horizon: An American Saga – Chapter 2": "Horizon: An American Saga - Chapter 2",
    "Rio, 100 Degrees F°": "Rio, 40 Degrees",
    "F1": "F1: The Movie",
}
# NOTE(review): the previous version also replaced "The Blue Trail" with the identical
# string, which is a no-op as written; add the intended source spelling here if one exists.

watchlist_df["Name"] = watchlist_df["Name"].replace(TITLE_FIXES)

# Persist the corrected titles to the CSV the watchlist is loaded from.
watchlist_df.to_csv(watchlist_path, index=False)

Fetching extra info about each movie and adding it to the DataFrame

In [5]:
import os
import traceback

import requests

# Base URL of the OMDb API (HTTPS, so the API key is not sent in clear text).
BASE_URL = "https://www.omdbapi.com/"
# The API key is read from the environment instead of being stored in the notebook.
OMDB_API_KEY = os.environ.get("OMDB_API_KEY")
# Seconds to wait for a single OMDb request before giving up on it.
REQUEST_TIMEOUT = 10
# Response fields copied into DataFrame columns of the same name.
OMDB_FIELDS = ("Genre", "Director", "Writer", "Actors", "Language", "Country", "Poster")

if watchlist_df.empty:
    print("There are no new movies to process. Exiting.")

elif not OMDB_API_KEY:
    raise RuntimeError("Set the OMDB_API_KEY environment variable before running this cell.")

else:
    for i, row in watchlist_df.iterrows():
        params = {"apikey": OMDB_API_KEY, "t": row["Name"]}
        try:
            response = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
            # Surface HTTP errors (bad key, server failure) through the except branch
            # below instead of reporting them as a title mismatch.
            response.raise_for_status()
            data = response.json()

            print(f"Processando filme {i}: {row['Name']}")

            # Only accept the result when the returned title matches the requested one
            # (case-insensitive); a title lookup may return a different movie.
            if "Title" in data and row["Name"].lower() == data["Title"].lower():
                for field in OMDB_FIELDS:
                    watchlist_df.loc[i, field] = data.get(field, "")
                print(f"Informações adicionadas com sucesso para o filme: {row['Name']}\n")
            else:
                print(f"\n ******** Título não corresponde ou não encontrado: {row['Name']} != {data.get('Title', 'N/A')} ******* \n")
        except Exception as e:
            # Best effort: report the failure for this row and continue with the next one.
            print(f"Erro no índice {i}: {e}")
            print("Traceback:")
            traceback.print_exc()


Processando filme 0: The Wolf of Wall Street
Informações adicionadas com sucesso para o filme: The Wolf of Wall Street

Processando filme 1: The Holdovers
Informações adicionadas com sucesso para o filme: The Holdovers

Processando filme 2: Bad Boys: Ride or Die
Informações adicionadas com sucesso para o filme: Bad Boys: Ride or Die

Processando filme 3: Godzilla Minus One
Informações adicionadas com sucesso para o filme: Godzilla Minus One

Processando filme 4: Planet of the Apes
Informações adicionadas com sucesso para o filme: Planet of the Apes

Processando filme 5: Inception
Informações adicionadas com sucesso para o filme: Inception

Processando filme 6: Oppenheimer
Informações adicionadas com sucesso para o filme: Oppenheimer

Processando filme 7: Glass Onion
Informações adicionadas com sucesso para o filme: Glass Onion

Processando filme 8: Ghostbusters: Frozen Empire
Informações adicionadas com sucesso para o filme: Ghostbusters: Frozen Empire

Processando filme 9: American Fi

Saving the df

In [6]:
# Write the frame back to watchlist_path as CSV, leaving out the index column.
watchlist_df.to_csv(watchlist_path, index=False)


Separating the decades present in the watchlist

In [7]:
def separate_movie_by_decade(newest_df):
    """Build the decade buckets spanned by the "Year" column of ``newest_df``.

    Buckets are keyed ``"<first>-<last>"`` and end on years divisible by 10
    (e.g. ``"1941-1950"``). The first bucket starts at the earliest year and the
    last one ends at the latest year, so both may cover less than ten years.

    Args:
        newest_df: DataFrame with a numeric "Year" column; missing years are ignored.

    Returns:
        dict mapping each bucket key to the placeholder ``[None]``, in ascending
        order. Empty when the frame has no usable year.
    """
    years = newest_df["Year"].dropna()
    if years.empty:
        return {}

    # int() also normalises float years (a column containing NaN is float) for range().
    min_year = int(years.min())
    max_year = int(years.max())

    decades = {}
    bucket_start = min_year
    for year in range(min_year, max_year + 1):
        if year % 10 == 0:
            decades[f"{bucket_start}-{year}"] = [None]
            bucket_start = year + 1

    # Add the trailing partial bucket only when it is non-empty; when max_year is a
    # multiple of 10 the loop already closed the last bucket and adding another
    # would create an inverted key such as "2021-2020".
    if bucket_start <= max_year:
        decades[f"{bucket_start}-{max_year}"] = [None]

    return decades


In [8]:
from collections import Counter

def separate_topics(watchlist_df, topic, top_n=5):
    """Count the individual entries of a comma-separated column.

    Each non-missing cell of ``watchlist_df[topic]`` is split on ``", "`` and every
    resulting entry is counted once per occurrence.

    Args:
        watchlist_df: DataFrame holding the column to analyse.
        topic: Name of the column whose cells are ``", "``-separated strings.
        top_n: How many of the most frequent entries to return (default 5).

    Returns:
        dict mapping entry -> number of occurrences, most frequent first.
    """
    counts = Counter()
    for item in watchlist_df[topic]:
        if pd.isna(item):
            continue
        for entry in item.split(", "):
            entry = entry.strip()
            # Skip blanks: a cell filled with "" would otherwise be counted as a topic.
            if entry:
                counts[entry] += 1

    return dict(counts.most_common(top_n))

In [9]:
decades = separate_movie_by_decade(watchlist_df)

# Fill every decade bucket with the titles released in that span, walking the
# years in ascending order so each list ends up ordered by year.
for span in decades:
    # Keys look like "1941-1950": four digits, a dash, then the last year.
    first_year, last_year = int(span[:4]), int(span[5:])
    decades[span] = [
        title
        for year in range(first_year, last_year + 1)
        for title in watchlist_df.loc[watchlist_df["Year"] == year, "Name"]
    ]

for span, titles in decades.items():
    print(f"\n{span}: {titles}\n")
    print("=" * 130)
    print("="*130)


1941-1950: ['Citizen Kane', 'Sunset Boulevard']


1951-1960: ['Rio, 40 Degrees', '12 Angry Men', 'Vertigo', 'Psycho', 'The Apartment']


1961-1970: ['The Great Escape', 'The Good, the Bad and the Ugly', 'Persona', 'Planet of the Apes', '2001: A Space Odyssey', 'Once Upon a Time in the West', 'The Red Light Bandit', 'Deep End']


1971-1980: ['The Godfather', 'The Godfather Part II', 'The Texas Chain Saw Massacre', "One Flew Over the Cuckoo's Nest", 'Dog Day Afternoon', 'The Omen', 'Taxi Driver', 'Carrie', 'The Deer Hunter', 'Halloween', 'Apocalypse Now', 'Rocky II', 'Stalker', 'The Shining', 'Inferno', 'Friday the 13th']


1981-1990: ['Blow Out', 'Thief', 'Blade Runner', 'Rocky III', 'Christine', 'Once Upon a Time in America', 'A Nightmare on Elm Street', 'Back to the Future', 'After Hours', 'RoboCop', 'Die Hard', 'Big', 'They Live', 'Back to the Future Part II', 'The Godfather Part III', 'Back to the Future Part III']


1991-2000: ['The Silence of the Lambs', 'Cape Fear', 'Reservoir D

Number of movies by decade

In [10]:
# Map each decade span to the number of titles it holds.
number_decade = {span: len(titles) for span, titles in decades.items()}

for span, total in number_decade.items():
    print(f"{span}: {total} movies")

1941-1950: 2 movies
1951-1960: 5 movies
1961-1970: 8 movies
1971-1980: 16 movies
1981-1990: 16 movies
1991-2000: 26 movies
2001-2010: 19 movies
2011-2020: 35 movies
2021-2025: 63 movies


Selecting the movies of a specific decade

In [11]:
def films_by_decade(dict, year):
    """Return the movies stored for the decade that ``year`` labels.

    ``year`` is treated as a decade label: any value from 1940 to 1949 selects the
    bucket covering 1941-1950, matching keys such as ``"1941-1950"``. The bucket is
    found by its numeric start year, so a shortened first bucket such as
    ``"1944-1950"`` is still matched.

    Args:
        dict: Mapping of ``"<first>-<last>"`` keys to lists of movie names.
            (The name shadows the builtin; it is kept for existing callers.)
        year: Any year inside the wanted decade, e.g. 1940 or 1945.

    Returns:
        The list stored for the matching bucket, or an explanatory string when no
        bucket matches. Callers must check for the string before using the result.
    """
    decade_first = year - year % 10 + 1
    decade_last = decade_first + 9

    for key, films in dict.items():
        try:
            bucket_start = int(str(key).split("-", 1)[0])
        except ValueError:
            # Not a "<first>-<last>" key; it cannot describe a decade bucket.
            continue
        if decade_first <= bucket_start <= decade_last:
            return films

    return f"Invalid year or movies not found for {decade_first} decade"

Choosing a film randomly using decade

In [17]:
import random

# Decade to draw from, given as its label (1940 selects the bucket starting in 1941).
chosen_decade = 1940
candidates = films_by_decade(decades, chosen_decade)

if isinstance(candidates, str):
    # films_by_decade reports a failed lookup as a message string; passing it to
    # random.choice would pick a single character of that message.
    print(candidates)
elif not candidates:
    # random.choice raises IndexError on an empty sequence.
    print(f"No movies found for the {chosen_decade} decade")
else:
    print(f"The movie is => {random.choice(candidates)}")

The movie is => Citizen Kane


Top 5 directors in watchlist

In [18]:
# separate_topics splits each cell on ", " before counting, so a cell listing
# several directors is credited to each of them (a plain value_counts() on the
# column would count every distinct cell string instead).
top_directors = separate_topics(watchlist_df, "Director")
for director in top_directors:
    print(f"{director} -> {top_directors[director]} movies")

Martin Scorsese -> 8 movies
Michael Bay -> 6 movies
Quentin Tarantino -> 5 movies
Ridley Scott -> 4 movies
Francis Ford Coppola -> 4 movies


Top 5 years in watchlist

In [19]:
# The five release years with the most titles, most frequent first.
top_years = watchlist_df["Year"].value_counts().head(5)
print(top_years.to_string())

Year
2024    22
2023    13
2025    12
2022    10
2018     7


In [20]:
# Most frequent genres in the watchlist, most common first.
top_genres = separate_topics(watchlist_df, "Genre")
for genre in top_genres:
    print(f"{genre} -> {top_genres[genre]} movies")

Drama -> 121 movies
Crime -> 54 movies
Comedy -> 44 movies
Action -> 43 movies
Thriller -> 39 movies


Top 5 actors in watchlist

In [21]:
# Most frequent actors in the watchlist, most common first.
top_actors = separate_topics(watchlist_df, "Actors")
for actor in top_actors:
    print(f"{actor} -> {top_actors[actor]} movies")

Robert De Niro -> 8 movies
Leonardo DiCaprio -> 7 movies
Tom Hanks -> 6 movies
Sylvester Stallone -> 4 movies
Keanu Reeves -> 4 movies


Top 5 Countries in watchlist

In [22]:
# Most frequent production countries in the watchlist, most common first.
top_countries = separate_topics(watchlist_df, "Country")
for country in top_countries:
    print(f"{country} -> {top_countries[country]} movies")

United States -> 169 movies
United Kingdom -> 31 movies
Canada -> 15 movies
France -> 15 movies
China -> 7 movies


Top 5 languages in watchlist

In [23]:
# Most frequent languages in the watchlist, most common first.
top_languages = separate_topics(watchlist_df, "Language")
for language in top_languages:
    print(f"{language} -> {top_languages[language]} movies")

English -> 179 movies
Spanish -> 37 movies
French -> 32 movies
German -> 23 movies
Italian -> 21 movies
