In [25]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

### Get number of films in a watchlist

In [26]:
def get_watchlist_count(username):
    page_url = f"https://letterboxd.com/{username}/watchlist/"
    response = requests.get(page_url)
    if response.status_code == 200:
        # get soup object
        soup = BeautifulSoup(response.text, 'html.parser')

        # get number of total films in watchlist
        watchlist_count_element = soup.find('span', class_='js-watchlist-count')
        if watchlist_count_element:
            watchlist_count_text = watchlist_count_element.text
            watchlist_count = int(watchlist_count_text.split()[0])
    return watchlist_count


### Get movie titles in a watchlist

In [27]:
# Function to extract movie titles from a Letterboxd watchlist URL
def get_movie_titles(username):

    movie_titles = []

    # get 1st page
    page_url = f"https://letterboxd.com/{username}/watchlist/"
    response = requests.get(page_url)
    if response.status_code == 200:
        # get soup object
        soup = BeautifulSoup(response.text, 'html.parser')

        # get movie titles on 1st page
        movies = soup.find_all('div', class_='film-poster')
        for movie in movies:
            # get movie title
            title = movie['data-film-slug']
            # append movie title to list
            movie_titles.append(title)

    # get movies on pages 2-end
    # 1st watchlist page uses the watchlist url... 
    # 2nd page and beyond uses the watchlist url + page number
    page = 2
    while True:
        page_url = f"https://letterboxd.com/{username}/watchlist/page/{page}/"
        response = requests.get(page_url)
        if response.status_code == 200:
            # get soup object
            soup = BeautifulSoup(response.text, 'html.parser')

            # get movie titles on this page
            movies = soup.find_all('div', class_='film-poster')
            if not movies:
                break  # no movies on this page
            
            # get movies on this page
            for movie in movies: 
                # get movie title
                title = movie['data-film-slug']
                # append movie title to list
                movie_titles.append(title)
            page += 1
        else:
            break  # Error or no more pages
    return movie_titles

### Get only movie titles in all selected watchlists

In [28]:
def get_common_movies_list(usernames):
    all_movies_list = []

    for user in usernames:
        count = get_watchlist_count(user)
        watchlist = get_movie_titles(user)
        # make sure correct # in watchlist
        assert count == len(watchlist), f"ERORR: something is wrong about number of items in watchlist - {count} != {len(watchlist)}"
        all_movies_list.append(watchlist)

    
    common_movies = set(all_movies_list[0])
    for movies_list in all_movies_list[1:]:
        common_movies = common_movies.intersection(movies_list)
    return list(common_movies)



### Get data for each movie in a shared watchlist

In [29]:
def get_movie_stats(data_film_slug):
    # Construct the movie URL using the film-id
    movie_url = f"https://letterboxd.com/film/{data_film_slug}/"

    # Send a GET request to the movie URL
    response = requests.get(movie_url)

    if response.status_code == 200:
        # Parse the HTML content of the movie page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the relevant meta tags
        director_tag = soup.find('meta', {'name': 'twitter:data1'})
        rating_tag = soup.find('meta', {'name': 'twitter:data2'})
        og_title_tag = soup.find('meta', {'property': 'og:title'})
        duration_tag = soup.find('p', class_='text-link text-footer')

        # Extract the content from the meta tags
        director = director_tag.get('content') if director_tag else None
        rating = float(rating_tag.get('content').split(' ')[0]) if rating_tag else None
        title = og_title_tag.get('content').split('(')[0] if og_title_tag else None
        year = og_title_tag.get('content').split('(')[-1].strip(')') if og_title_tag else None
        duration = duration_tag.text.strip().split()[0]

    else:
        print("Error: Failed to retrieve movie stats.")
    return director, rating, title, year, duration

### Make a df with data for all shared movies

In [30]:
def get_common_movies_df(common_movies_list):
    common_movies_df = pd.DataFrame(columns=['Title', 'Rating', 'Director', 'Year', 'Duration (mins)'])
    # Iterate through the common_movies list
    for movie in common_movies_list:
        # Get the movie stats using the get_movie_stats function
        director, rating, title, year, duration = get_movie_stats(movie)
        
        # Append the movie stats to the DataFrame
        common_movies_df = pd.concat([common_movies_df, pd.DataFrame([[title, rating, director, year, duration]], columns=['Title', 'Rating', 'Director', 'Year', 'Duration (mins)'])])
    # Print the DataFrame
    common_movies_df.reset_index()
    common_movies_df_sorted = common_movies_df.sort_values(by='Rating', ascending=False)
    common_movies_df_sorted.reset_index(drop=True, inplace=True)
    return common_movies_df_sorted


### Get df

In [31]:
num_users = input("Enter the number of users you want to compare: ")
users = []
for i in range(int(num_users)):
    user = input(f"Enter username{i+1}: ")
    users.append(user)

In [32]:
common_movies = get_common_movies_list(users)
df = get_common_movies_df(common_movies)

usernames = "_".join(users)
filename = f"{usernames}_common_movies.csv"
df.to_csv(filename)
df

  common_movies_df = pd.concat([common_movies_df, pd.DataFrame([[title, rating, director, year, duration]], columns=['Title', 'Rating', 'Director', 'Year', 'Duration (mins)'])])


Unnamed: 0,Title,Rating,Director,Year,Duration (mins)
0,Stalker,4.43,Andrei Tarkovsky,1979,162
1,Memories of Murder,4.41,Bong Joon-ho,2003,131
2,In the Mood for Love,4.41,Wong Kar-wai,2000,99
3,The Pianist,4.36,Roman Polanski,2002,150
4,Poor Things,4.3,Yorgos Lanthimos,2023,141
5,Aftersun,4.25,Charlotte Wells,2022,102
6,The King of Comedy,4.24,Martin Scorsese,1982,109
7,Dead Poets Society,4.23,Peter Weir,1989,128
8,My Neighbor Totoro,4.19,Hayao Miyazaki,1988,86
9,"Synecdoche, New York",4.19,Charlie Kaufman,2008,124
