In this notebook we take a number of lists from Letterboxd (a movie-list website), parse their web pages, and combine them into a single master list in which no film appears more than once.

The master list is intended as a checklist: as I download the films from Piratebay, I can use it to keep track of which films have already been downloaded.

In [79]:
import csv
from pathlib import Path
from time import sleep

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Folder holding the input and output CSV files. It is resolved against the
# current user's home directory rather than a hard-coded /Users/<name>/ path,
# so the notebook also runs on other machines; edit DATA_DIR to relocate both files.
DATA_DIR = Path.home() / 'Documents' / 'Web-Scraping-Letterboxd'

# Input: CSV with a `links` column of Letterboxd list URLs.
list_of_lists_path = str(DATA_DIR / 'letterboxd-film-lists.csv')
# Output: CSV that the master film list is written to.
master_list_csv_path = str(DATA_DIR / 'MASTER-film-lists.csv')

In [36]:
# Load the CSV of Letterboxd list URLs into a DataFrame.
df = pd.read_csv(list_of_lists_path)

In [10]:
# Preview the first rows to check that the `links` column loaded as expected.
print(df.head())

                                               links
0  https://letterboxd.com/purecinema1/list/brazil...
1  https://letterboxd.com/ivica_pusticki/list/fin...
2  https://letterboxd.com/ivica_pusticki/list/fin...
3  https://letterboxd.com/ivica_pusticki/list/fin...
4  https://letterboxd.com/ivica_pusticki/list/fin...


### Links Array

In [12]:
# Pull the `links` column out of the DataFrame as a plain Python list.
links = df['links'].tolist()

### Function that takes a list link and returns its films

In [38]:
def get_movies(link : str, delay : float = 5, timeout : float = 30) -> list[list[str]]:
    """
    Fetch a Letterboxd list page and return its films as [title, year] pairs.

    Only the single page at `link` is requested; if a list is split over
    several pages, the further pages are not followed.

    Parameters
    ----------
    link : URL of the Letterboxd list page to scrape.
    delay : Seconds to sleep before returning, which spaces out consecutive
        calls. Defaults to 5.
    timeout : Seconds to wait for the server before giving up. Defaults to 30.

    Returns
    -------
    A list with one [title, year] entry (both strings) for every
    <li class="film-detail"> element found on the page, in page order.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or returns an HTTP error status.
    IndexError
        If a film entry contains fewer than two <a> elements.
    """
    # A timeout stops a stalled connection from hanging the notebook forever.
    page = requests.get(link, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into an empty list.
    page.raise_for_status()

    soup = BeautifulSoup(page.content, "html.parser")

    films_from_page = []
    for film_html in soup.find_all("li", class_="film-detail"):
        anchors = film_html.find_all('a')
        # The first link's text is used as the title and the second as the year.
        films_from_page.append([anchors[0].text, anchors[1].text])

    # Pause before returning so back-to-back calls are spaced out.
    sleep(delay)

    return films_from_page

In [41]:
def get_movies_str(link : str) -> list[str]:
    """
    Fetch a Letterboxd list page and return its films as "title, year" strings.

    The page is scraped by `get_movies`, so the request, the HTML parsing and
    the pause after the request live in one place. This function only joins
    each [title, year] pair into a single string; strings are hashable, so the
    result can be placed in a set.

    Parameters
    ----------
    link : URL of the Letterboxd list page to scrape.

    Returns
    -------
    A list with one "title, year" string per film found on the page, in page
    order.
    """
    return [f"{title}, {year}" for title, year in get_movies(link)]

In [34]:
# Test: scrape the first list and inspect the returned [title, year] pairs.
get_movies(links[0])

[['City of God', '2002'],
 ['Pixote', '1980'],
 ['Central Station', '1998'],
 ['Aquarius', '2016'],
 ['Bacurau', '2019'],
 ['Black God, White Devil', '1964'],
 ['Entranced Earth', '1967'],
 ['Neighboring Sounds', '2012'],
 ["A Dog's Will", '2000'],
 ['The Second Mother', '2015'],
 ['Limite', '1931'],
 ['Behind the Sun', '2001'],
 ['The Given Word', '1962'],
 ['Barren Lives', '1963'],
 ['São Paulo, S.A.', '1965'],
 ['Twenty Years Later', '1984'],
 ['Invisible Life', '2019'],
 ["At Midnight I'll Take Your Soul", '1964'],
 ['The Red Light Bandit', '1968'],
 ['Elite Squad', '2007'],
 ['Boy & the World', '2013'],
 ['Neon Bull', '2015'],
 ["They Don't Wear Black Tie", '1981'],
 ['Kiss of the Spider Woman', '1985'],
 ['Isle of Flowers', '1989'],
 ['Mango Yellow', '2002'],
 ['Playing', '2007'],
 ['Good Manners', '2017'],
 ['Macunaima', '1969'],
 ['Dona Flor and Her Two Husbands', '1976'],
 ['Lúcio Flávio, the Passenger of the Agony', '1977'],
 ['Four Days in September', '1997'],
 ['Madame Satã

### Making Master Movie List

The master movie list is built by taking the film list from each web page and merging them into a single list that contains no duplicates.

In [37]:
def remove_duplicates(master_list : list[str], new_list : list[str]) -> list[str]:
    """
    Merge `new_list` into `master_list`, keeping only the first copy of each film.

    The result keeps the items of `master_list` in their original order,
    followed by the items of `new_list` that were not seen before, also in
    their original order. Repeats inside either input are dropped as well.
    Neither input list is modified.

    Parameters
    ----------
    master_list : Films collected so far.
    new_list : Films to add.

    Returns
    -------
    A new list containing every distinct film exactly once. Items may be
    strings or [title, year] lists.
    """
    seen = set()
    merged = []

    for films in (master_list, new_list):
        for film in films:
            # Lists are unhashable, so a [title, year] pair is tracked by an
            # equivalent tuple while the original item is what gets returned.
            key = tuple(film) if isinstance(film, list) else film
            if key not in seen:
                seen.add(key)
                merged.append(film)

    return merged

In [43]:
# Scrape the first two lists as "title, year" strings to try out the merge.
test1 = get_movies_str(links[0])
test2 = get_movies_str(links[1])

In [44]:
# Merge the second scraped list into the first one.
master_list = remove_duplicates(test1, test2)

In [46]:
# Number of films after merging the two test lists.
print(len(master_list))

195


In [51]:
def links_csv_2_master_list_letterboxd(links : list[str]) -> list[str]:
    """
    Scrape every Letterboxd list in `links` and merge the films into one list.

    The films of the first link seed the master list; every further link is
    scraped with `get_movies_str` and merged in with `remove_duplicates`.

    Parameters
    ----------
    links : URLs of the Letterboxd list pages to scrape.

    Returns
    -------
    The merged list of "title, year" strings, or an empty list when `links`
    is empty.
    """
    # Nothing to scrape: return early instead of failing on the first link.
    if len(links) == 0:
        return []

    first_link, *other_links = links
    master_list = get_movies_str(first_link)

    for link in other_links:
        master_list = remove_duplicates(master_list, get_movies_str(link))

    return master_list

In [61]:
# Build the master list from every link in the CSV. Each link triggers a web
# request, so this cell takes a while to run.
master_list = links_csv_2_master_list_letterboxd(links)

In [62]:
# Total number of films in the master list.
print(len(master_list))

1159


In [63]:
# Put the "title, year" strings in sorted order, then show the whole list.
master_list = sorted(master_list)

print(master_list)

['(500) Days of Summer, 2009', '12 Angry Men, 1957', '12, 2007', '13th, 2016', '2001: A Space Odyssey, 1968', '20th Century Women, 2016', '3-Iron, 2004', '300, 2006', '4 Months, 3 Weeks and 2 Days, 2007', '8½, 1963', 'A Christmas Story, 1983', 'A Clockwork Orange, 1971', 'A Countess from Hong Kong, 1967', "A Dog's Will, 2000", 'A Fistful of Dollars, 1964', 'A Generation, 1955', 'A Girl Walks Home Alone at Night, 2014', 'A Guide to Recognizing Your Saints, 2006', "A Hard Day's Night, 1964", 'A Hard Day, 2014', 'A Heart in Winter, 1992', 'A Night to Remember, 1958', 'A Nightmare on Elm Street, 1984', 'A Page of Madness, 1926', 'A Pigeon Sat on a Branch Reflecting on Existence, 2014', 'A Serious Man, 2009', 'A Shot in the Dark, 1964', 'A Silent Voice, 2016', 'A Simple Plan, 1998', 'A Town Called Panic, 2009', 'Ace Ventura: Pet Detective, 1994', 'Ace in the Hole, 1951', 'Adaptation., 2002', 'Aelita: Queen of Mars, 1924', 'After Eight... Forever, 1987', 'Aguirre, the Wrath of God, 1972', 'A

In [80]:
# Turn each "title, year" string back into a [title, year] row.
# Splitting once from the right on ", " keeps commas inside a title intact and,
# unlike fixed character offsets, does not assume the year is 4 characters long.
master_list_new = [film.rsplit(", ", 1) for film in master_list]

In [77]:
# Scratch check of the slicing on the first entry: the last 4 characters are
# taken as the year, and everything except the last 6 characters as the title.
row_year = master_list[0][-4:]
row_title = master_list[0][:-6]

In [78]:
# Show the sliced year and title of the first entry.
print(row_year)
print(row_title)

2009
(500) Days of Summer


In [81]:
# Write the master list to the output CSV, one [title, year] row per film.
# newline='' is what the csv module asks for so that it controls line endings
# itself; utf-8 keeps titles with accented or special characters intact
# regardless of the platform's default encoding.
with open(master_list_csv_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerows(master_list_new)