# Letterboxd Scraping

In [1]:
# Import necessary libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time

## Generate Username list

This function generates a list of the users that follow the chosen user. It scrapes up to `pages_to_scrape` pages of follower usernames from Letterboxd.

In [None]:
def users_list(username, pages_to_scrape):
    '''Return the users that follow ``username`` on Letterboxd.

    Follower pages 1 through ``pages_to_scrape`` (inclusive) are requested
    one after another. Page 1 is always requested, even when
    ``pages_to_scrape`` is smaller than 1.

    Args:
        username: Letterboxd username whose followers are collected.
        pages_to_scrape: Number of follower pages to scrape, as an int.

    Returns:
        A list of usernames with leading and trailing '/' removed. The
        given ``username`` itself is the last element of the list.

    Side effects:
        Prints the number of usernames collected and the run time in
        seconds.
    '''
    start = time.time()
    followers_url = 'https://letterboxd.com/' + username + '/followers/page/'

    users = []
    # range() excludes its stop value, so the stop is one past the final page.
    for page in range(1, max(pages_to_scrape, 1) + 1):
        html_page = requests.get(followers_url + str(page))
        soup = BeautifulSoup(html_page.content, 'html.parser')

        # Every follower is listed in a 'person-summary' div; the href of its
        # avatar link is collected as the raw username.
        users.extend(person.find('a', class_='avatar -a40')['href']
                     for person in soup.find_all('div', class_='person-summary'))

    # Add original username to list
    users.append(username)

    # Strip the '/' from the usernames
    users = [elem.strip('/') for elem in users]
    print(f'number of usernames: {len(users)}')

    # Print run time
    end = time.time()
    print(f'time to run: {round((end-start), 2)}')
    return users

My chosen user is 'fuchsiadyke' because they are followed by over 16,000 users, which ensures that all 100 requested pages of usernames are full.

In [None]:
# Collect the follower usernames of 'fuchsiadyke', asking for 100 pages
users = users_list(username='fuchsiadyke', pages_to_scrape=100)

In [None]:
# Peek at the first ten usernames that were collected
users[:10]

## Generate Rating DataFrame 

The following function takes a username and gathers all the rating information for the movies that user has marked as seen on Letterboxd.

In [None]:
def _stars_to_rating(stars):
    '''Convert the text of a star rating to a number.

    The rating is the length of the text (one per character), minus 0.5 when
    the last character is '½'. Returns None when ``stars`` is None or empty,
    i.e. when no rating was found for the film.
    '''
    if not stars:
        return None
    if stars[-1] == '½':
        return len(stars) - 0.5
    return len(stars)


def _ratings_page_frame(soup, username):
    '''Build a film_id / rating / username dataframe from one parsed page.'''
    posters = soup.find_all('li', class_='poster-container')

    frame = pd.DataFrame(
        [{'film_id': poster.find('div', class_='film-poster')['data-film-id']}
         for poster in posters])

    ratings = []
    for poster in posters:
        # Posters without a rating span get a rating of None.
        rating_tag = poster.find('span', class_='rating')
        ratings.append(
            _stars_to_rating(rating_tag.text) if rating_tag is not None else None)

    frame['rating'] = ratings
    frame['username'] = username
    return frame


def user_rating_3_col(username):
    '''Scrape the films a user has marked as seen on Letterboxd.com.

    Args:
        username: Letterboxd username whose film pages are scraped.

    Returns:
        A dataframe with three columns: film_id, rating and username. The
        rating is None/NaN for films without a star rating. Each scraped
        page keeps its own row index, so index values repeat across pages.

    Raises:
        ValueError: if the text of the last pagination entry is not an
            integer.
    '''
    films_url = 'https://letterboxd.com/' + username + '/films/page/'

    # Scrape the first page of films watched by username
    html_page = requests.get(films_url + '1')
    soup = BeautifulSoup(html_page.content, 'html.parser')

    # Find the last page number of films watched by username; when the page
    # has no pagination block there is only a single page.
    last_page = 1
    pagination = soup.find('div', class_='paginate-pages')
    if pagination is not None:
        page_items = pagination.find_all('li', class_='paginate-page')
        if page_items:
            last_page = int(page_items[-1].text)

    frames = [_ratings_page_frame(soup, username)]

    # Scrape remaining pages
    for page in range(2, last_page + 1):
        html_page = requests.get(films_url + str(page))
        soup = BeautifulSoup(html_page.content, 'html.parser')
        frames.append(_ratings_page_frame(soup, username))

    # DataFrame.append was removed in pandas 2.0; concatenate all pages once.
    return pd.concat(frames)
        

In [None]:
# A small sample of usernames for checking that the scraping function works
# before running it on every username
test_users = """
    ingloriousbasta eldodo tldr_com yangforyin iutub
    willmsfilms rockz cae_des ca2ba2 nischal170 javidog
    irokill ithacuss ngcaihui42 mrireilly danyalahmed
    bortex travishenderson manuelcouto sunsetsofie nataliedc
    badsioop thewatchmakers tashk redgravehepburn sadfrog23
    sargy7 kna1223 janvite privateidahos mirels dovegirl
    adamcbest sergioaudelo muzwot ethanjame tylerharris
    bugix stephensboyer kamikazegirls alexisthegay waster
    seymacetin arent panizzz phillitj midnightnostalg
    rkrespin spikeydlux sbernstein9 cryptidpeep lenilinden
    jomes groenbaek elisabethmcl jasonmcghan lostasterisk
    fridge_lp chiarahp jmitchell67 dpen42 dubsdeedubs
    piaescobar jgaffney jayhayes05 beatrixralph1
    mickeemouser 12monkeys
""".split()

In [None]:
# Run user_rating_3_col for every username and combine the results into one
# dataframe of ratings
start = time.time()

frames = []
for user in users:
    try:
        frames.append(user_rating_3_col(user))
    except Exception as err:
        # Best effort: a single user that cannot be scraped must not abort
        # the whole run, but the failure is reported instead of hidden.
        # KeyboardInterrupt is not caught, so the run can still be stopped.
        print(f'Skipping {user}: {err!r}')

# DataFrame.append was removed in pandas 2.0; concatenating once at the end
# also avoids re-copying the growing dataframe on every iteration.
if frames:
    df = pd.concat(frames)
else:
    df = pd.DataFrame(columns=['film_id'])

end = time.time()

print(f'This took {round((end - start), 2)} seconds to run')

In [None]:
# Quick look to make sure it worked as intended
df.info()

In [None]:
# Inspect the last rows of the ratings dataframe
df.tail()

In [None]:
# Put the columns in the order the models expect: user, film, rating
df = df.loc[:, ['username', 'film_id', 'rating']]

In [None]:
# Write the ratings dataframe to a .csv file, leaving out the row index
df.to_csv(path_or_buf=r'panda_dataframes/user_rating_3col.csv', index=False)

In [None]:
# Preview the first rows of the ratings dataframe
df.head()

## Create DataFrame for Film Information

In [3]:
def film_names(last_page):
    """Scrape film names from the popular-films listing on letterboxd.com.

    Pages 1 through ``last_page`` (inclusive) are requested one after
    another. Page 1 is always requested, even when ``last_page`` is smaller
    than 1.

    Args:
        last_page: Number of the last listing page to scrape, as an int.

    Returns:
        A dataframe with the columns film_name, lb_link (full letterboxd
        URL of the film) and lb_id (letterboxd film id). Each scraped page
        keeps its own row index, so index values repeat across pages.

    Side effects:
        Prints the run time in seconds.
    """
    start = time.time()
    listing_url = 'https://letterboxd.com/films/ajax/popular/size/small/page/'

    frames = []
    # range() excludes its stop value, so the stop is one past the last page.
    for page in range(1, max(last_page, 1) + 1):
        html_page = requests.get(f'{listing_url}{page}/')
        soup = BeautifulSoup(html_page.content, 'html.parser')

        movies = []
        for movie in soup.find_all('li', class_='listitem poster-container'):
            # The 'frame' link carries both the title and the film's path.
            frame_link = movie.find('a', class_='frame')
            movies.append({
                'film_name': frame_link['title'],
                'lb_link': 'https://letterboxd.com' + frame_link['href'],
                'lb_id': movie.find('div')['data-film-id'],
            })
        frames.append(pd.DataFrame(movies))

    # DataFrame.append was removed in pandas 2.0; concatenate all pages once.
    df_film = pd.concat(frames)

    end = time.time()
    print(f'This took {round((end - start), 2)} seconds to run')

    return df_film

In [7]:
# Scrape the popular-films listing, passing 3000 as the last page
df_film = film_names(last_page=3000)

This took 2.03 seconds to run


In [8]:
# Count the distinct Letterboxd film ids that were scraped
df_film['lb_id'].nunique()

144

In [33]:
def _film_page_details(lb_link):
    """Scrape one film page and return (director, tmdb_id, movie_tv).

    Any value that cannot be found on the page is returned as None.
    """
    html_page = requests.get(lb_link)
    film = BeautifulSoup(html_page.content, 'html.parser')

    # The TMDb link's href is split on '/'; the second-to-last piece is used
    # as the id and the piece before it as the movie/tv marker.
    tmdb_id = movie_tv = None
    tmdb_link = film.find('a', {'data-track-action': 'TMDb'})
    if tmdb_link is not None:
        href_parts = tmdb_link['href'].split('/')
        movie_tv = href_parts[-3]
        tmdb_id = href_parts[-2]

    # The director is taken from the first 'text-slug' link inside the
    # tabbed content block.
    director = None
    content_block = film.find('div', class_='tabbed-content-block column-block')
    if content_block is not None:
        director_link = content_block.find('a', class_='text-slug')
        if director_link is not None:
            director = director_link.text

    return director, tmdb_id, movie_tv


def add_dir_tmdb(df):
    """Add director, TMDb id and movie/TV columns to a film_names dataframe.

    Every link in the 'lb_link' column of ``df`` is requested and parsed.
    A film page without a TMDb link or director gets None in the
    corresponding column instead of aborting the run.

    Args:
        df: Dataframe with an 'lb_link' column, as returned by film_names.

    Returns:
        The same dataframe ``df``, modified in place, with the added columns
        director, tmdb_id and movie_tv.

    Side effects:
        Prints the run time in seconds. A full run was recorded as taking
        53036.17 seconds.
    """
    start = time.time()

    directors = []
    tmdb_ids = []
    movie_or_tv = []

    # Use the dataframe that was passed in rather than the global df_film.
    for lb_link in df['lb_link']:
        director, tmdb_id, movie_tv = _film_page_details(lb_link)
        directors.append(director)
        tmdb_ids.append(tmdb_id)
        movie_or_tv.append(movie_tv)

    end = time.time()

    df['director'] = directors
    df['tmdb_id'] = tmdb_ids
    df['movie_tv'] = movie_or_tv

    print(f'This took {round((end - start), 2)} seconds to run')

    return df

In [34]:
# Add the director, TMDb id and movie/TV columns, then display the result
df_film = add_dir_tmdb(df_film)
df_film

This took 97.88 seconds to run


Unnamed: 0,film_name,lb_id,lb_link,tmdb_id,movie_tv,director
0,Parasite (2019),426406,https://letterboxd.com/film/parasite-2019/,496243,movie,Bong Joon-ho
1,Joker (2019),406775,https://letterboxd.com/film/joker-2019/,475557,movie,Todd Phillips
2,Knives Out (2019),475370,https://letterboxd.com/film/knives-out-2019/,546554,movie,Rian Johnson
3,Pulp Fiction (1994),51444,https://letterboxd.com/film/pulp-fiction/,680,movie,Quentin Tarantino
4,Inception (2010),34722,https://letterboxd.com/film/inception/,27205,movie,Christopher Nolan
5,Get Out (2017),353117,https://letterboxd.com/film/get-out-2017/,419430,movie,Jordan Peele
6,Lady Bird (2017),326279,https://letterboxd.com/film/lady-bird/,391713,movie,Greta Gerwig
7,Once Upon a Time… in Hollywood (2019),397859,https://letterboxd.com/film/once-upon-a-time-i...,466272,movie,Quentin Tarantino
8,Spider-Man: Into the Spider-Verse (2018),251943,https://letterboxd.com/film/spider-man-into-th...,324857,movie,Rodney Rothman
9,Midsommar (2019),459564,https://letterboxd.com/film/midsommar/,530385,movie,Ari Aster


In [None]:
# Add year to dataframe. The film names end in '(YYYY)', so the four
# characters before the closing parenthesis are the release year (kept as a
# string). Building the list in one pass also works for an empty dataframe.
df_film['release_year'] = [name[-5:-1] for name in df_film['film_name']]

In [None]:
# Drop the last 7 characters (the ' (YYYY)' suffix) from every title

df_film['film_name'] = df_film['film_name'].str.slice(stop=-7)
df_film.head()

In [None]:
# Rearrange the columns. The year and director columns are named
# 'release_year' and 'director' in this dataframe; selecting 'Year' or
# 'Director' raises a KeyError.
df_film = df_film[['film_name', 'lb_id', 'lb_link', 'tmdb_id', 'movie_tv',
                   'release_year', 'director']]

In [None]:
# Write the film dataframe to a .csv file, leaving out the row index
df_film.to_csv(path_or_buf=r'panda_dataframes/letterboxd_film_data_director.csv',
               index=False)

## Create Individual User DataFrames

In [37]:
def _star_text_to_number(stars):
    '''Convert the text of a star rating to a number.

    The rating is the length of the text (one per character), minus 0.5 when
    the last character is '½'. Returns None when ``stars`` is None or empty.
    '''
    if not stars:
        return None
    if stars[-1] == '½':
        return len(stars) - 0.5
    return len(stars)


def _films_page_frame(soup):
    '''Build a dataframe of films and user ratings from one parsed page.'''
    rows = []
    user_rating = []

    for poster in soup.find_all('li', class_='poster-container'):
        film = poster.find('div', class_='film-poster')
        rows.append({
            'film_id': film['data-film-id'],
            'film_name': poster.find('img', class_='image')['alt'],
            'link': "https://letterboxd.com" + film['data-target-link'],
        })

        # Posters without a rating span get a rating of None.
        rating_tag = poster.find('span', class_='rating')
        user_rating.append(
            _star_text_to_number(rating_tag.text) if rating_tag is not None else None)

    frame = pd.DataFrame(rows)
    frame['user_rating'] = user_rating
    return frame


def make_user_df(username):
    '''Scrape every film a user has marked as seen on Letterboxd.com.

    Args:
        username: Letterboxd username whose film pages are scraped.

    Returns:
        A dataframe with the columns film_id, film_name, link and
        user_rating, indexed 0..n-1 across all pages. user_rating is
        None/NaN for films without a star rating.

    Raises:
        ValueError: if the text of the last pagination entry is not an
            integer.

    Side effects:
        Prints the run time in seconds.
    '''
    start = time.time()
    films_url = 'https://letterboxd.com/' + username + '/films/page/'

    # Scrape the first page of films watched by username
    html_page = requests.get(films_url + '1')
    soup = BeautifulSoup(html_page.content, 'html.parser')

    # Find the last page number of films watched by username. A page without
    # a pagination block means there is only one page, as in
    # user_rating_3_col.
    last_page = 1
    pagination = soup.find('div', class_='paginate-pages')
    if pagination is not None:
        page_items = pagination.find_all('li', class_='paginate-page')
        if page_items:
            last_page = int(page_items[-1].text)

    frames = [_films_page_frame(soup)]

    # Scrape remaining pages
    for page in range(2, last_page + 1):
        html_page = requests.get(films_url + str(page))
        soup = BeautifulSoup(html_page.content, 'html.parser')
        frames.append(_films_page_frame(soup))

    # DataFrame.append was removed in pandas 2.0; concatenate all pages once.
    df = pd.concat(frames, ignore_index=True)

    end = time.time()

    print(f'This took {round((end - start), 2)} seconds to run')

    return df

In [38]:
# Test make_user_df on the username 'creepergnome' and preview the first rows
creepergnome = make_user_df('creepergnome')
creepergnome.head()

This took 36.14 seconds to run


Unnamed: 0,film_id,film_name,link,user_rating
0,415620,Coming 2 America,https://letterboxd.com/film/coming-2-america/,3.0
1,494480,The United States vs. Billie Holiday,https://letterboxd.com/film/the-united-states-...,3.0
2,515466,Tom & Jerry,https://letterboxd.com/film/tom-jerry-2021/,2.0
3,511342,Judas and the Black Messiah,https://letterboxd.com/film/judas-and-the-blac...,4.0
4,671813,WandaVision,https://letterboxd.com/film/wandavision/,4.0


In [None]:
# Look up the rows whose film name is exactly 'Parasite'
creepergnome.loc[creepergnome['film_name']=='Parasite']

In [None]:
# Preview the first rows of this user's dataframe
creepergnome.head()

In [None]:
# Count the distinct film ids in this user's dataframe
creepergnome['film_id'].nunique()

In [None]:
# Print a summary of this user's dataframe
creepergnome.info()

In [None]:
# # Test for username = rockthrowingman
# rockthrowingman = make_user_df('rockthrowingman')
# rockthrowingman.head()

In [None]:
# rockthrowingman.tail()

In [None]:
# rockthrowingman.info()

In [None]:
# 1000 or so users
# 

# tables => username/ID
# movie info
# Ratings

# Create SQL Table

In [None]:
# Load the ratings and film dataframes from their saved .csv files
df_rating = pd.read_csv('panda_dataframes/user_rating_3col.csv')
df_film = pd.read_csv('panda_dataframes/letterboxd_film_data_director.csv')

In [None]:
# Double check the columns of the film data frame
df_film.columns

In [None]:
# Double check the columns of the ratings data frame
df_rating.columns

In [None]:
# Import create_engine and create an engine for the SQLite database file
# 'letterboxd.db'
from sqlalchemy import create_engine

engine = create_engine('sqlite:///letterboxd.db', echo=True) 

In [None]:
# Helper that stores a dataframe as a table in our database
def create_sql_table(df, table_name, engine):
    """Write ``df`` to the database behind ``engine`` as ``table_name``.

    An existing table with the same name is replaced.
    """
    df.to_sql(name=table_name, con=engine, if_exists='replace')

In [None]:
# Add the ratings table to the database
create_sql_table(df_rating, 'ratings', engine)

In [None]:
# Add the films table to the database
create_sql_table(df_film, 'films', engine)