# Grace Techau
## Box Office Revenue & Letterboxd Ratings Project 
### Scraping Letterboxd Website 2017 Movies

**Scraping the elements title, year, number_ratings, average_rating, length, and genres for the top 25% most popular Letterboxd movies of 2017, with the 'Hide short films' filter applied.** 

In [None]:
# import all required packages 
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By 
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time 
import random

In [None]:
# define random scroll function 
def random_scroll(browser, total_wait_time):
    """Scroll the page down in a random number of equal steps with random pauses.

    The document height is read once through JavaScript and divided into
    between 3 and 10 equal increments. The window is scrolled by one increment
    at a time, pausing after each step, and finally jumps to the bottom of the
    document.

    Args:
        browser: Object exposing ``execute_script`` (the Selenium driver in
            this notebook).
        total_wait_time: Time budget in seconds shared across the steps; each
            pause is drawn uniformly from 0.5x to 1.5x of the per-step share.
    """
    page_height = browser.execute_script("return document.body.scrollHeight")
    n_steps = random.randint(3, 10)
    pixels_per_step = page_height // n_steps
    base_pause = total_wait_time / n_steps

    # The increment is fixed for the whole run, so build the script once.
    scroll_command = f"window.scrollBy(0, {pixels_per_step});"
    for _ in range(n_steps):
        browser.execute_script(scroll_command)
        time.sleep(random.uniform(0.5 * base_pause, 1.5 * base_pause))

    # Integer division can leave a remainder, so finish at the very bottom.
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

In [None]:
# initialize the Selenium web driver (using Chrome)
# Chrome options object; no flags or arguments are set on it here.
chrome_options = Options()
# ChromeDriverManager().install() supplies the value passed to Service
# (presumably the path of a downloaded/cached chromedriver binary — confirm in the webdriver_manager docs).
service = Service(ChromeDriverManager().install())
# Module-level driver instance; the scraping functions in this notebook read it as a global.
browser = webdriver.Chrome(service=service, options=chrome_options)

### YEAR 2017 - Scrape URL links to individual movie detail pages 

Create a function for applying the viewing filter 'Hide short films' to each page when scraping the individual movie page URLs from the main Letterboxd movie website.

In [None]:
def apply_filters():
    """Open the filter menu on the current page and click 'Hide short films'.

    Operates on the module-level ``browser``. Both the menu label and the
    'Hide short films' link are waited on until they are clickable (up to 20
    seconds each) before being clicked, with randomised pauses after each
    click.

    Any failure is reported with a printed message instead of being raised,
    so a scraping run is not aborted by a filter problem.

    Returns:
        bool: True if both clicks completed, False if any step raised.
    """
    menu_locator = (By.CSS_SELECTOR, "section.smenu-wrapper .smenu label")
    hide_shorts_locator = (By.XPATH, "//a[contains(text(), 'Hide short films')]")

    try:
        # element_to_be_clickable already implies the element is present and
        # visible, so no separate presence wait is needed beforehand.
        filter_button = WebDriverWait(browser, 20).until(
            EC.element_to_be_clickable(menu_locator)
        )
        filter_button.click()

        time.sleep(random.uniform(1, 3))

        # Apply the "Hide short films" filter. Wait for clickability rather
        # than mere presence: the link is only usable once the menu has opened.
        hide_short_films_button = WebDriverWait(browser, 20).until(
            EC.element_to_be_clickable(hide_shorts_locator)
        )
        hide_short_films_button.click()
        print("Clicked 'Hide short films' filter")

        # Pause after the click before the caller continues scraping.
        time.sleep(random.uniform(4, 15))

    except Exception as e:
        # Deliberately best-effort: report and let the caller decide.
        print(f"Error applying filters: {e}")
        return False

    return True

Create a function to scrape the individual movie page URLs from the main Letterboxd movie website for movies from 2017 sorted by popularity.

In [None]:
def scrape_movie_links():
    """Return the ``href`` of every ``<a class="frame">`` on the current page.

    Reads from the module-level ``browser``. Anchors whose ``href`` attribute
    is missing or empty are skipped.

    Returns:
        list: The non-empty ``href`` values, in the order the elements were
        returned by the driver. These are the links to the individual
        Letterboxd movie detail pages.
    """
    frame_anchors = browser.find_elements(By.XPATH, '//a[@class="frame"]')
    hrefs = (anchor.get_attribute('href') for anchor in frame_anchors)
    return [href for href in hrefs if href]

Create a function for scraping multiple pages of the main Letterboxd website for films in 2017 sorted by popularity. For each page, the apply_filters and scrape_movie_links functions will run. \
\
For the year 2017 there are 269 pages of movies with the 'Hide short films' filter applied. I scraped only the top quarter of these pages (66), to capture the 25% most popular movies. The 66 pages were scraped in four batches. The different batches were collected into separate CSV files which are detailed at the bottom of this page. \
\
The four CSV files will be merged during cleaning to capture the total of the 25% most popular movies from 2017. 

In [None]:
def scrape_movie_pages(start_page, end_page):
    """Scrape movie detail-page URLs from a range of 2017 popularity pages.

    For every page number from ``start_page`` to ``end_page`` (inclusive) the
    module-level ``browser`` loads the Letterboxd "popular films of 2017"
    listing page, the links on it are collected with ``scrape_movie_links``,
    and the page is scrolled with ``random_scroll``.

    Args:
        start_page: First page number to scrape.
        end_page: Last page number to scrape (inclusive).

    Returns:
        list: All URLs collected across the pages, in page order.
    """
    urls_2017 = []

    for i in range(start_page, end_page + 1):
        url_2017 = f"https://letterboxd.com/films/popular/year/2017/page/{i}/"

        browser.get(url_2017)
        browser.maximize_window()

        print(f"Scraping page {i}: {url_2017}")

        time.sleep(random.uniform(3, 5))

        # Only apply the filter on the first page of this batch; per the
        # original author's note it stays applied on the pages that follow.
        # Compare against start_page (not a fixed page number) so every batch
        # gets the filter, whatever page it starts on.
        if i == start_page:
            apply_filters()

        film_urls = scrape_movie_links()
        urls_2017.extend(film_urls)

        total_wait_time = random.uniform(5, 12)
        random_scroll(browser, total_wait_time)

        print(f"Finished scraping page {i}.")

    return urls_2017

## top 25% most popular pages: 66 pages
### raw_1 - pages 1 to 37.5
### raw_2 - pages 37.5 to 42.5
### raw_3 - pages 42.5 to 55.5
### raw_4 - pages 55.5 to 66

# Page range for this batch (inclusive on both ends).
start_page, end_page = 55, 66
urls_2017 = scrape_movie_pages(start_page, end_page)

# Summary banner: a 70-dash rule above and below the heading, then the count.
divider = "-"*70
print(divider, "Totals of URLS scraped for 2017", divider, sep="\n")
print(f"Total # URLs scraped: {len(urls_2017)}")

Modify the scraped URLs by appending the path suffix 'genres/'.\
This allows all the correct genre data to be scraped from the individual Letterboxd movie detail pages.

In [None]:
# Append the 'genres/' path segment to every scraped movie URL.
# Any trailing slash is stripped first so the result always ends in exactly
# '/genres/', even if a scraped href arrives without a trailing slash.
modifed_urls = [url.rstrip('/') + '/genres/' for url in urls_2017]

### YEAR 2017 - Scrape movie data from each movie's page  

In [None]:
# Collect one dict per movie with the keys title, year, number_ratings,
# average_rating, length, and genres for each 2017 Letterboxd URL in modifed_urls.
movie_data = []

# The window state does not change between pages, so maximise once up front.
browser.maximize_window()

for url in modifed_urls:
    try:
        # Navigation and scrolling sit inside the try block so that a page
        # that fails to load produces a placeholder row instead of aborting
        # the whole batch and losing the rows gathered so far.
        browser.get(url)

        total_wait_time = random.uniform(5, 12)
        random_scroll(browser, total_wait_time)

        # SCRAPE TITLE
        title_element = browser.find_element(By.CSS_SELECTOR, "h1.headline-1.filmtitle span.name.js-widont.prettify")
        titles = title_element.text.strip()

        # SCRAPE YEAR
        year_element = browser.find_element(By.CSS_SELECTOR, "div.releaseyear a")
        years = year_element.text.strip()

        # SCRAPE AVERAGE RATING AND NUMBER OF RATINGS
        # Both values come from the same element, so when it is missing both
        # fall back to placeholder text.
        try:
            average_rating_element = browser.find_element(By.CSS_SELECTOR, "span.average-rating a.tooltip.display-rating ")
            average_ratings = average_rating_element.text.strip()
            number_ratings = average_rating_element.get_attribute('data-original-title')
        except NoSuchElementException:
            average_ratings = "No average rating available"
            number_ratings = "No number of ratings available"

        # SCRAPE LENGTHS
        lengths = browser.find_element(By.CSS_SELECTOR, "p.text-link.text-footer").text

        # SCRAPE GENRES
        # find_elements returns an empty list (it does not raise) when nothing
        # matches, so an empty result is the only "no genres" case to handle.
        genre_elements = browser.find_elements(By.CSS_SELECTOR, "div.text-sluglist.capitalize a.text-slug")
        genres = [genre.text.strip() for genre in genre_elements] or ['No genres available']

        # Append this movie's data to the movie_data list
        movie_data.append({
            'title': titles,
            'year': years,
            'number_ratings': number_ratings,
            'average_rating': average_ratings,
            'length': lengths,
            'genres': ", ".join(genres),
        })

    except Exception as e:
        # Best-effort: report the failure and keep a placeholder row so the
        # output has one row per URL, in the same order as modifed_urls.
        print(f"Error scraping {url}: {e}")
        movie_data.append({
            'title': None,
            'year': None,
            'number_ratings': None,
            'average_rating': None,
            'length': None,
            'genres': None,
        })

    # keep a tracker to know when each URL has been processed
    print(f"Finished scraping {url}")

# Shut down the browser. quit() ends the whole driver session, whereas
# close() only closes the current window.
browser.quit()

### YEAR 2017 - Create a pandas data frame 'movie_data_2017'

In [None]:
# Build a DataFrame from the list of per-movie dicts collected in movie_data.
movie_data_2017 = pd.DataFrame(movie_data)

# Show the DataFrame as cell output. `display` is not imported in this file —
# presumably the IPython/Jupyter built-in; confirm if running outside a notebook.
display(movie_data_2017)

### Save dataframe to a CSV file for cleaning 
Breakdown of the pages covered by each output file for the 2017 scrape.

| Pages        | File Name                               |
|--------------|-----------------------------------------|
| 1 - 37       | letterboxd_movie_data_2017_raw_1.csv    |
| 37 - 42      | letterboxd_movie_data_2017_raw_2.csv    |
| 42 - 55      | letterboxd_movie_data_2017_raw_3.csv    |
| 55 - 66      | letterboxd_movie_data_2017_raw_4.csv    |


In [None]:
# Write this batch to its own CSV (with a header row, without the index
# column) so it can be merged with the other batches during cleaning.
output_csv = "letterboxd_movie_data_2017_raw_4.csv"
movie_data_2017.to_csv(output_csv, index=False, encoding='utf-8')