In [69]:
import re
import time
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

class LetterboxdScraper:
    """Collect the films listed on a Letterboxd user's ``/films`` pages.

    Every downloaded page is parsed into its own local ``BeautifulSoup``
    object, so the worker threads used by ``scrape_all_pages`` never share
    parser state. Results accumulate in three parallel lists that always
    have the same length:

    * ``names``  - film titles (the poster image ``alt`` text)
    * ``rating`` - the user's star rating as a number, or ``None`` if unrated
    * ``links``  - absolute URLs of the film pages

    Args:
        user: Letterboxd user name whose film list is scraped.
        headers: HTTP headers sent with every request.
        base_url: Site root, without a trailing slash.
        timeout: Seconds to wait for each HTTP response.
    """

    # Page title checked to detect a missing user or page.
    NOT_FOUND_TITLE = "Letterboxd - Not Found"

    def __init__(self, user, headers, base_url="https://letterboxd.com", timeout=10):
        self.user = user
        self.headers = headers
        self.base_url = base_url
        self.timeout = timeout
        self.names = []
        self.links = []
        self.rating = []

    # ------------------------------------------------------------------ helpers

    def _fetch_page(self, page_number):
        """Download one page of the user's film list and return it parsed."""
        response = requests.get(
            f"{self.base_url}/{self.user}/films/page/{page_number}",
            headers=self.headers,
            timeout=self.timeout,  # without a timeout a stalled socket blocks a worker forever
        )
        return BeautifulSoup(response.text, 'html.parser')

    def _is_not_found(self, soup):
        """Return True when the parsed page carries the 'not found' title."""
        return soup.title is not None and soup.title.string == self.NOT_FOUND_TITLE

    @staticmethod
    def _stars_to_rating(text):
        """Convert a star string such as '★★★½' to a number (3.5)."""
        return text.count('★') + text.count('½') / 2

    def _parse_films(self, soup):
        """Extract ``(name, rating, link)`` tuples from one parsed page.

        An entry missing either its title or its link is skipped entirely,
        which keeps the three result lists aligned.
        """
        records = []
        for item in soup.find_all('li', {"class": "poster-container"}):
            image = item.find('img', alt=True)

            # The link attribute may sit on the <li> itself or on a descendant.
            target = item.get('data-target-link')
            if target is None:
                holder = item.find(attrs={'data-target-link': True})
                target = holder.get('data-target-link') if holder is not None else None

            if image is None or target is None:
                continue

            rating_span = item.find('span', class_='rating')
            rating = self._stars_to_rating(rating_span.text) if rating_span is not None else None

            # Reading the attribute (instead of regex-matching the serialized
            # tag) yields the decoded title, e.g. '&' rather than '&amp;'.
            records.append((image['alt'], rating, f"{self.base_url}{target}"))
        return records

    def _collect_page(self, page_number):
        """Fetch and parse one page, returning its records without storing them.

        Raises:
            ValueError: If the page does not exist for this user.
        """
        soup = self._fetch_page(page_number)
        if self._is_not_found(soup):
            raise ValueError(f"Página {page_number} não encontrada para o usuário {self.user}.")
        return self._parse_films(soup)

    def _store(self, records):
        """Append records to the result lists, one full row at a time."""
        for name, rating, link in records:
            self.names.append(name)
            self.rating.append(rating)
            self.links.append(link)

    # --------------------------------------------------------------- public API

    def get_total_pages(self):
        """Return how many pages the user's film list spans (at least 1).

        Raises:
            ValueError: If the user does not exist on Letterboxd.
        """
        soup = self._fetch_page(1)

        if self._is_not_found(soup):
            raise ValueError("Usuário não encontrado no Letterboxd.")

        paginate = soup.find_all('li', {'class': 'paginate-page'})
        if paginate:
            # The last pagination item holds the highest page number.
            match = re.search(r'>(\d+)<', str(paginate[-1]))
            return int(match.group(1)) if match else 1
        return 1

    def scrape_page(self, page_number):
        """Scrape a single page and add its films to the result lists.

        Raises:
            ValueError: If the page does not exist for this user.
        """
        self._store(self._collect_page(page_number))

    def scrape_all_pages(self):
        """Scrape every page of the user's film list using a thread pool.

        Pages are downloaded concurrently, but their records are stored from
        the calling thread in page order, so the output is deterministic and
        the result lists are never mutated from several threads at once.
        """
        total_pages = self.get_total_pages()

        with ThreadPoolExecutor(max_workers=10) as executor:
            # map() yields results in input order and re-raises worker errors here.
            pages = list(executor.map(self._collect_page, range(1, total_pages + 1)))

        for records in pages:
            self._store(records)

    def to_dataframe(self):
        """Return the collected films as a DataFrame (Name, Ratings, Link)."""
        return pd.DataFrame({
            "Name": self.names,
            "Ratings": self.rating,
            "Link": self.links
        })

    
# Example usage of the scraper
if __name__ == "__main__":

    user = "kizardas"
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }
    start = time.time()
    scraper = LetterboxdScraper(user=user, headers=headers)
    scraper.scrape_all_pages()
    df = scraper.to_dataframe()

    # Optional: persist the scraped films as CSV.
    # to_csv does not create missing directories, so make sure "data/" exists
    # before writing; otherwise a fresh checkout fails on this line.
    output_path = Path("data") / f"{user}_data.csv"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    end = time.time()
    print(end-start)  # elapsed wall-clock seconds, including the CSV write

3.1755125522613525


In [73]:
from concurrent.futures import ThreadPoolExecutor
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

# Request headers sent with every film-page fetch
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
}

# Use a forward slash so the path resolves on every OS: a backslash is a path
# separator only on Windows, and "\e" is not a valid string escape sequence.
dta = pd.read_csv("data/exorgravity_data.csv")

# Number of scraped fields that follow film_id in every result tuple.
_N_FIELDS = 10


def _search_or_none(pattern, text, group=0):
    """Return the requested group of the first regex match in text, or None."""
    match = re.search(pattern, text)
    return match.group(group) if match else None


def _stat_count(soup, css_class):
    """Return the number in the title attribute of the first <li> with css_class.

    The value is returned as the raw string found in the markup (it may
    contain thousands separators), or None when the element is missing.
    """
    item = soup.find("li", {"class": css_class})
    if item is None:
        return None
    return _search_or_none(r'title="([0-9,]+)', str(item), group=1)


def fetch_data(row):
    """Scrape the metadata of one film page.

    Args:
        row: A sequence whose first element identifies the film and whose
            last element is the film page URL. Both two-column rows and rows
            with extra columns in between (e.g. Name, Ratings, Link) work.

    Returns:
        An 11-tuple ``(film_id, avr_rating, year, duration, genre, lang,
        members, fans, likes, lists, reviews)``. Each field that cannot be
        extracted is ``None`` (``genre`` is ``[]``). When the page is not
        found or the request fails, every field after ``film_id`` is ``None``.
        The tuple length is the same on every path so the results can be
        loaded straight into a DataFrame with fixed columns.
    """
    film_id, url = row[0], row[-1]
    empty = (film_id,) + (None,) * _N_FIELDS

    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        site = BeautifulSoup(response.text, 'html.parser')

        # Bail out before issuing the second request for a missing film.
        if 'Not Found' in str(site.title):
            return empty

        response2 = requests.get(url.rstrip("/") + "/fans/", headers=HEADERS, timeout=10)
        site2 = BeautifulSoup(response2.text, 'html.parser')
    except requests.RequestException:
        return empty

    # Average rating, e.g. "3.27", read from the twitter:data2 meta tag.
    avr_rating = _search_or_none(
        r'[0-9]\.[0-9][0-9]',
        str(site.find('meta', {'name': 'twitter:data2'})),
    )

    # The year is read from the second "releaseyear" div on the page
    # (NOTE(review): presumably the first is a hidden/mobile duplicate - confirm).
    year_divs = site.find_all('div', {"class": "releaseyear"})
    year = _search_or_none(r">([0-9]+)<", str(year_divs[1]), group=1) if len(year_divs) > 1 else None

    # First run of digits in the footer paragraph.
    duration = _search_or_none(r'[0-9]+', str(site.find('p', {'class': 'text-link text-footer'})))

    genre_section = site.find('div', {'class': 'text-sluglist capitalize'})
    genre = [anchor.text for anchor in genre_section.find_all('a')] if genre_section is not None else []

    lang_link = site.select_one('a[href^="/films/language"]')
    lang = lang_link.text.strip() if lang_link is not None else None

    # Community statistics all share the same markup on the fans page.
    members = _stat_count(site2, "js-route-watches")
    fans = _stat_count(site2, "js-route-fans")
    likes = _stat_count(site2, "js-route-likes")
    lists = _stat_count(site2, "js-route-lists")
    reviews = _stat_count(site2, "js-route-reviews")

    return film_id, avr_rating, year, duration, genre, lang, members, fans, likes, lists, reviews


# Fan the per-film fetches out over a thread pool. executor.map keeps the
# results in the same order as the input rows; tqdm shows progress as they
# are consumed.
with ThreadPoolExecutor(max_workers=10) as executor:
    results = list(
        tqdm(
            executor.map(fetch_data, dta.values),
            total=len(dta),
        )
    )

# One column per element of the tuples returned by fetch_data
columns = [
    'Name',
    'Avr_Rating',
    'Year',
    'Duration',
    'Genre',
    'Language',
    'Members',
    'Fans',
    'Likes',
    'Lists',
    'Reviews',
]
raw = pd.DataFrame(data=results, columns=columns)

print('Search completed!')


100%|██████████| 12/12 [00:11<00:00,  1.02it/s]

Search completed!





In [None]:
# Display the per-film metadata table assembled in the previous cell
raw

Unnamed: 0,Name,Avr_Rating,Year,Duration,Genre,Language,Members,Fans,Likes,Lists,Reviews
0,Mufasa: The Lion King,2.9,2024,118,"[Family, Adventure, Animation]",English,238700,293,51426,35336,88050
1,Smile 2,3.27,2024,127,"[Mystery, Horror]",English,596171,1604,158432,106604,210972
2,Speak No Evil,3.27,2024,110,"[Horror, Thriller]",English,357356,254,82489,68209,116675
3,I'm Still Here,4.36,2024,137,"[Drama, History]",Portuguese,287222,8448,178666,66373,125246
4,The Substance,3.82,2024,141,"[Horror, Science Fiction]",English,2114043,21961,740536,354296,794337
5,Close,4.11,2022,104,[Drama],French,384220,11023,160143,91458,111279
6,Incantation,3.21,2022,111,[Horror],Chinese,178124,1267,37697,31794,43286
7,"Ó Paí, Ó: Look at This",3.78,2007,92,"[Comedy, Music]",Portuguese,24235,271,7088,5533,4434
8,Carandiru,4.1,2003,145,[Drama],Portuguese,79572,1483,29783,16727,18410
9,How to Lose a Guy in 10 Days,3.69,2003,116,"[Comedy, Romance]",English,1498701,38527,488530,190648,229903
