In [1]:
import pandas as pd
import os
import ast
import wikipediaapi
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time

In [2]:
PATH_IN = 'dataset'

# --- Movie metadata: headerless TSV, column names assigned after loading ---
fname = os.path.join(PATH_IN, 'movie.metadata.tsv')
movie = pd.read_csv(fname, sep='\t', header=None)
movie.columns = [
    'Wikipedia movie ID',
    'Freebase movie ID',
    'Movie name',
    'Movie release date',
    'Movie box office revenue',
    'Movie runtime',
    'Movie languages (Freebase ID:name tuples)',
    'Movie countries (Freebase ID:name tuples)',
    'Movie genres (Freebase ID:name tuples)',
]

# --- Character metadata ---
fname = os.path.join(PATH_IN, 'character.metadata.tsv')
character = pd.read_csv(fname, sep='\t', header=None)
character.columns = [
    'Wikipedia movie ID',
    'Freebase movie ID',
    'Movie release date',
    'Character name',
    'Actor date of birth',
    # NOTE(review): the leading space in this label is kept exactly as it was;
    # renaming it could break cells that index the column by this name.
    ' Actor gender',
    'Actor height (in meters)',
    'Actor ethnicity (Freebase ID)',
    'Actor name',
    'Actor age at movie release',
    'Freebase character/actor map ID',
    'Freebase character ID',
    'Freebase actor ID',
]

# --- Plot summaries ---
fname = os.path.join(PATH_IN, 'plot_summaries.txt')
plot_summaries = pd.read_csv(fname, sep='\t', header=None)
plot_summaries.columns = ['Wikipedia movie ID', 'Summary']

# --- TV tropes clusters: each row is a trope name plus a dict literal stored as text ---
fname = os.path.join(PATH_IN, 'tvtropes.clusters.txt')
tvtropes = pd.read_csv(fname, sep='\t', header=None, names=['Trope', 'StringDict'])

# Parse the dict literals and flatten them into one column per key.
parsed_tropes = tvtropes['StringDict'].apply(ast.literal_eval)
df = pd.json_normalize(parsed_tropes)

# Copy the flattened fields into tvtropes under readable column names.
for source_key, column in {
    'char': 'Character Name',
    'movie': 'Movie name',
    'id': 'Freebase movie ID',
    'actor': 'Actor name',
}.items():
    tvtropes[column] = df[source_key]

# The raw text column is no longer needed once its fields are extracted.
tvtropes = tvtropes.drop(columns=['StringDict'])

# --- Character name clusters ---
fname = os.path.join(PATH_IN, 'name.clusters.txt')
name_clusters = pd.read_csv(fname, sep='\t', header=None, names=['Character Name', 'ID'])

In [85]:
def _release_year(raw_date):
    """Return the year for one raw release-date value.

    Values whose string form is exactly four characters long are returned
    unchanged; anything else is parsed with pandas, and values that cannot
    be parsed yield NaN.
    """
    if len(str(raw_date)) == 4:
        return raw_date
    return pd.to_datetime(raw_date, errors='coerce').year


release_years = movie["Movie release date"].apply(_release_year)
# Missing or unparseable dates become 0 so the column can be cast to int.
movie["Movie release year"] = release_years.fillna(0).astype(int)
movie["Movie release year"]


0        2001
1        2000
2        1988
3        1987
4        1983
         ... 
81736    2011
81737    2011
81738    1972
81739    1992
81740    2002
Name: Movie release year, Length: 81741, dtype: int32

In [10]:
# Inspect the distinct raw values found in the release-date column.
movie['Movie release date'].unique()

array(['2001-08-24', '2000-02-16', '1988', ..., '1927-11-22',
       '1972-09-22', '1992-05-21'], dtype=object)

In [12]:
# Display the plot summaries frame (columns: 'Wikipedia movie ID', 'Summary').
plot_summaries

Unnamed: 0,Wikipedia movie ID,Summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...
...,...,...
42298,34808485,"The story is about Reema , a young Muslim scho..."
42299,1096473,"In 1928 Hollywood, director Leo Andreyev look..."
42300,35102018,American Luthier focuses on Randy Parsons’ tra...
42301,8628195,"Abdur Rehman Khan , a middle-aged dry fruit se..."


In [3]:
# Display the tvtropes frame to check the columns extracted from the cluster dictionaries.
tvtropes

Unnamed: 0,Trope,Character Name,Movie name,Freebase movie ID,Actor name
0,absent_minded_professor,Professor Philip Brainard,Flubber,/m/0jy9q0,Robin Williams
1,absent_minded_professor,Professor Keenbean,Richie Rich,/m/02vchl3,Michael McShane
2,absent_minded_professor,Dr. Reinhardt Lane,The Shadow,/m/0k6fkc,Ian McKellen
3,absent_minded_professor,Dr. Harold Medford,Them!,/m/0k6_br,Edmund Gwenn
4,absent_minded_professor,Daniel Jackson,Stargate,/m/0k3rhh,James Spader
...,...,...,...,...,...
496,young_gun,Morgan Earp,Tombstone,/m/0k776f,Bill Paxton
497,young_gun,Colorado Ryan,Rio Bravo,/m/0k2kqg,Ricky Nelson
498,young_gun,Tom Sawyer,The League of Extraordinary Gentlemen,/m/0k5nsh,Shane West
499,young_gun,William H. 'Billy the Kid' Bonney,Young Guns II,/m/03lrjk0,Emilio Estevez


Generate the DataFrame containing the Oscar-winning films (scraped from the Academy Award for Best Cinematography page)

In [19]:


# Scrape film titles from the Wikipedia "Academy Award for Best Cinematography"
# page. Table rows whose inline style contains WINNER_ROW_COLOR are treated as
# winner rows; the text of their first cell is collected as the movie name.
WINNER_ROW_COLOR = 'rgb(250, 235, 134)'
movie_names = []

# For Windows users (point Selenium at a local chromedriver):
#service = Service(r'C:\webdrivers\chromedriver.exe')
#options = webdriver.ChromeOptions()
#driver = webdriver.Chrome(service = service, options = options)

# For Mac users
driver = webdriver.Chrome()

try:
    url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Cinematography"
    driver.get(url)

    # Fixed pause to give the page time to finish loading before querying it.
    time.sleep(5)

    tr_tags = driver.find_elements(By.TAG_NAME, 'tr')

    for tr_tag in tr_tags:
        # get_attribute can return None when the attribute is absent; fall back
        # to an empty string so the substring test below cannot raise.
        style = tr_tag.get_attribute('style') or ''

        if WINNER_ROW_COLOR in style:
            td_tags = tr_tag.find_elements(By.TAG_NAME, 'td')

            # Rows with fewer than two cells are skipped.
            if len(td_tags) > 1:
                movie_names.append(td_tags[0].text.strip())
finally:
    # Always close the browser, even if scraping fails part-way through.
    driver.quit()

movies_oscar_winning = pd.DataFrame(movie_names, columns=['Movie name'])

In [20]:
# Display the scraped movie names collected in the previous cell.
movies_oscar_winning

Unnamed: 0,Movie name
0,Sunrise: A Song of Two Humans
1,White Shadows in the South Seas
2,With Byrd at the South Pole
3,Tabu: A Story of the South Seas
4,Shanghai Express
...,...
120,1917
121,Mank
122,Dune
123,All Quiet on the Western Front


Generate the list of actors who won an Oscar (scraped from the Academy Award for Best Actor page) and flag them in the character data frame

In [5]:
# Scrape the Wikipedia "Academy Award for Best Actor" page, collect the names
# found in table cells marked with '‡', and add a boolean 'oscar' column to
# `character` telling whether each row's actor name is in that list.
oscar_winners = pd.DataFrame()
names_list = []

# Set up the WebDriver
# For Windows users (make sure ChromeDriver is installed):
#service = Service(r'C:\webdrivers\chromedriver.exe')
#options = webdriver.ChromeOptions()
#driver = webdriver.Chrome(service = service, options = options)

# For Mac users
driver = webdriver.Chrome()

try:
    # Specify the target URL
    url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actor"
    driver.get(url)

    # Fixed pause to give the page time to finish loading before querying it.
    time.sleep(5)

    # Walk every table row, then every cell of that row.
    for tr_tag in driver.find_elements(By.TAG_NAME, 'tr'):
        for td_tag in tr_tag.find_elements(By.TAG_NAME, 'td'):
            text = td_tag.text.strip()

            # Only cells containing '‡' are collected. Cells marked with
            # '§' (refused Oscar) or '†' (posthumous Oscar) were never added to
            # the list by the previous version either, so they are ignored here.
            if '‡' in text:
                cleaned_text = text.replace('‡', '').strip()
                # NOTE(review): only the first two words are kept, so a name
                # with more than two words is truncated and will not match the
                # full 'Actor name' value below — confirm this is intended.
                names_list.append(' '.join(cleaned_text.split()[:2]))
finally:
    # Always close the browser, even if scraping fails part-way through.
    driver.quit()

# Vectorised membership test; missing actor names yield False, exactly as the
# per-row `actor in names_list` check did.
character['oscar'] = character['Actor name'].isin(names_list)

In [30]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException
from urllib.parse import quote

#We can then adapt it for the movies which got an award
movie_titles = movies_oscar_winning['Movie name'].unique()
real_movie_titles = []
movie_reviews = pd.DataFrame(columns=['Movie name', 'Review'])

# NOTE(review): defined but never read in this cell — presumably meant to cap
# the number of reviews kept per movie; confirm before relying on it.
nb_reviews_per_movie = 1

movie_titles_dict = {}

# XPath of the paragraphs holding the text of each audience review.
REVIEW_XPATH = "//p[@class='audience-reviews__review js-review-text']"


def _open_movie_page(driver, movie_title):
    """Search Rotten Tomatoes for `movie_title`, open the first result and
    return the title text displayed on the page that opens.

    Raises a WebDriverException subclass (e.g. TimeoutException) when the
    search result link or the title element does not appear in time.
    """
    # Percent-encode the term so characters such as '&', '#' or '?' in a title
    # cannot corrupt the query string.
    search_term = quote(movie_title.replace(' ', '_').lower(), safe='')
    driver.get(f"https://www.rottentomatoes.com/search?search={search_term}")
    time.sleep(5)

    link = WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.XPATH, "//a[@data-qa='info-name']"))
    )
    link.click()
    time.sleep(5)

    # Wait until the rt-text element with the slot attribute is present
    rt_text_element = WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.XPATH, "//rt-text[@slot='title']"))
    )
    return rt_text_element.text


def _click_load_more_until_gone(driver):
    """Click the "Load More" button repeatedly until it can no longer be clicked."""
    while True:
        try:
            load_more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//rt-button[@data-qa='load-more-btn']"))
            )
            driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)
            load_more_button.click()

            # Wait for the new reviews to load
            time.sleep(3)  # Adjust if necessary based on your internet speed
        except WebDriverException:
            # Only Selenium failures end the loop; KeyboardInterrupt and other
            # non-Selenium errors are no longer swallowed.
            print("No more 'Load More' button found, exiting loop.")
            break


def _collect_user_reviews(driver):
    """Open the user-reviews page of the movie currently shown in `driver`
    and return the stripped text of every review element found there.

    Raises a WebDriverException subclass when no review element appears.
    """
    base_url = driver.current_url.rstrip('/')  # avoid a double slash in the URL
    driver.get(f"{base_url}/reviews?type=user")
    time.sleep(1)

    # Make sure at least the first batch of reviews is present.
    WebDriverWait(driver, 15).until(
        EC.presence_of_all_elements_located((By.XPATH, REVIEW_XPATH))
    )
    _click_load_more_until_gone(driver)

    # Query the elements only after pagination has finished: a list captured
    # before the clicks would not contain the reviews loaded afterwards.
    return [element.text.strip() for element in driver.find_elements(By.XPATH, REVIEW_XPATH)]


# For Windows users
#service = Service(r'C:\webdrivers\chromedriver.exe')
#options = webdriver.ChromeOptions()
#driver = webdriver.Chrome(service = service, options = options)

#For mac users
driver = webdriver.Chrome()

# Rows are accumulated in a plain list and turned into a DataFrame once,
# instead of growing the DataFrame with pd.concat on every review.
review_records = []

try:
    for movie_title in movie_titles:
        try:
            title_name = _open_movie_page(driver, movie_title)
            real_movie_titles.append(title_name)
            review_texts = _collect_user_reviews(driver)
        except WebDriverException as exc:
            # One unreachable title must not abort the whole scrape.
            print(f"Skipping {movie_title!r}: {type(exc).__name__}")
            continue

        review_records.extend(
            {'Movie name': title_name, 'Review': review_text}
            for review_text in review_texts
        )
finally:
    # Keep whatever was collected, then always close the browser.
    movie_reviews = pd.DataFrame(review_records, columns=['Movie name', 'Review'])
    driver.quit()


No more 'Load More' button found, exiting loop.


AttributeError: 'DataFrame' object has no attribute 'append'