In [None]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import requests
import selenium
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [68]:
def _parse_reviews(html_from_page):
    """Extract review texts and star ratings from the HTML of a reviews page.

    Parameters
    ----------
    html_from_page : str
        HTML source of the page to parse.

    Returns
    -------
    tuple[list, list]
        ``(reviews, stars)``, two lists of equal length with one entry per
        ``div.ipc-list-card__content`` container found. A container without
        review text yields ``"No review available"``; a container without a
        rating yields ``None``.
    """
    soup = BeautifulSoup(html_from_page, 'html.parser')

    reviews = []
    stars = []
    for container in soup.find_all('div', class_='ipc-list-card__content'):
        review_div = container.find('div', class_='ipc-html-content-inner-div')
        reviews.append(review_div.get_text(strip=True) if review_div else "No review available")

        star_rating_tag = container.find('span', class_='ipc-rating-star--rating')
        stars.append(star_rating_tag.get_text(strip=True) if star_rating_tag else None)

    return reviews, stars


def scrape_imdb_reviews(movie_id):
    """Scrape the reviews and star ratings shown on a movie's IMDb reviews page.

    Opens the page in a fresh Chrome session, clicks the second button labelled
    "Tout" when there is one, expands every "Spoiler" button, and then parses
    the resulting page source.

    Parameters
    ----------
    movie_id : str
        Identifier inserted into the reviews URL (e.g. ``"tt0094320"``).

    Returns
    -------
    tuple[list, list]
        ``(reviews, stars)`` as produced by ``_parse_reviews``.

    Raises
    ------
    Any exception raised by Selenium propagates to the caller; the browser is
    closed first in every case.
    """
    print('scrapping for id ' + str(movie_id))
    # URL for the movie's reviews page on IMDb
    url = f"https://www.imdb.com/title/{movie_id}/reviews"

    driver = webdriver.Chrome()
    try:
        # Navigation happens inside the try so that a failed page load still
        # reaches driver.quit() and does not leave a Chrome process behind.
        driver.get(url)
        # Fixed pause to let the page load.
        time.sleep(2)

        buttons = driver.find_elements(By.CLASS_NAME, "ipc-btn")

        # There are generally two buttons labelled "Tout"; the second one is clicked.
        # NOTE(review): "Tout" looks like the French-locale label for "All" —
        # confirm the label if the browser runs under a different language.
        tout_counter = 0
        for button in buttons:
            if button.text.strip() == "Tout":
                tout_counter += 1
            if tout_counter == 2:
                print('activating button Tout')
                # Scroll to the button to ensure it's visible before clicking.
                driver.execute_script("arguments[0].scrollIntoView();", button)
                time.sleep(2)
                button.click()
                break

        # Fixed pause to let the reviews load.
        time.sleep(10)

        print("spoilers")

        # Click every "Spoiler" button in a single in-browser script instead of
        # one WebDriver call per button.
        js_script = """
                const buttons = Array.from(document.querySelectorAll('span.ipc-btn__text'));
                buttons.forEach(button => {
                if (button.textContent.trim() === 'Spoiler') {
                    button.click();
                }
            });
        """
        driver.execute_script(js_script)

        # Snapshot of the page source after the interactions above.
        html_from_page = driver.page_source
    finally:
        driver.quit()

    # The snapshot is a plain string, so parsing no longer needs the browser.
    return _parse_reviews(html_from_page)

In [4]:
import pandas as pd
import os

# Locate the data folder relative to where the notebook is running:
# the project root is taken to be two directory levels above the working directory.
current_dir = os.getcwd()
project_root = os.path.split(os.path.split(current_dir)[0])[0]
data_dir = os.path.join(project_root, 'data')

# Paths of the two input tables inside the data folder.
csv_file_y, csv_file_g = (
    os.path.join(data_dir, file_name)
    for file_name in ('merged_data.csv', 'IMDBReviews1.csv')
)

# Load both tables, then preview the first rows of each.
df_y = pd.read_csv(csv_file_y)
df_g = pd.read_csv(csv_file_g)

print(df_y.head(), df_g.head(), sep='\n')


         0     tconst               2
0   975900  tt0228333    ghostsofmars
1  9363483  tt0094320   whiteoftheeye
2   261236  tt0083949  awomaninflames
3  6631279  tt0119548      littlecity
4   171005  tt0097499          henryv
   wikipedia_ID            name  release_year  rating  \
0        975900  Ghosts of Mars          2001     6.0   
1        975900  Ghosts of Mars          2001     1.0   
2        975900  Ghosts of Mars          2001     7.0   
3        975900  Ghosts of Mars          2001     7.0   
4        975900  Ghosts of Mars          2001     5.0   

                                       review_detail  
0  Oh no - not more zombies? Oh-yes; there's no e...  
1  Lost my v card to this one. And that's pretty ...  
2  I avoided seeing this film for the longest tim...  
3  Ghosts of Mars: Saw it again last night. I've ...  
4  The special effects for the de-limbing and dec...  


In [50]:
# Both tables are compared on their first column.
first_column_y, first_column_g = df_y.columns[0], df_g.columns[0]

# Keep only the movies not found by Guillaume, i.e. rows of df_y whose
# first-column value does not appear in the first column of df_g.
df_filtered = df_y.loc[~df_y[first_column_y].isin(df_g[first_column_g])]

print(df_filtered.head(), df_filtered.shape, sep='\n')

           0      tconst                     2
1    9363483   tt0094320         whiteoftheeye
3    6631279   tt0119548            littlecity
6   32456683   tt0061637  diefahnevonkriwojrog
8   20604092  tt13667080            anbuthozhi
10  11448183   tt0367546       bindiyachamkegi
(5524, 3)


In [70]:
# Get the reviews for the first 500 movies of df_filtered, then write them to a CSV file.

expanded_rows = []

for index, row in df_filtered.head(500).iterrows():
    tconst = row['tconst']
    reviews, stars = scrape_imdb_reviews(tconst)

    # One output row per review: the movie's original columns followed by
    # the review text and its star rating.
    for review, star in zip(reviews, stars):
        new_row = row.tolist() + [review, star]
        expanded_rows.append(new_row)

# Create a new DataFrame with the expanded data. The rows were built from
# df_filtered, so its columns name the leading fields (the previous reference
# to an undefined `df` raised NameError on a fresh kernel, after the scraping
# had already finished).
expanded_df_first = pd.DataFrame(expanded_rows, columns=list(df_filtered.columns) + ['review', 'star'])

# Display the expanded DataFrame and persist it.
print(expanded_df_first.head())
expanded_df_first.to_csv('expanded_df_first.csv', index=False)

scrapping for idtt0094320
spoilers
scrapping for idtt0119548
spoilers
scrapping for idtt0061637
spoilers
scrapping for idtt13667080
spoilers
scrapping for idtt0367546
spoilers
scrapping for idtt0163238
spoilers
scrapping for idtt0323365
spoilers
scrapping for idtt1792799
spoilers
scrapping for idtt0479697
spoilers
scrapping for idtt1516552
spoilers
scrapping for idtt0799954
je vais activer le bouton
spoilers
scrapping for idtt14338824
spoilers
scrapping for idtt1715210
spoilers
scrapping for idtt0249131
spoilers
scrapping for idtt0025198
spoilers
scrapping for idtt0088461
spoilers
scrapping for idtt2378334
spoilers
scrapping for idtt0461472
spoilers
scrapping for idtt0152930
je vais activer le bouton
spoilers
scrapping for idtt1400528
spoilers
scrapping for idtt0261167
spoilers
scrapping for idtt0091922
spoilers
scrapping for idtt0317354
spoilers
scrapping for idtt1786668
spoilers
scrapping for idtt0125994
spoilers
scrapping for idtt0291891
spoilers
scrapping for idtt0068774
spoilers
s