In [1]:
import numpy as np 
import pandas as pd 
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait,Select
from selenium.common.exceptions import NoSuchElementException
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import requests
import subprocess
import re
from tqdm import tqdm
tqdm.pandas()

import os
os.chdir(os.path.abspath(os.path.join(os.getcwd(), "..", "..")))

from logger import logger

In [2]:
pwd

'c:\\Users\\Harish Raju\\Desktop\\projects\\IMDB'

In [None]:
import time
import re
import json
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException
from tqdm import tqdm
import os

class IMDbReviewScraper:
    """A scraper to extract user reviews from multiple IMDb movie pages."""

    def __init__(self, movie_urls,movies_dict, progress_file="web_scrapper/data/progress.json", output_file="web_scrapper/data/imdb_reviews.csv"):
        """
        Initializes the IMDbReviewScraper.

        Args:
            movie_urls (list): List of IMDb movie URLs to scrape.
            progress_file (str): Path to the progress log file.
            output_file (str): Path to the output CSV file.
        """
        self.movie_urls = movie_urls
        self.movies_dict = movies_dict
        self.progress_file = progress_file
        self.output_file = output_file
        self.driver = webdriver.Chrome()
        self.all_reviews = []
        self.completed_urls = self.load_progress()

    def load_progress(self):
        """
        Loads progress from the progress file.

        Returns:
            set: A set of completed movie URLs.
        """
        if os.path.exists(self.progress_file):
            with open(self.progress_file, "r") as file:
                return set(json.load(file))
        return set()

    def save_progress(self, url):
        """
        Saves the completed movie URL to the progress file.

        Args:
            url (str): The URL of the completed movie.
        """
        self.completed_urls.add(url)
        with open(self.progress_file, "w") as file:
            json.dump(list(self.completed_urls), file)

    def append_to_csv(self, reviews):
        """
        Appends scraped reviews to the output CSV file.

        Args:
            reviews (list): List of review dictionaries.
        """
        df = pd.DataFrame(reviews)
        df.to_csv(self.output_file, mode='a', header=not os.path.exists(self.output_file), index=False)

    def get_reviews_page(self, url):
        """Navigates to the IMDb movie reviews page."""
        self.driver.get(url)
        try:
            # Wait until the reviews header is present and clickable
            click_reviews = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//div[@data-testid='reviews-header']"))
            )
            return click_reviews.find_element(By.TAG_NAME, 'a').get_attribute('href')

        except NoSuchElementException:
            logger.info(f"Reviews link not found for {url}")
            return None

        except Exception as e:
            logger.error(f"An error occurred: {e}")
            return None

    def get_total_reviews_count(self):
        """Extracts the total number of reviews available for the movie."""
        try:
            reviews_element = self.driver.find_element(By.XPATH, "//div[@data-testid='tturv-total-reviews']")
            return int(re.sub(r"[^\d]", "", reviews_element.text))
        except NoSuchElementException:
            logger.info("Could not retrieve total reviews count.")
            return 0
    
    def click_show_all_button(self):
        """
        Clicks the 'All' button inside the pagination div to load all reviews.
        If the 'All' button is not clickable, clicks the 'more' button instead.
        """
        try:
            pagination_div = self.driver.find_element(By.XPATH, "//div[@data-testid='tturv-pagination']")
            
            # Attempt to find and click the 'All' button
            try:
                all_button = pagination_div.find_element(By.XPATH, ".//span[contains(@class, 'ipc-see-more')]/button[.//span[text()='All']]")
                if all_button.is_displayed() and all_button.is_enabled():
                    logger.info("Clicking 'All' button")
                    ActionChains(self.driver).move_to_element(all_button).click().perform()
                    return
            except Exception:
                # logger.warning("'All' button not found or not clickable")
                pass
            
            # Attempt to find and click the 'more' button if 'All' is not available
            try:
                # Wait for the pagination div to be present
                pagination_div = WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, "//div[@data-testid='tturv-pagination']"))
                )

                # Locate the 'more' button inside the pagination div
                more_button = pagination_div.find_element(By.XPATH, ".//button[.//span[contains(text(), ' more')]]")

                # Ensure the button is visible and clickable
                if more_button.is_displayed() and more_button.is_enabled():
                    # logger.info("Clicking 'more' button")
                    
                    # Use ActionChains to ensure smooth interaction
                    ActionChains(self.driver).move_to_element(more_button).click().perform()
                    return
            except Exception as e:
                # logger.warning(f"'More' button not found or not clickable: {str(e)}")
                pass
            
            # logger.info("No clickable 'All' or 'more' button found, skipping action")
        
        except Exception as e:
            # logger.error(f"Error clicking pagination buttons: {e}")
            pass

    def load_all_reviews(self, reviews_count, max_wait_time=1200, polling_interval=1.5):
        """Loads all available reviews by scrolling and clicking 'Show All'."""
        start_time = time.time()
        self.click_show_all_button()
        stable_count = 0
        last_count = 0
        
        while True:
            articles = self.driver.find_elements(By.TAG_NAME, "article")
            # logger.info(f"extracted - {len(articles)} review articles")
            
            if len(articles) >= reviews_count or time.time() - start_time > max_wait_time:
                break
            
            if len(articles) == last_count:
                stable_count += 1
            else:
                stable_count = 0
            
            if stable_count >= 5:
                logger.info("Article count has remained the same for 5 consecutive checks. Stopping load.")
                break
            
            last_count = len(articles)
            
            if articles:
                self.driver.execute_script("arguments[0].scrollIntoView();", articles[-1])
            
            time.sleep(polling_interval)
            self.click_show_all_button()
        
        return articles
    
    def extract_review_details(self, articles, movie_url):
        """Extracts review details from the loaded review articles."""
        extracted_reviews = []
        for article in tqdm(articles,desc='extracting review details'):
            try:
                review_star = article.find_element(By.CLASS_NAME, 'ipc-rating-star--rating').text
            except NoSuchElementException:
                review_star = None
            try:
                max_star = article.find_element(By.CLASS_NAME, 'ipc-rating-star--maxRating').text[1:]
            except NoSuchElementException:
                max_star = None
            try:
                review_title = article.find_element(By.CLASS_NAME, 'ipc-title__text').text
            except NoSuchElementException:
                review_title = None
            try:
                review_text = article.find_element(By.CLASS_NAME, "ipc-overflowText--children").text
            except NoSuchElementException:
                review_text = None
            extracted_reviews.append({
                "movie_url": movie_url,
                "review_star": review_star,
                "max_star": max_star,
                "review_title": review_title,
                "review_text": review_text
            })
        return extracted_reviews

    def scrape_all_movies(self):
        """Scrapes reviews for all movies in the movie_urls list."""
        remaining_urls = [url for url in self.movie_urls if url not in self.completed_urls]
        la = 0
        for idx, url in enumerate(remaining_urls, start=1):
            try:
                logger.info(f"Starting - {idx} Scraping reviews for: {self.movies_dict[url]}")
                reviews_page = self.get_reviews_page(url)
                if not reviews_page:
                    self.save_progress(url)
                    continue
                self.driver.get(reviews_page)
                time.sleep(3)
                reviews_count = self.get_total_reviews_count()
                logger.info(f"total reviews - {reviews_count}")
                if reviews_count == 0:
                    self.save_progress(url)
                    continue
                articles = self.load_all_reviews(reviews_count)
                reviews_data = self.extract_review_details(articles, url)
                self.append_to_csv(reviews_data)
                self.save_progress(url)
                la+=1
                logger.info(f" {la} movie/movies successfully extracted")
                if idx % 10000 == 0:
                    user_input = input("Processed 10000 movies. Do you want to continue? (yes/no): ")
                    if user_input.lower() != 'yes':
                        logger.info("Stopping as per user request.")
                        break
            except:
                logger.error(f"error for - {idx} Scraping reviews for: {self.movies_dict[url]}")
        logger.info("Scraping completed!")

    def close_driver(self):
        """Closes the WebDriver."""
        self.driver.quit()

In [7]:
# from time import time

In [8]:
import pandas as pd

In [9]:
movies_index = pd.read_csv(r"C:\Users\Harish Raju\Desktop\projects\IMDB\web_scrapper\data\Movies_index_final.csv")
movie_urls = movies_index['IMDb Link'].to_list() #[movies_index['category']=='3.1_6']

In [10]:
movies_index.head()

Unnamed: 0,Title,IMDb Link,category
0,Radhe,https://www.imdb.com/title/tt10888594/?ref_=sr...,1_3
1,Adipurush,https://www.imdb.com/title/tt12915716/?ref_=sr...,1_3
2,Meet the Spartans,https://www.imdb.com/title/tt1073498/?ref_=sr_i_3,1_3
3,Epic Movie,https://www.imdb.com/title/tt0799949/?ref_=sr_i_4,1_3
4,Sadak 2,https://www.imdb.com/title/tt7886848/?ref_=sr_i_5,1_3


In [11]:
len(movie_urls)

28100

In [13]:
movies_index['category'].value_counts()

category
3.1_6     18772
6.1_10     6840
1_3        2488
Name: count, dtype: int64

In [14]:
movies_dict = dict(zip(movies_index['IMDb Link'],movies_index['Title']))

In [None]:
scraper = IMDbReviewScraper(movie_urls,movies_dict)
scraper.scrape_all_movies()
scraper.close_driver()

2025-04-05 12:22:14 - [INFO] - (813897698.py:305) - Starting - 1 Scraping reviews for: Scare Campaign
2025-04-05 12:22:25 - [INFO] - (813897698.py:313) - total reviews - 31
extracting review details: 100%|██████████| 31/31 [00:02<00:00, 13.93it/s]
2025-04-05 12:22:29 - [INFO] - (813897698.py:322) -  1 movie/movies successfully extracted
2025-04-05 12:22:29 - [INFO] - (813897698.py:305) - Starting - 2 Scraping reviews for: Jazbaa
2025-04-05 12:22:38 - [INFO] - (813897698.py:313) - total reviews - 35
extracting review details: 100%|██████████| 35/35 [00:02<00:00, 12.77it/s]
2025-04-05 12:22:42 - [INFO] - (813897698.py:322) -  2 movie/movies successfully extracted
2025-04-05 12:22:42 - [INFO] - (813897698.py:305) - Starting - 3 Scraping reviews for: The People That Time Forgot
2025-04-05 12:22:50 - [INFO] - (813897698.py:313) - total reviews - 63
2025-04-05 12:22:50 - [INFO] - (813897698.py:203) - Clicking 'All' button
extracting review details: 100%|██████████| 63/63 [00:04<00:00, 13.02i

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv(r'C:\Users\Harish Raju\Desktop\projects\IMDB\imdb_reviews.csv')

In [3]:
df

Unnamed: 0,movie_url,review_star,max_star,review_title,review_text
0,https://www.imdb.com/title/tt0028597/?ref_=sr_...,8.0,10.0,"You're A Lucky Fellow, Mr. Smith",Cary Grant and Irene Dunne catch each other in...
1,https://www.imdb.com/title/tt0028597/?ref_=sr_...,8.0,10.0,Classic 'screwball comedy',"Jerry and Lucy, a mutually distrustful couple ..."
2,https://www.imdb.com/title/tt0028597/?ref_=sr_...,8.0,10.0,a delight,Irene Dunne and Cary Grant were wonderful toge...
3,https://www.imdb.com/title/tt0028597/?ref_=sr_...,10.0,10.0,We're In On the Joke,This movie is exquisitely directed and acted. ...
4,https://www.imdb.com/title/tt0028597/?ref_=sr_...,10.0,10.0,A masterpiece of brilliant anarchy,"Nothing in this movie makes sense, and it real..."
...,...,...,...,...,...
1567891,https://www.imdb.com/title/tt1065106/?ref_=sr_...,7.0,10.0,Pretty funny. Nice ending,I thought this film was pretty funny. If you'r...
1567892,https://www.imdb.com/title/tt1065106/?ref_=sr_...,1.0,10.0,This is by far the worst movie ever.,"Again, let me repeat it. This is by far the wo..."
1567893,https://www.imdb.com/title/tt1065106/?ref_=sr_...,10.0,10.0,What fun!,"Oh, I expected this to be complete rubbish but..."
1567894,https://www.imdb.com/title/tt1065106/?ref_=sr_...,7.0,10.0,Low budget flick with heart and good characters,"I saw this movie years ago, and i gave it a 8 ..."


In [6]:
df['review_text'].isna().sum()

np.int64(304941)

In [7]:
df[(df['review_star'].notna()) & (df['review_text'].notna())]['review_star'].apply(lambda x: 'pos' if x > 7 else ('neutral' if 4 <= x <= 7 else 'neg')).value_counts()

review_star
pos        440272
neutral    373359
neg        349537
Name: count, dtype: int64

In [8]:
df['movie_url'].value_counts()

movie_url
https://www.imdb.com/title/tt10838180/?ref_=sr_i_17      5169
https://www.imdb.com/title/tt10350626/?ref_=sr_i_1086    4752
https://www.imdb.com/title/tt10350922/?ref_=sr_i_16      4735
https://www.imdb.com/title/tt10230426/?ref_=sr_i_1842    4418
https://www.imdb.com/title/tt10309902/?ref_=sr_i_6407    3374
                                                         ... 
https://www.imdb.com/title/tt14675334/?ref_=sr_i_2448       1
https://www.imdb.com/title/tt1171709/?ref_=sr_i_3227        1
https://www.imdb.com/title/tt0496595/?ref_=sr_i_1337        1
https://www.imdb.com/title/tt9315418/?ref_=sr_i_5869        1
https://www.imdb.com/title/tt0384265/?ref_=sr_i_1563        1
Name: count, Length: 10869, dtype: int64

In [9]:
df['review_star'].value_counts()

review_star
10.0    254631
1.0     251803
7.0     154997
8.0     151166
6.0     128002
9.0     114509
5.0     110588
2.0      98767
3.0      95913
4.0      91093
Name: count, dtype: int64

In [10]:
df['review_star'].apply(lambda x:'pos' if x>7 else 'neg').value_counts()

review_star
neg    1034082
pos     520306
Name: count, dtype: int64

In [11]:
df['review_star'].apply(lambda x: 'pos' if x > 7 else ('neutral' if 4 <= x <= 7 else 'neg')).value_counts()

review_star
neg        549402
pos        520306
neutral    484680
Name: count, dtype: int64

In [10]:
# url = 'https://www.imdb.com/title/tt10888594/?ref_=sr_i_1'

In [11]:
# driver = webdriver.Chrome()

# reviews_link = url
# # got movie title and link

# driver.get(reviews_link)
# time.sleep(3)
# click_reviews = driver.find_element(By.XPATH, "//div[@data-testid='reviews-header']")
# reviews_link = click_reviews.find_element(By.TAG_NAME,'a').get_attribute('href')
# driver.get(reviews_link)
# reviews_element = driver.find_element(By.XPATH, "//div[@data-testid='tturv-total-reviews']")
# reviews_text = reviews_element.text
# # Use regex to extract only the numeric part
# reviews_count = int(re.sub(r"[^\d]", "", reviews_text))

# print(reviews_count)
# def click_all_button():
#     #  click button to show all reviews
#     try:
#         # Locate the correct <span> first, then find the button inside it
#         span_element = driver.find_element(By.XPATH, "//span[contains(@class, 'chained-see-more-button')]")
#         button = span_element.find_element(By.TAG_NAME, "button")

#         if button:
#             # Click the button using ActionChains (sometimes needed for better interaction)
#             ActionChains(driver).move_to_element(button).click().perform()
        
#             print("Correct 'All' button clicked successfully.")
#     except Exception as e:
#         print(f"Error: {e}")
# max_wait_time = 600  
# polling_interval = 1.5  # Check every 1.5 seconds
# target_count = reviews_count  # Expected number of articles

# start_time = time.time()

# click_all_button()

# while True:
#     # Count current number of loaded articles
#     articles = driver.find_elements(By.TAG_NAME, "article")
#     current_count = len(articles)

#     print(f"Loaded articles: {current_count}")

#     # If we reach the target, break the loop
#     if current_count >= target_count:
#         print("All articles loaded!")
#         break

#     # Stop waiting if max wait time is exceeded
#     if time.time() - start_time > max_wait_time:
#         print("Timeout reached. Extracting available articles.")
#         break
    

#     if articles:
#         last_article = articles[-1]  # Get the last article
#         driver.execute_script("arguments[0].scrollIntoView();", last_article)
#         # print("Scrolled to the last article.")
#     else:
#         # print("No articles found.")
#         pass
    
#     click_all_button()
    
#     time.sleep(polling_interval)  # Wait before checking again


In [12]:
# sample_articles = articles[:100]
# review_stars = []
# max_stars = []
# review_titles = []
# review_texts = []



# for article in tqdm(sample_articles):
    
#     review_star = None
#     try:
#         review_star =  article.find_element(By.CLASS_NAME,'ipc-rating-star--rating').text
#     except:
#         review_star = None
        
#     review_stars.append(review_star)
        
    
        
#     max_star = None
#     try:
#         max_star =  article.find_element(By.CLASS_NAME,'ipc-rating-star--maxRating').text[1:]
#     except:
#         max_star = None
#     max_stars.append(max_star)
        
#     review_title = None
#     try:
#         review_title =  article.find_element(By.CLASS_NAME,'ipc-title__text').text
#     except:
#         review_title = None
    
#     review_titles.append(review_title)
        

    
#     # review_text = None

#     # try:
#     #     # Check if Spoiler button exists
#     #     spoiler_button = article.find_element(By.XPATH, "//button[contains(@class, 'review-spoiler-button')]")
#     #     if spoiler_button:
#     #         review_text = None  # Set to None if spoiler button is found
#     #     else:
#     #         review_text = article.find_element(By.XPATH, "//div[@data-testid='review-overflow']").text
#     # except NoSuchElementException:
#     #     # If no spoiler button is found, try extracting review
#     #     try:
#     #         review_text = article.find_element(By.XPATH, "//div[@data-testid='review-overflow']").text
#     #     except NoSuchElementException:
#     #         review_text = None  # Set to None if review is also not found
    
    
#     review_text = None
#     try:
#         review_text =  article.find_element(By.CLASS_NAME,"ipc-overflowText--children").text
#     except:
#         review_text = None
            
#     review_texts.append(review_text)
        
    