# Importing necessary libraries

In [1]:
import numpy as np 
import pandas as pd 
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait,Select
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import requests
import subprocess
import re
from tqdm import tqdm
tqdm.pandas()

import os

# Switch the working directory to the project root, two levels above the
# directory the kernel started in (presumably so the project-local imports in
# the next cell resolve -- confirm against the repository layout).
# The root is computed only once per kernel session: an unconditional relative
# chdir would climb two further directories every time this cell is re-run.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
os.chdir(PROJECT_ROOT)

In [2]:
from utils.web_scrapping import random_scroll,scroll_to_element,scroll_to_bottom
from logger import logger

# Setting up the Selenium web driver and opening the webpage

In [3]:
# Record that the browser launch is about to begin.
logger.info("starting chrome using selenium")

2025-03-12 15:53:25 - [INFO] - (3841766469.py:1) - starting chrome using selenium


In [4]:
def launch_chrome_with_debugging(
    url,
    chrome_path=r"C:\Program Files\Google\Chrome\Application\chrome.exe",
    remote_debugging_port=9222,
    user_data_dir=None,
):
    """Start a Chrome process with the DevTools remote-debugging port open.

    Args:
        url: Page to open in the new browser window.
        chrome_path: Path of the Chrome executable. Defaults to the standard
            Windows install location.
        remote_debugging_port: TCP port Chrome should expose for remote
            debugging.
        user_data_dir: Optional profile directory passed as
            ``--user-data-dir``. Omitted from the command line when ``None``.
            NOTE(review): recent Chrome releases reportedly refuse remote
            debugging on the default profile -- confirm whether a dedicated
            directory is required for the installed version.

    Returns:
        The ``subprocess.Popen`` handle of the launched browser, so the caller
        can poll or terminate it.

    Raises:
        OSError: Propagated from ``subprocess.Popen`` when the executable
            cannot be started (for example, ``chrome_path`` does not exist).
    """
    command = [
        chrome_path,
        f"--remote-debugging-port={remote_debugging_port}",
    ]
    if user_data_dir:
        command.append(f"--user-data-dir={user_data_dir}")
    # The URL goes last so Chrome treats it as the page to open.
    command.append(url)

    # Popen does not wait for the browser; it returns right after spawning it.
    return subprocess.Popen(command)

# Search page opened first: titles with a user rating between 6.1 and 10.
url = "https://www.imdb.com/search/title/?user_rating=6.1,10"
# Launch Chrome with the target URL
launch_chrome_with_debugging(url)

# Wait for a few seconds to ensure Chrome has started
# (fixed delay; nothing here verifies that the debugging port is actually up --
# that check happens when the WebSocket debugger URL is requested).
time.sleep(5)


In [5]:
# Function to get the WebSocket Debugger URL
def get_websocket_debugger_url(url_keyword="imdb", debug_port=9222, timeout=5):
    """Return the WebSocket debugger URL of the first matching Chrome target.

    Queries the ``/json`` endpoint of a Chrome instance started with remote
    debugging on ``localhost`` and scans the listed targets.

    Args:
        url_keyword: Substring that a target's ``url`` must contain.
        debug_port: Remote-debugging port Chrome was started with.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``webSocketDebuggerUrl`` of the first matching target, or ``None``
        when Chrome is unreachable, answers with a non-200 status, returns an
        unreadable body, or lists no matching target.
    """
    try:
        response = requests.get(f"http://localhost:{debug_port}/json", timeout=timeout)
        if response.status_code != 200:
            return None
        targets = response.json()
    except (requests.RequestException, ValueError):
        # Chrome not (yet) listening, or the body was not valid JSON. Report
        # "not found" so the caller can raise its own explanatory error.
        return None

    for target in targets:
        debugger_url = target.get('webSocketDebuggerUrl')
        # Skip targets that do not expose a debugger URL instead of failing on
        # a missing key.
        if debugger_url and url_keyword in target.get('url', ''):
            return debugger_url
    return None

# Check if Chrome is running with remote debugging
# (in the code shown here the returned URL is only tested for truthiness; the
# driver below attaches through the host:port address instead).
websocket_debugger_url = get_websocket_debugger_url()
if not websocket_debugger_url:
    raise Exception("Could not get the WebSocket debugger URL. "
                    "Make sure Chrome is running with remote debugging enabled.")

# Set up Selenium to use the existing Chrome instance
chrome_options = Options()
chrome_options.debugger_address = "localhost:9222"

# Set a custom user agent (optional)
# NOTE(review): the browser was already started by launch_chrome_with_debugging,
# so this command-line argument presumably has no effect when attaching via
# debugger_address -- confirm, and pass it at launch time if it is needed.
user_agent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
              "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
chrome_options.add_argument(f"user-agent={user_agent}")

# Initialize WebDriver
# `driver` is a module-level name passed to movie_index_extractor further down.
driver = webdriver.Chrome(options=chrome_options)

# Extracting movies index - Title and link

In [6]:
# Constants
# Cap on the number of result items to load per rating range, keyed by the
# range's lower bound (looked up in movie_index_extractor.start_extraction,
# which falls back to 2000 for a missing key).
MAX_TITLES_BY_LOWER_BOUND = {
    1: 5000,
    3.1: 5000,
    6.1: 2500
}
# NOTE(review): not referenced anywhere in the code visible in this notebook.
SCROLLING_PERCENTAGE = 0.95
WAIT_TIME = 5  # Time to wait after scrolling or clicking (seconds, passed to time.sleep)

class movie_index_extractor:
    """Builds CSV indexes (title + link) from IMDb title-search result pages.

    For each user-rating range the search page is opened, sorted by number of
    ratings, expanded by repeatedly clicking the "See More" button, and the
    listed titles are written to one CSV file per range.

    All browser interaction goes through the driver passed to the constructor;
    the class does not rely on a module-level ``driver`` variable.
    """

    # Directory the CSV files are written to when no ``output_dir`` is given.
    DEFAULT_OUTPUT_DIR = "C:/Users/Harish Raju/Desktop/projects/IMDB/web_scrapper/data"

    def __init__(self, driver, output_dir=None):
        """Store the driver and configuration.

        Args:
            driver: Selenium WebDriver used for all page interaction.
            output_dir: Directory for the generated CSV files. Defaults to
                ``DEFAULT_OUTPUT_DIR``.
        """
        self.driver = driver
        self.base_url = "https://www.imdb.com/search/title/?user_rating={lower_bound},{upper_bound}"
        self.output_dir = output_dir if output_dir is not None else self.DEFAULT_OUTPUT_DIR

    def sort_by_user_rating_count_desc(self):
        """Select the "USER_RATING_COUNT" sort option and click the sort-order button.

        Presumably one click on the order button switches the list to
        descending order -- confirm against the live page.
        """
        dropdown_label = self.driver.find_element(By.CLASS_NAME, "ipc-simple-select__input")
        dropdown_label.click()
        time.sleep(2)  # Wait for options to appear

        select_element = self.driver.find_element(By.ID, "adv-srch-sort-by")
        select = Select(select_element)
        select.select_by_value("USER_RATING_COUNT")  # Select "Number of Ratings"
        time.sleep(3)

        sort_button = self.driver.find_element(By.ID, 'adv-srch-sort-order')
        sort_button.click()
        time.sleep(3)

    def scroll_to_element(self, element):
        """Scrolls smoothly to the given element."""
        self.driver.execute_script(
            "arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", element
        )
        time.sleep(2)  # Small delay for UI adjustment

    def click_button(self):
        """Find and click the 'See More' button, if available.

        Returns:
            True when the button was clicked, False when it could not be found
            within 10 seconds or any other error occurred.
        """
        try:
            button = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'ipc-see-more__button')]"))
            )
            self.scroll_to_element(button)
            # JavaScript click to avoid interception issues
            self.driver.execute_script("arguments[0].click();", button)
            logger.info("Clicked 'See More' button.")
            time.sleep(WAIT_TIME)  # Allow time for new content to load
        except Exception as e:
            # The exception is interpolated into the message; passing it as an
            # extra positional argument without a placeholder breaks formatting.
            logger.info(f"No more 'See More' button found or an error occurred: {e}")
            return False  # Stop execution if button isn't found
        return True

    def collect_reviews(self):
        """Return every result-list item currently present on the page.

        Despite the name, the items are the title entries of the search
        result list (elements with class ``ipc-metadata-list-summary-item``).
        """
        return self.driver.find_elements(By.CLASS_NAME, "ipc-metadata-list-summary-item")

    def extract_movie_data(self, titles):
        """Extract movie titles and links from collected result elements.

        Args:
            titles: Iterable of result-item elements as returned by
                ``collect_reviews``.

        Returns:
            DataFrame with columns "Title" and "IMDb Link", de-duplicated on
            "Title". Elements that fail to parse are logged and skipped.
        """
        movie_data = []

        for movie_element in tqdm(titles, desc="Extracting titles and links"):
            try:
                raw_title = movie_element.find_element(
                    By.XPATH, ".//h3[contains(@class, 'ipc-title__text')]"
                ).text
                movie_title = re.sub(r"^\d+\.\s*", "", raw_title)  # Remove ranking numbers
                movie_link = movie_element.find_element(By.TAG_NAME, 'a').get_attribute('href')
                movie_data.append((movie_title, movie_link))
            except Exception as e:
                logger.error(f"Error extracting movie data: {e}")

        df_movies = pd.DataFrame(movie_data, columns=["Title", "IMDb Link"])
        # NOTE(review): de-duplicating on title alone also drops distinct
        # titles that share a name -- confirm this is intended.
        return df_movies.drop_duplicates(subset=['Title'])

    def save_to_csv(self, df_movies, lower_bound, upper_bound):
        """Save extracted movie data to ``Movies_index_rating_<lower>_<upper>.csv``.

        The file is written into ``self.output_dir``, which is created when it
        does not exist yet.
        """
        os.makedirs(self.output_dir, exist_ok=True)
        file_path = os.path.join(
            self.output_dir, f"Movies_index_rating_{lower_bound}_{upper_bound}.csv"
        )
        df_movies.to_csv(file_path, index=False)
        logger.info(f"Saved {len(df_movies)} movies to {file_path}")

    def start_extraction(self):
        """Run the extraction for every rating range and write one CSV per range."""
        bounds = [
            (1, 3),
            (3.1, 6),
            (6.1, 10)
        ]

        for lower_bound, upper_bound in bounds:
            logger.info(f"Starting {lower_bound} to {upper_bound}")
            url = self.base_url.format(lower_bound=lower_bound, upper_bound=upper_bound)
            self.driver.get(url)
            self.sort_by_user_rating_count_desc()
            max_titles = MAX_TITLES_BY_LOWER_BOUND.get(lower_bound, 2000)  # Default to 2000 if not found
            logger.info(f"Extracting up to {max_titles} reviews from URL: {url}")

            # Start from what is already on the page, so a range with no
            # "See More" button still yields its first results and `titles`
            # is never unbound or left over from the previous range.
            titles = self.collect_reviews()
            while len(titles) < max_titles:
                if not self.click_button():
                    logger.warning("No 'See More' button found, stopping extraction for this range.")
                    break

                titles = self.collect_reviews()
                logger.info(f"Total Reviews Collected: {len(titles)}")

            if len(titles) >= max_titles:
                logger.info("Reached max limit, stopping extraction.")

            logger.info('extracting movie data')
            df_movies = self.extract_movie_data(titles)

            self.save_to_csv(df_movies, lower_bound, upper_bound)


In [7]:
# Build the extractor around the attached Chrome driver and run the extraction
# over all rating ranges (start_extraction saves one CSV per range).
scraper = movie_index_extractor(driver)
scraper.start_extraction()

2025-03-12 15:53:34 - [INFO] - (537093880.py:85) - Starting 1 to 3
2025-03-12 15:53:47 - [INFO] - (537093880.py:90) - Extracting up to 5000 reviews from URL: https://www.imdb.com/search/title/?user_rating=1,3
2025-03-12 15:53:49 - [INFO] - (537093880.py:42) - Clicked 'See More' button.
2025-03-12 15:53:55 - [INFO] - (537093880.py:98) - Total Reviews Collected: 99
2025-03-12 15:53:57 - [INFO] - (537093880.py:42) - Clicked 'See More' button.
2025-03-12 15:54:02 - [INFO] - (537093880.py:98) - Total Reviews Collected: 149
2025-03-12 15:54:04 - [INFO] - (537093880.py:42) - Clicked 'See More' button.
2025-03-12 15:54:09 - [INFO] - (537093880.py:98) - Total Reviews Collected: 199
2025-03-12 15:54:11 - [INFO] - (537093880.py:42) - Clicked 'See More' button.
2025-03-12 15:54:16 - [INFO] - (537093880.py:98) - Total Reviews Collected: 249
2025-03-12 15:54:18 - [INFO] - (537093880.py:42) - Clicked 'See More' button.
2025-03-12 15:54:23 - [INFO] - (537093880.py:98) - Total Reviews Collected: 299
20

In [None]:

    # def start_extraction(self):
    #     bounds = [
    #         (1,3),
    #         (3.1,6),
    #         (6,10)
    #     ]
    #     for lower_bound,upper_bound in bounds:
    #         url = self.base_url.format(lower_bound = lower_bound,upper_bound = upper_bound)
    #         self.driver.get(url)
    #         max_titles = MAX_TITLES_BY_LOWER_BOUND[lower_bound]
    #         print(f"extracting {max_titles}reviews from url")
    #         print("Attempting to click 'See More' button...")
    #         if not self.click_button():  # If button is not found, exit loop
    #             break
    #         titles = self.collect_reviews()
    #         print(f"Total Reviews Collected: {len(titles)}")
            
    #         if len(titles) >= max_titles:
    #             break

    #         movie_data = []

    #         # Iterate through each extracted 'li' element containing movie details
    #         for movie_element in tqdm(titles,desc = 'Extracting titles and link'):
    #             # Extract the movie title from the 'h3' tag inside the element
    #             raw_title = movie_element.find_element(By.XPATH, ".//h3[contains(@class, 'ipc-title__text')]").text
    #             # print(f"Extracting: {raw_title}")

    #             # Remove leading numbers (e.g., "1. The Shawshank Redemption" → "The Shawshank Redemption")
    #             movie_title = re.sub(r"^\d+\.\s*", "", raw_title)

    #             # Extract the hyperlink to the movie's IMDb page
    #             movie_link = movie_element.find_element(By.TAG_NAME, 'a').get_attribute('href')

    #             # Append data as a tuple (no risk of overwriting)
    #             movie_data.append((movie_title, movie_link))

    #         # Convert list to DataFrame
    #         df_movies = pd.DataFrame(movie_data, columns=["Title", "IMDb Link"])

    #         # Display the number of rows
    #         print(f"Total movies extracted: {len(df_movies)}")
            
    #         df_movies = df_movies.drop_duplicates(subset=['Title']).reset_index(drop=True)
            
    #         df_movies.to_csv(f'../data/Movies_index_rating_{lower_bound}_{upper_bound}.csv',index=False)
                    
        
        
    

In [None]:
# driver.get(url)

# time.sleep(3)

In [None]:
# import time
# import logger
# import re
# import pandas as pd
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait, Select
# from selenium.webdriver.support import expected_conditions as EC
# from tqdm import tqdm

# # Configure logger
# logger.basicConfig(level=logger.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# # IMDb URL template (modify this as needed)
# BASE_URL = "https://www.imdb.com/search/title/?user_rating={lower_bound},{upper_bound}&sort=num_votes,desc"

# # Define rating categories
# RATING_CATEGORIES = {
#     "Low": (1, 3),
#     "Medium": (3.1, 6),
#     "High": (6.1, 10)
# }


# # Constants
# MAX_TITLES = 2000  # Limit on movie titles
# WAIT_TIME = 5       # Wait time for loading content

# def scroll_to_element(element):
#     """Scrolls smoothly to the given element."""
#     driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", element)
#     time.sleep(2)  # Small delay for UI adjustment

# def click_see_more():
#     """Clicks the 'See More' button to load more results."""
#     try:
#         button = WebDriverWait(driver, 10).until(
#             EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'ipc-see-more__button')]")
#         ))
#         scroll_to_element(button)
#         driver.execute_script("arguments[0].click();", button)
#         time.sleep(WAIT_TIME)  # Allow new content to load
#         return True
#     except Exception as e:
#         logger.warning("No more 'See More' button found or an error occurred: %s", e)
#         return False

# def collect_reviews():
#     """Collects the review elements from the page."""
#     return driver.find_elements(By.CLASS_NAME, "ipc-metadata-list-summary-item")

# def extract_movie_data(titles, rating_category):
#     """Extracts movie titles and links from the collected elements."""
#     movie_data = []
#     for movie_element in tqdm(titles, desc=f'Extracting movies ({rating_category})'):
#         try:
#             raw_title = movie_element.find_element(By.XPATH, ".//h3[contains(@class, 'ipc-title__text')]").text
#             movie_title = re.sub(r"^\d+\.\s*", "", raw_title)  # Remove ranking numbers
#             movie_link = movie_element.find_element(By.TAG_NAME, 'a').get_attribute('href')
#             movie_data.append((movie_title, movie_link, rating_category))
#         except Exception as e:
#             logger.error("Error extracting data: %s", e)
#     return pd.DataFrame(movie_data, columns=["Title", "IMDb Link", "Rating Category"])

# def scrape_movies_by_rating(lower_bound, upper_bound, category_name):
#     """Scrapes movies for a given rating range and category."""
#     url = BASE_URL.format(lower_bound=lower_bound, upper_bound=upper_bound)
#     driver.get(url)
#     time.sleep(WAIT_TIME)  # Allow initial page load

#     titles = []
#     while True:
#         if not click_see_more():  # Stop if no more pages
#             break
#         titles.extend(collect_reviews())
#         if len(titles) >= MAX_TITLES:
#             break

#     df_movies = extract_movie_data(titles, category_name)
#     df_movies.drop_duplicates(subset=['Title'], inplace=True)
#     return df_movies

# # Scrape movies for each rating category
# all_dataframes = []
# for category, (lower_bound, upper_bound) in RATING_CATEGORIES.items():
#     logger.info("Scraping movies for rating category: %s", category)
#     df = scrape_movies_by_rating(lower_bound, upper_bound, category)
#     all_dataframes.append(df)

# # Combine all categories into one DataFrame
# final_df = pd.concat(all_dataframes, ignore_index=True)
# output_file = r'C:\Users\Harish Raju\Desktop\projects\IMDB\web_scrapper\data\Movies_by_rating.csv'
# final_df.to_csv(output_file, index=False)

# logger.info("Scraping completed. Data saved to: %s", output_file)

# driver.quit()


In [None]:
# import time
# import logger
# import re
# import pandas as pd
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait, Select
# from selenium.webdriver.support import expected_conditions as EC
# from tqdm import tqdm

# # Configure logger
# logger.basicConfig(level=logger.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# # IMDb URL template (modify this as needed)
# BASE_URL = "https://www.imdb.com/search/title/?user_rating={lower_bound},{upper_bound}&sort=num_votes,desc"

# # Define rating categories
# RATING_CATEGORIES = {
#     "Low": (1, 3),
#     "Medium": (3.1, 6),
#     "High": (6.1, 10)
# }

# # Selenium WebDriver setup
# # options = webdriver.ChromeOptions()
# # options.add_argument("--headless")  # Run in headless mode

# # driver = webdriver.Chrome(options=options)

# # Constants
# MAX_TITLES = 2000  # Limit on movie titles
# WAIT_TIME = 5       # Wait time for loading content

# def scroll_to_element(element):
#     """Scrolls smoothly to the given element."""
#     logger.info("Scrolling to element...")
#     driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", element)
#     time.sleep(2)  # Small delay for UI adjustment

# def click_see_more():
#     """Clicks the 'See More' button to load more results."""
#     try:
#         logger.info("Attempting to click 'See More' button...")
#         button = WebDriverWait(driver, 10).until(
#             EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'ipc-see-more__button')]")
#         ))
#         scroll_to_element(button)
#         driver.execute_script("arguments[0].click();", button)
#         logger.info("Successfully clicked 'See More' button.")
#         time.sleep(WAIT_TIME)  # Allow new content to load
#         return True
#     except Exception as e:
#         logger.warning("No more 'See More' button found or an error occurred: %s", e)
#         return False

# def collect_reviews():
#     """Collects the review elements from the page."""
#     logger.info("Collecting review elements from the page...")
#     return driver.find_elements(By.CLASS_NAME, "ipc-metadata-list-summary-item")

# def extract_movie_data(titles, rating_category):
#     """Extracts movie titles and links from the collected elements."""
#     logger.info("Extracting movie data for category: %s", rating_category)
#     movie_data = []
#     for movie_element in tqdm(titles, desc=f'Extracting movies ({rating_category})'):
#         try:
#             raw_title = movie_element.find_element(By.XPATH, ".//h3[contains(@class, 'ipc-title__text')]").text
#             movie_title = re.sub(r"^\d+\.\s*", "", raw_title)  # Remove ranking numbers
#             movie_link = movie_element.find_element(By.TAG_NAME, 'a').get_attribute('href')
#             movie_data.append((movie_title, movie_link, rating_category))
#         except Exception as e:
#             logger.error("Error extracting data: %s", e)
#     logger.info("Extracted %d movies for category: %s", len(movie_data), rating_category)
#     return pd.DataFrame(movie_data, columns=["Title", "IMDb Link", "Rating Category"])

# def scrape_movies_by_rating(lower_bound, upper_bound, category_name):
#     """Scrapes movies for a given rating range and category."""
#     url = BASE_URL.format(lower_bound=lower_bound, upper_bound=upper_bound)
#     logger.info("Navigating to URL: %s", url)
#     driver.get(url)
#     time.sleep(WAIT_TIME)  # Allow initial page load

#     titles = []
#     while True:
#         if not click_see_more():  # Stop if no more pages
#             break
#         new_titles = collect_reviews()
#         titles.extend(new_titles)
#         logger.info("Total movies collected so far: %d", len(titles))
#         if len(titles) >= MAX_TITLES:
#             logger.info("Reached maximum limit of %d movies.", MAX_TITLES)
#             break

#     df_movies = extract_movie_data(titles, category_name)
#     df_movies.drop_duplicates(subset=['Title'], inplace=True)
#     logger.info("Final count of unique movies in category %s: %d", category_name, len(df_movies))
#     return df_movies

# # Scrape movies for each rating category
# all_dataframes = []
# for category, (lower_bound, upper_bound) in RATING_CATEGORIES.items():
#     logger.info("Starting scraping for rating category: %s", category)
#     df = scrape_movies_by_rating(lower_bound, upper_bound, category)
#     all_dataframes.append(df)

# # Combine all categories into one DataFrame
# final_df = pd.concat(all_dataframes, ignore_index=True)
# output_file = r'C:\Users\Harish Raju\Desktop\projects\IMDB\web_scrapper\data\Movies_by_rating.csv'
# final_df.to_csv(output_file, index=False)

# logger.info("Scraping completed. Total movies extracted: %d", len(final_df))
# logger.info("Data saved to: %s", output_file)

# driver.quit()


In [None]:
# final_df['Rating Category'].value_counts()