In [1]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Web scraping and API
import requests
import imdb

# Concurrent processing
import threading
from queue import Queue
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures

# Utilities
import random
import time

In [2]:

# Read the merged movie/book table from disk
df = pd.read_csv('data/dataset_final.csv')

# Summarise the load: dimensions, a five-row preview, then per-column dtypes and null counts
print("Dataset Shape:", df.shape)
print("\nFirst few rows of the dataset:")
display(df.head(5))
print("\nDataset Info:")
df.info()


Dataset Shape: (76550, 46)

First few rows of the dataset:


Unnamed: 0.1,Unnamed: 0,movie_title,movie_release,movie_revenue,movie_runtime,movie_languages,movie_countries,movie_genres,movie_wikidata_id,imdb_rating,...,book_won_price,book_rating,book_publisher,book_ratings_count,book_pages,movie_is_adaptation,time_gap,revenue_budget_ratio,movie_revenue_log,movie_budget_log
0,0,Ghosts of Mars,2001.0,1028672.0,98.0,['English'],['United States of America'],"['Thriller', 'Science Fiction', 'Horror', 'Adv...",Q261700,4.9,...,,,,,260.0,False,,0.500387,6.012277,6.312971
1,1,Getting Away with Murder: The JonBenét Ramsey ...,2000.0,,95.0,['English'],['United States of America'],"['Mystery', 'Biographical film', 'Drama', 'Cri...",Q16250726,6.1,...,,,,,260.0,False,,,0.0,0.0
2,2,Brun bitter,1988.0,,83.0,['Norwegian'],['Norway'],"['Crime Fiction', 'Drama']",Q4978832,5.6,...,,,,,260.0,False,,,0.0,0.0
3,3,White Of The Eye,1987.0,,110.0,['English'],['United Kingdom'],"['Thriller', 'Erotic thriller', 'Psychological...",Q7995657,6.1,...,,,,,260.0,False,,,0.0,0.0
4,4,A Woman in Flames,1983.0,,106.0,['German'],['Germany'],['Drama'],Q869644,5.9,...,,,,,260.0,False,,,0.0,0.0



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76550 entries, 0 to 76549
Data columns (total 46 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            76550 non-null  int64  
 1   movie_title           76550 non-null  object 
 2   movie_release         70771 non-null  float64
 3   movie_revenue         10568 non-null  float64
 4   movie_runtime         76550 non-null  float64
 5   movie_languages       76550 non-null  object 
 6   movie_countries       76550 non-null  object 
 7   movie_genres          76550 non-null  object 
 8   movie_wikidata_id     76549 non-null  object 
 9   imdb_rating           63109 non-null  float64
 10  imdb_total_votes      63109 non-null  float64
 11  movie_budget          14798 non-null  float64
 12  book_wikidata_id      4904 non-null   object 
 13  book_title            4904 non-null   object 
 14  book_author           4811 non-null   object 
 15  book

In [199]:
# Keep only the rows flagged as book adaptations
df_adaptations = df.loc[df['movie_is_adaptation'].eq(True)]

# Report the size of the subset and preview its leading rows
print("\nAdaptations Dataset Shape:", df_adaptations.shape)
print("\nFirst few rows of the adaptations dataset:")
display(df_adaptations.head(5))



Adaptations Dataset Shape: (4904, 46)

First few rows of the adaptations dataset:


Unnamed: 0.1,Unnamed: 0,movie_title,movie_release,movie_revenue,movie_runtime,movie_languages,movie_countries,movie_genres,movie_wikidata_id,imdb_rating,imdb_total_votes,movie_budget,book_wikidata_id,book_title,book_author,book_release,book_country,book_part_of_series,literary_work,written_work,comic_book_seris,book_series,manga_series,book_fiction,book_non_fiction,book_children,book_historical,book_drama,book_anime,book_fantasy,book_science_fiction,book_horror,book_thriller,book_detective,book_satire,book_comedy,book_won_price,book_rating,book_publisher,book_ratings_count,book_pages,movie_is_adaptation,time_gap,revenue_budget_ratio,movie_revenue_log,movie_budget_log
10,10,Lady Snowblood 2: Love Song of Vengeance,1974.0,,89.0,['Japanese'],['Japan'],"['Crime Fiction', 'Thriller', 'Japanese Movies...",Q840296,6.3,4356.0,,Q16931816,Lady Snowblood,,,Japan,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,260.0,True,,,0.0,0.0
40,40,Mysterious Island,1982.0,,100.0,['Standard Mandarin'],['Hong Kong'],"['Action/Adventure', 'Wuxia', 'Martial Arts Fi...",Q7719877,5.4,211.0,,Q1187628,The Return of the Condor Heroes,Jin Yong,1959.0,Hong Kong,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,260.0,True,23.0,,0.0,0.0
41,41,Woman Hungry,1930.0,,65.0,['English'],['United States of America'],"['Musical', 'Western']",Q3569754,,,,Q79188100,The Great Divide,William Vaughn Moody,,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,260.0,True,,,0.0,0.0
61,61,Juarez,1939.0,,125.0,"['English', 'Spanish']",['United States of America'],"['Costume drama', 'Biographical film', 'Histor...",Q1710735,6.9,2586.0,,Q100975751,Juarez and Maximilian,Franz Werfel,1925.0,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,260.0,True,14.0,,0.0,0.0
78,78,The Tango Player,1991.0,,96.0,['German'],['Germany'],['Drama'],Q7768095,6.2,46.0,,Q1197316,The tango player,Christoph Hein,,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.48,Farrar Straus Giroux,121.0,219.0,True,,,0.0,0.0


In [204]:
def _imdb_error_status(exc):
    """Return an HTTP-style status code carried by ``exc``, or None if it has none."""
    status = getattr(exc, 'code', None)
    if status is not None:
        return status
    # NOTE(review): the IMDb client appears to wrap lower-level failures in a dict
    # payload that keeps the underlying error under 'original exception' --
    # confirm against the installed imdb package version.
    payload = exc.args[0] if exc.args else None
    if isinstance(payload, dict):
        return getattr(payload.get('original exception'), 'code', None)
    return None


def fetch_movie_info(movie_title, retry_count=3, base_delay=2):
    """
    Fetch information for a single movie with exponential backoff retry.

    Searches IMDb for ``movie_title`` and reads the details of the FIRST search
    hit, so the returned record may describe a different film with a similar title.

    Args:
        movie_title: Title used as the search query; echoed back as 'title'.
        retry_count: Maximum number of attempts. Only errors identified as
            HTTP 403 are retried; any other error ends the lookup immediately.
        base_delay: Base of the pre-attempt pause, in seconds. Attempt ``k``
            (0-based) sleeps ``base_delay * 2**k`` plus up to one second of jitter.

    Returns:
        A dict with keys 'title', 'year', 'rating', 'votes', 'genres'
        (comma-joined string), 'plot' (first plot entry) and 'imdb_id'
        ("tt" + IMDb movie id), or None when nothing was found, an error
        occurred, or ``retry_count`` is not positive.
    """
    for attempt in range(retry_count):
        try:
            # Sleep before every attempt (including the first) so concurrent
            # workers do not hit the site in lockstep.
            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
            time.sleep(delay)

            ia = imdb.IMDb()
            ia.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
            movies = ia.search_movie(movie_title)

            if not movies:
                print(f"No results found for: {movie_title}")
                return None

            movie_id = movies[0].movieID
            movie = ia.get_movie(movie_id)

            genres = movie.get('genres')
            plot = movie.get('plot')
            result = {
                'title': movie_title,
                'year': movie.get('year', None),
                'rating': movie.get('rating', None),
                'votes': movie.get('votes', None),
                'genres': ','.join(genres) if genres else None,
                'plot': plot[0] if plot else None,
                'imdb_id': f"tt{movie_id}"
            }
            print(f"Successfully processed: {movie_title}")
            return result

        except imdb.IMDbError as e:
            # The exception is not guaranteed to expose `.code`; reading it
            # directly would raise AttributeError from inside this handler and
            # escape the function, so the status is extracted defensively.
            rate_limited = _imdb_error_status(e) == 403
            if rate_limited and attempt < retry_count - 1:
                print(f"Rate limit hit for {movie_title}, retrying... (attempt {attempt + 1})")
                continue
            if rate_limited:
                print(f"Max retries reached for {movie_title}")
            print(f"HTTP Error processing {movie_title}: {str(e)}")
            return None

        except Exception as e:
            # Best effort: one bad title must not take down the whole batch.
            print(f"Error processing {movie_title}: {str(e)}")
            return None

    # Reached only when retry_count <= 0 (the loop body never ran).
    return None

def fetch_movies_parallel(movie_titles, max_workers=10, batch_size=20, batch_delay=5):
    """
    Fetch movie data in batches using a thread pool.

    Titles are processed ``batch_size`` at a time; each batch is submitted to
    the pool, fully drained, and followed by a pause before the next one starts.

    Args:
        movie_titles: Iterable of titles to look up with ``fetch_movie_info``.
        max_workers: Number of worker threads in the pool.
        batch_size: How many titles are submitted per batch (must be >= 1).
        batch_delay: Seconds to pause between consecutive batches. No pause
            is made after the final batch.

    Returns:
        List of the non-empty records returned by ``fetch_movie_info``, in
        completion order (not input order).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    titles = list(movie_titles)
    results = []

    print(f"Starting parallel fetch with {max_workers} threads...")

    # Split movies into smaller batches
    batches = [titles[start:start + batch_size] for start in range(0, len(titles), batch_size)]
    last_index = len(batches) - 1

    # One pool serves every batch; batches are still drained one at a time.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for index, batch in enumerate(batches):
            print(f"\nProcessing batch of {len(batch)} movies...")

            future_to_movie = {executor.submit(fetch_movie_info, title): title
                               for title in batch}

            # Collect results as workers finish; a failing worker is logged and skipped
            for future in concurrent.futures.as_completed(future_to_movie):
                movie_title = future_to_movie[future]
                try:
                    result = future.result()
                    if result:
                        results.append(result)
                except Exception as e:
                    print(f"Error processing {movie_title}: {str(e)}")

            # Pause between batches only -- sleeping after the last one is wasted time
            if batch_delay > 0 and index < last_index:
                time.sleep(batch_delay)

    return results

# Main execution
def main():
    """
    Scrape IMDb details for every adaptation in the global ``df``.

    Selects the titles of rows whose 'movie_is_adaptation' equals True, fetches
    them with ``fetch_movies_parallel`` (10 workers), writes the records to
    'movie_details.csv' and prints a short timing/coverage summary.

    Returns:
        DataFrame of fetched records, or None when nothing was retrieved (in
        which case no file is written).
    """
    # Titles of the adaptation rows only
    is_adaptation = df['movie_is_adaptation'] == True
    movie_titles = df.loc[is_adaptation, 'movie_title'].tolist()

    print(f"Starting to process {len(movie_titles)} movies...")
    start_time = time.time()

    results = fetch_movies_parallel(movie_titles, max_workers=10)

    # Nothing came back: report and bail out before touching the filesystem
    if not results:
        print("No results were obtained")
        return None

    results_df = pd.DataFrame(results)

    output_file = 'movie_details.csv'
    results_df.to_csv(output_file, index=False)

    elapsed = time.time() - start_time
    print(f"\nProcessing completed in {elapsed:.2f} seconds")
    print(f"Retrieved data for {len(results)} out of {len(movie_titles)} movies")
    print(f"Results saved to {output_file}")

    return results_df

# Run the script with imports
# Entry point of this cell: bind the late imports, then start the IMDb fetch.
if __name__ == '__main__':
    # `random` is used by fetch_movie_info (retry jitter) as a module-level name;
    # importing it here guarantees it is bound before main() starts the workers.
    import random
    # HTTPError is imported but not referenced anywhere in the visible cells.
    from urllib.error import HTTPError
    
    # Keep the returned frame in the notebook namespace (None when nothing was fetched).
    results_df = main()

Starting to process 4904 movies...
Starting parallel fetch with 10 threads...

Processing batch of 20 movies...
Successfully processed: Woman Hungry
Successfully processed: Straw Dogs
Successfully processed: Carmen
Successfully processed: The Tango Player
Successfully processed: Juarez
Successfully processed: Good Morning Miss Dove
Successfully processed: The Great Santini
Successfully processed: Lady Snowblood 2: Love Song of Vengeance
Successfully processed: Mysterious Island
Successfully processed: The Thirteen Chairs
Successfully processed: The Snow Queen
Successfully processed: Pedro Páramo
Successfully processed: Ken Russell's Fall of the Louse of Usher
Successfully processed: Lord of the Flies
Successfully processed: If They Tell You I Fell
Successfully processed: Midnight Express
Successfully processed: On the Beach
Successfully processed: The Dark Half
Successfully processed: Chaplin
Successfully processed: The Forbidden Kingdom

Processing batch of 20 movies...
Successfully p

In [205]:
# Reload the records written by the IMDb fetch and report (rows, columns)
results_imdbpy = pd.read_csv('movie_details.csv')
print(results_imdbpy.shape)

(4897, 7)


In [206]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from ratelimit import limits, sleep_and_retry

def _first_amount(element):
    """Return the first run of digits/commas in ``element``'s text as an int, or None."""
    if element is None:
        return None
    match = re.search(r'\$?([\d,]+)', element.text.strip())
    if not match:
        return None
    digits = match.group(1).replace(',', '')
    # A match made only of commas leaves nothing to convert.
    return int(digits) if digits else None


# Rate limit: 5 calls per second
@sleep_and_retry
@limits(calls=5, period=1)
def get_movie_budget_from_imdb_html(imdb_id, timeout=10):
    """
    Extract budget and worldwide gross from an IMDb title page using BeautifulSoup.

    Args:
        imdb_id: IMDb title identifier inserted into the page URL
            (e.g. "tt0100742").
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot block a worker thread indefinitely.

    Returns:
        Dict with keys 'budget' and 'boxoffice' (each an int or None when the
        page does not show the figure), or None if the request or parsing failed.

    Note:
        Only the digits are parsed; the currency marker in the page text is
        ignored, so the value is in whatever currency the page displays.
    """
    url = f"https://www.imdb.com/title/{imdb_id}/"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'lxml')  # Using lxml parser for better performance

        budget_element = soup.select_one('li[data-testid="title-boxoffice-budget"] span.ipc-metadata-list-item__list-content-item')
        boxoffice_element = soup.select_one('li[data-testid="title-boxoffice-cumulativeworldwidegross"] span.ipc-metadata-list-item__list-content-item')

        return {
            'budget': _first_amount(budget_element),
            'boxoffice': _first_amount(boxoffice_element),
        }

    except Exception as e:
        # Best effort: report the failure and let the caller treat it as "no data".
        print(f"Error fetching budget for {imdb_id}: {str(e)}")
        return None

def process_movie(row):
    """
    Process a single movie row: scrape its IMDb figures and build a comparison record.

    Args:
        row: Mapping-like row (e.g. a pandas Series) that provides 'imdb_id'
            and 'title'. 'movie_budget' and 'movie_release' are read with
            ``.get`` and come out as None when the row does not carry them.

    Returns:
        Dict with keys 'title', 'imdb_id', 'existing_budget', 'scraped_budget',
        'scraped_boxoffice' and 'release_year', or None when the row has no
        IMDb id. If the scrape itself fails, the record is still returned with
        both scraped fields set to None.
    """
    imdb_id = row['imdb_id']

    if pd.isna(imdb_id):
        print(f"No IMDb ID for movie: {row['title']}")
        return None

    # The scraper returns None on request/parse failure; subscripting that
    # would raise TypeError inside the worker, so fall back to an empty mapping.
    scraped = get_movie_budget_from_imdb_html(imdb_id) or {}

    return {
        'title': row.get('title'),
        'imdb_id': imdb_id,
        'existing_budget': row.get('movie_budget'),
        'scraped_budget': scraped.get('budget'),
        'scraped_boxoffice': scraped.get('boxoffice'),
        'release_year': row.get('movie_release')
    }

def batch_process_movies_parallel(df, num_movies=10, max_workers=10):
    """
    Process movies in parallel using ThreadPoolExecutor.

    Args:
        df: DataFrame whose rows are passed one by one to ``process_movie``.
        num_movies: Accepted for backward compatibility but NOT applied --
            every row of ``df`` is processed regardless of this value.
        max_workers: Number of worker threads in the pool.

    Returns:
        DataFrame with one row per record returned by ``process_movie``
        (completion order). When it is non-empty a 'budget_difference' column
        (existing minus scraped budget) is added. Rows whose worker raised or
        returned nothing are left out.
    """
    movies_subset = df
    results = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Create future tasks
        future_to_row = {
            executor.submit(process_movie, row): row
            for _, row in movies_subset.iterrows()
        }

        # Process completed tasks with progress bar
        for future in tqdm(as_completed(future_to_row), total=len(movies_subset)):
            try:
                result = future.result()
            except Exception as e:
                # One failing row must not abort the run and discard everything
                # collected so far.
                failed_row = future_to_row[future]
                print(f"Error processing {failed_row.get('title')}: {str(e)}")
                continue
            if result:
                results.append(result)

    # Convert to DataFrame
    results_df = pd.DataFrame(results)

    if not results_df.empty:
        # Coerce to numeric first: columns holding only None are object dtype.
        existing = pd.to_numeric(results_df['existing_budget'], errors='coerce')
        scraped = pd.to_numeric(results_df['scraped_budget'], errors='coerce')
        results_df['budget_difference'] = existing - scraped

    return results_df

def analyze_results(results_df, diff_threshold=1000000):
    """
    Analyze and print detailed results.

    Prints the full results table, summary counts, the movies without a
    scraped budget, and the movies whose budget difference exceeds
    ``diff_threshold`` in absolute value.

    Args:
        results_df: Frame with 'title', 'imdb_id', 'existing_budget',
            'scraped_budget' and 'budget_difference' columns. None or an
            empty frame is reported and skipped.
        diff_threshold: Absolute budget difference above which a movie is
            listed as a significant discrepancy.

    Returns:
        None. All output goes to stdout.
    """
    # An empty frame has no columns, so the lookups below would raise KeyError.
    if results_df is None or results_df.empty:
        print("\nNo results to analyze")
        return

    print("\nResults:")
    print(results_df.to_string())

    # Coerce once: the difference column can be object dtype when budgets are missing.
    difference = pd.to_numeric(results_df['budget_difference'], errors='coerce')
    has_scraped_budget = results_df['scraped_budget'].notna()

    print("\nSummary:")
    print(f"Total movies processed: {len(results_df)}")
    print(f"Movies with scraped budget: {has_scraped_budget.sum()}")
    print(f"Movies with matching budgets: {(difference == 0).sum()}")

    # Check for missing budgets
    missing_budgets = results_df[~has_scraped_budget]
    if not missing_budgets.empty:
        print("\nMovies with missing IMDb budgets:")
        print(missing_budgets[['title', 'imdb_id']].to_string())

    # Check for large discrepancies
    significant_diff = results_df[difference.abs() > diff_threshold]
    if not significant_diff.empty:
        print("\nMovies with significant budget differences:")
        print(significant_diff[['title', 'existing_budget', 'scraped_budget', 'budget_difference']].to_string())

# Entry point of this cell: scrape IMDb pages for every fetched movie, report, and persist.
if __name__ == "__main__":
    # Install required packages if needed
    # !pip install ratelimit lxml
    
    # Run the parallel batch process over the frame reloaded from movie_details.csv.
    # NOTE(review): batch_process_movies_parallel, as defined in this notebook, does not
    # apply num_movies -- all rows of results_imdbpy are processed, not just 10.
    results = batch_process_movies_parallel(
        results_imdbpy, 
        num_movies=10,
        max_workers=13  # Adjust based on your needs
    )
    
    # Analyze results (prints the full table and summary to stdout)
    analyze_results(results)
    
    # Save results to CSV (without the index column)
    results.to_csv('data/imdb_budget_comparison.csv', index=False)

100%|██████████| 4897/4897 [16:33<00:00,  4.93it/s]


Results:
                                                                                               title     imdb_id existing_budget  scraped_budget  scraped_boxoffice release_year budget_difference
0                                                                                   The Tango Player   tt0100742            None             NaN                NaN         None               NaN
1                                                                                       Woman Hungry   tt0471871            None             NaN                NaN         None               NaN
2                                                                                             Juarez   tt0031516            None             NaN                NaN         None               NaN
3                                                                                             Carmen   tt6875952            None             NaN       3.839960e+05         None               NaN
4              


