In [1]:
from imdb import IMDb
from imdb._exceptions import IMDbDataAccessError
import csv
import time
import threading
import os
from urllib.error import HTTPError

In [2]:
ia = IMDb()

In [3]:
def fetch_details(movie, id):
        #print(movie.keys())
        if 'title' in movie.keys():
            movie_name = movie.get('title', 'N/A')
            cover_photo = movie.get('cover url', 'N/A')
            source_link = f"https://www.imdb.com/title/tt{id:07d}/"
            rating = movie.get('rating', 'N/A')
            trailer_link = f"https://www.imdb.com/title/tt{id:07d}/videogallery"
            categories = ', '.join(movie.get('genres', []))
            return [id, movie_name, cover_photo, source_link, rating, trailer_link, categories]

In [4]:
def write_data(movie_details, http_error, file_name):
    with open(file_name, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Id', 'Movie Name', 'Cover Photo', 'Source Link', 'Rating', 'Trailer Link', 'Categories'])
        for details in movie_details:
            writer.writerow(details)
    
    with open('http_error.csv', 'a+') as file:
        writer = csv.writer(file)
        for id in http_error:
            writer.writerow(id)

In [5]:
def fetch_movie_details(start_id, end_id):
    movies = []
    http_error = []
    for id in range (start_id, end_id+1):     #making end_id inclusive
        try:
            movie = ia.get_movie(id)
            movies.append(fetch_details(movie, id))
            print(f"ID:-{id}")
        except IMDbDataAccessError as e:
            print(e)
            movies.append([id, '', '', '', '', '', ''])
            http_error.append([id,])
            print(f"Error with ID:-{id}")
        except HTTPError as e:
            http_error.append([id,])
            print('HTTPError Captured!')

    print(f"From:- {start_id}, {end_id} len is:-{len(movies)}")
    write_data(movies, http_error, f'imdb_data({start_id}-{end_id}).csv')

In [6]:
def fetch_parallel(start_id, end_id, batch_size):

    threads = []

    # batch_size = 100
    # start_id = 24555
    # end_id = 25555 
    
    current_id = start_id

    while current_id <= end_id:
        batch_end_id = min(current_id + batch_size - 1, end_id)
        print(f"Processing batch: {current_id} to {batch_end_id}")
        thread = threading.Thread(target=fetch_movie_details, args=(current_id, batch_end_id))
        threads.append(thread)
        thread.start()
        #print(f"Doing Batch:-{start_id}, {end_id}")

        current_id += batch_size
        #print(f"after Doing Batch:-{start_id}, {end_id}")
    
    for thread in threads:
        thread.join()

In [None]:
batch_size = 100
start_id = 45556       #done
end_id = 54109          #done

# for i in range(0, 2):
    
#     if i == 0:
#         start_id = 34556        #not done
#         end_id = 44555          #not done
#         print(f"Starting for Batch{start_id} to {end_id}")
#     elif i == 1:
#         start_id = 44556        #not done
#         end_id = 54109          #not done
#         print(f"Starting for Batch{start_id} to {end_id}")

fetch_parallel(start_id, end_id, batch_size)        #making end_id Inclusive

#time.sleep(600)
#print("Cooldown Period of 300 Seconds!")

# Merging CSV files

In [32]:
file_list = os.listdir()
output_file = 'MergeIMDb data(24555-54109).csv'

def merge_csv(file_list, output_file):

    mergedData = []

    for file in file_list:
        if file.endswith('.csv'):
            with open(file, 'r') as file:
                data = csv.reader(file)
                for row in data:
                    if 'Id' not in row:
                        #print(i)
                        mergedData.append(row)
    
    print(f"Lenght:-{len(mergedData)}")

    with open(output_file, 'w') as file:
        writer = csv.writer(file)
        writer.writerow(['Id', 'Movie Name', 'Cover Photo', 'Source Link', 'Rating', 'Trailer Link', 'Categories'])
        writer.writerows(mergedData)
        print("Sucessfully Created and Saved file!")
           
merge_csv(file_list, output_file)

Lenght:-59112
Sucessfully Created and Saved file!
