## Collecting the OMDB Data

In [None]:
import requests
import json
import csv

with open('../api_key.json') as f:
    keys = json.load(f)
OMDB_KEY = keys['OMDB_key']

In [6]:
# Input and output file names
input_file = "../data/input_data/file_1.csv" # File containing the list of movies
output_file = "../data/output_data/OMDB/movies_O.csv"  # New file with movie details

# Start index
START_INDEX = 0
BATCH_SIZE = 5000   # Number of movies to process in each run

# Read all movie titles from the input CSV file
with open(input_file, mode="r", encoding="utf-8") as infile:
    reader = csv.reader(infile)
    movie_titles = [row[0] for row in reader]  # Assuming titles are in the first column

# Get the subset of movies to process in this batch
movies_to_process = movie_titles[START_INDEX:START_INDEX + BATCH_SIZE]

# Open the output file in append mode to continue adding data
with open(output_file, mode="a", newline="", encoding="utf-8") as outfile:
    writer = csv.writer(outfile)

    # If the file is new, add headers
    if START_INDEX == 0:  # Only add headers on the first run
        writer.writerow([
            "Title", "Year", "Released", "Genre", "Runtime", "Director", "Writer", "Actors",
            "Rated", "IMDB Rating", "Metascore", "IMDB Votes", "Awards", "Box Office",
            "Country", "Language", "Description", "Poster URL"
        ])

    for movie_title in movies_to_process:
        # Construct the OMDB API request URL
        url = f"http://www.omdbapi.com/?t={movie_title}&apikey={OMDB_KEY}&plot=full"

        # Send request
        response = requests.get(url)

        if response.status_code == 200:  # If request is successful
            movie_data = response.json()

            if movie_data['Response'] == 'True':
                # Write movie details to CSV
                writer.writerow([
                    movie_data.get('Title', 'N/A'),
                    movie_data.get('Year', 'N/A'),
                    movie_data.get('Released', 'N/A'),
                    movie_data.get('Genre', 'N/A'),
                    movie_data.get('Runtime', 'N/A'),
                    movie_data.get('Director', 'N/A'),
                    movie_data.get('Writer', 'N/A'),
                    movie_data.get('Actors', 'N/A'),
                    movie_data.get('Rated', 'N/A'),
                    movie_data.get('imdbRating', 'N/A'),
                    movie_data.get('Metascore', 'N/A'),
                    movie_data.get('imdbVotes', 'N/A'),
                    movie_data.get('Awards', 'N/A'),
                    movie_data.get('BoxOffice', 'N/A'),
                    movie_data.get('Country', 'N/A'),
                    movie_data.get('Language', 'N/A'),
                    movie_data.get('Plot', 'N/A'),
                    movie_data.get('Poster', 'N/A')
                ])
            else:
                print(f"Movie not found: {movie_title}")
        else:
            print(f"Failed to fetch data for: {movie_title}")

print(f"Processed {len(movies_to_process)} movies. Data saved to {output_file}")

Movie not found: Den of Thieves 2: Pantera
Movie not found: 9Â½ Weeks
Movie not found: Now You See Me 3
Movie not found: Wicked: Part Two
Movie not found: Wicked: Part I
Failed to fetch data for: Juror #2
Movie not found: The Naked Gun 2Â½: The Smell of Fear
Movie not found: Indiana Jones and the Raiders of the Lost Ark
Movie not found: Levon's Trade
Movie not found: Big Stan
Movie not found: Another 9Â½ Weeks
Movie not found: Biohazard: Death Island
Failed to fetch data for: #FamilyMan
Movie not found: The ProtÃ©gÃ©
Movie not found: Welcome To Holland
Processed 5000 movies. Data saved to ../data/output_data/OMDB/movies_O.csv


## Collecting the TMBD Data

In [None]:
with open('../api_key.json') as f:
    keys = json.load(f)
TMDB_KEY = keys['TMDB_key']

In [None]:
# Input and output file names
input_file = "../data/input_data/file_1.csv"         # File containing the list of movies
output_file = "../data/output_data/TMDB/movies_T.csv"  # New file with movie details

# Start index
START_INDEX = 0  # Process from movie #1001 onward
BATCH_SIZE = 5000   # Number of movies to process in each run

# Read all movie titles from the input CSV file
with open(input_file, mode="r", encoding="utf-8") as infile:
    reader = csv.reader(infile)
    movie_titles = [row[0] for row in reader]  # Assuming titles are in the first column

# Get the subset of movies to process in this batch
movies_to_process = movie_titles[START_INDEX:START_INDEX + BATCH_SIZE]

# Open the output file in append mode to continue adding data
with open(output_file, mode="a", newline="", encoding="utf-8") as outfile:
    writer = csv.writer(outfile)

    # If the file is new, add headers
    if START_INDEX == 0:
        writer.writerow([
            "Title", "Tagline", "Overview", "Budget", "Revenue", "TMDB Rating", "Vote Count", "Production Companies"
        ])

    for movie_title in movies_to_process:
        # Search for the movie by title
        search_url = "https://api.themoviedb.org/3/search/movie"
        params = {"api_key": TMDB_KEY, "query": movie_title}
        search_response = requests.get(search_url, params=params)

        if search_response.status_code == 200 and search_response.json()["results"]:
            movie_id = search_response.json()["results"][0]["id"]

            # Get detailed info
            details_url = f"https://api.themoviedb.org/3/movie/{movie_id}"
            details_params = {"api_key": TMDB_KEY}
            details_response = requests.get(details_url, params=details_params)

            if details_response.status_code == 200:
                data = details_response.json()

                title = data.get("title", "N/A")
                tagline = data.get("tagline", "N/A")
                overview = data.get("overview", "N/A")
                budget = data.get("budget", "N/A")
                revenue = data.get("revenue", "N/A")
                vote_average = data.get("vote_average", "N/A")
                vote_count = data.get("vote_count", "N/A")
                production_companies = ", ".join([c["name"] for c in data.get("production_companies", [])])

                writer.writerow([
                    title,
                    tagline,
                    overview,
                    budget,
                    revenue,
                    vote_average,
                    vote_count,
                    production_companies
                ])

                print(f"✓ Processed: {movie_title}")
            else:
                print(f"✗ Failed to get details for: {movie_title}")
        else:
            print(f"✗ Movie not found: {movie_title}")

print(f"Processed {len(movies_to_process)} movies.")

## Combining both the datasets

In [None]:
import pandas as pd

# Reading both of the CSV files
OMDB_movies = pd.read_csv("../data/output_data/OMDB/movies_O.csv")
TMDB_movies = pd.read_csv("../data/output_data/TMDB/movies_T.csv")

# Merging the dataframes on the movie title
merged_df = pd.merge(OMDB_movies, TMDB_movies, on="Title")

# exporting merged csv
merged_df.to_csv("../data/output_data/combined_movies.csv", index=False)

## Exploring the Collected Data

In [None]:
import pandas as pd
import numpy as np

movies_df = pd.read_csv("../data/output_data/combined_movies.csv")

In [None]:
variables = movies_df.columns
print("Features of the movie:", variables.values)

In [None]:
movies_df['Year'] = pd.to_numeric(movies_df['Year'], errors='coerce')

movies_df = movies_df[(movies_df['Year'] >= 1984) & (movies_df['Year'] <= 2024)]

years = movies_df['Year']
min_year = np.min(years)
max_year = np.max(years)
print("Earlist film year:", min_year)
print("Most recent film year:", max_year)

In [None]:
movies = len(movies_df)
print("Number of movies:", movies)