In [9]:
import pandas as pd
import tmdbsimple as tmdb 
tmdb.API_KEY = "9f3a656bc10a7241687aba82819a8c67"

In [10]:
# search if movie in 'thenumbers' dataset is in the tmdb dataset. returns list of movies
def searchMovie(title, year, genres): # might have to exclude genres depending on tmdb dataset
    search = tmdb.Search()

    search.movie(query=title, primary_release_year=year)
    genres = getGenreIDs(genres)
    if search.results:
        final_out = []
        for result in search.results:
            if result.get('genre_ids') == genres: # exact matching the genres 
                movie = tmdb.Movies(result.get('id'))   
                final_out.append(movie.info())
    #     movieActors = {}
    #     for result in search.results:
    #         movieTitle = result.get('original_title')

    #         movieID = result.get('id')
    #         result.get()
    #         listOfActors = []
    #         for actor in cast: 
    #             listOfActors.append(actor.get('name'))
            
    #         movieActors[movieTitle] = listOfActors
    #     return movieActors

        return final_out
    return "No movie found"


# converts genre names to genre IDs
def getGenreIDs(genres): # this will work provided 'thenumbers' dataset has 'genre' field, and has the same genre names, and same genres as tmdb. 

    # Obtain dict of movie genres (Key:Value = genre_Name:genre_ID)
    movieGenres = tmdb.Genres().movie_list()
    movieGenres = {movieGenres['name']: movieGenres['id'] for movieGenres in movieGenres['genres']}
    
    genres = [x.strip() for x in genres.split(',')]
    genres = [movieGenres.get(genre) for genre in genres]
    return genres


# enter the data of searched movies into a pandas dataframe; 
def convertToDF(searchedMovies):
    if not searchedMovies:  # Handle empty input
        return pd.DataFrame()

    # Extract all keys dynamically from all movies
    all_keys = set().union(*(movie.keys() for movie in searchedMovies))

    data = []
    for movie in searchedMovies:
        movieData = {key: movie.get(key) for key in all_keys} # returns None if key not found

        # Rename 'id' to 'movie_id'
        if 'id' in movieData:
            movieData['movie_id'] = movieData.pop('id')

        # Flatten complex fields and store them as lists
        if 'genres' in movie:
            movieData['genres'] = [genre['name'] for genre in movie.get('genres', [])]
            movieData['genre_ids'] = [genre['id'] for genre in movie.get('genres', [])]

        if 'production_companies' in movie:
            movieData['production_companies'] = [comp['name'] for comp in movie.get('production_companies', [])]

        if 'spoken_languages' in movie:
            movieData['spoken_languages'] = [lang['english_name'] for lang in movie.get('spoken_languages', [])]

        if 'production_countries' in movie:
            movieData['production_countries'] = [country['name'] for country in movie.get('production_countries', [])]

        # Extract the collection name (if available)
        if 'belongs_to_collection' in movie and movie['belongs_to_collection']:
            movieData['belongs_to_collection'] = movie['belongs_to_collection'].get('name', None)
        else:
            movieData['belongs_to_collection'] = None  # If no collection, set to None

        data.append(movieData)

    df = pd.DataFrame(data)
    return df

# transform the data and create features

# feature 1: weekend / weekday; binary
# feature 2: public holiday; binary 
# feature 4: day of week; int / str
# feature 5: month; int / str 
# feature 6: genres; list(str)
# feature 7: size of crew; int
# feature 8: size of cast; int
# feature 9: top director; key:value
# feature 10: top producer; key:value
# feature 11: top actor; key:value (could be top x as well)
# feature 13: top actress; key:value (could be top x as well)
# feature 14: total cast popularity score
# feature 15: total crew popularity score

import holidays

def extract_features(df, date_column, country='US'):

    df[date_column] = pd.to_datetime(df[date_column])  

    # Feature 1: Weekend / Weekday (Binary)
    df['is_weekend'] = df[date_column].dt.weekday.isin([5, 6]).astype(int)

    # Feature 2: Public Holiday (Based on Country)
    country_holidays = holidays.country_holidays(country)
    df['is_public_holiday'] = df[date_column].apply(lambda x: int(x in country_holidays))

    # Feature 4: Day of the Week
    df['day_of_week_str'] = df[date_column].dt.day_name()

    # Feature 5: Month
    df['month_str'] = df[date_column].dt.month_name()

    # Extract movie features
    movie_features = df['movie_id'].apply(lambda movie_id: extract_movie_features(movie_id) if movie_id else {})

    # Convert movie_features (which is a series of dicts) into a DataFrame
    movie_features_df = pd.DataFrame(movie_features.tolist())

    # Merge both date and movie features
    df = pd.concat([df, movie_features_df], axis=1)

    return df

def extract_movie_features(movie_id):

    movie = tmdb.Movies(movie_id)
    # Extract cast and crew lists
    cast = movie.credits().get('cast', [])
    crew = movie.credits().get('crew', [])

    # Feature 7: Size of Crew
    crew_size = len(crew)

    # Feature 8: Size of Cast
    cast_size = len(cast)

    # Feature 9: Most Popular Director
    directors = [member for member in crew if member.get('job') == 'Director']
    top_director = max(directors, key=lambda x: x.get('popularity', 0), default={}).get('name', None)

    # Feature 10: Most Popular Producer
    producers = [member for member in crew if member.get('job') == 'Producer']
    top_producer = max(producers, key=lambda x: x.get('popularity', 0), default={}).get('name', None)

    # Feature 11: Top Actor (Highest popularity)
    top_actor = max(cast, key=lambda x: x.get('popularity', 0), default={}).get('name', None)

    # Feature 13: Top Actress (Highest popularity, assuming gender 1 = female)
    female_actors = [member for member in cast if member.get('gender') == 1]
    top_actress = max(female_actors, key=lambda x: x.get('popularity', 0), default={}).get('name', None)

    # Feature 14: Total Cast Popularity Score
    total_cast_popularity = sum(member.get('popularity', 0) for member in cast)

    # Feature 15: Total Crew Popularity Score
    total_crew_popularity = sum(member.get('popularity', 0) for member in crew)

    return {
        'crew_size': crew_size,
        'cast_size': cast_size,
        'total_crew_popularity': total_crew_popularity,
        'total_cast_popularity': total_cast_popularity,
        'top_director': top_director,
        'top_producer': top_producer,
        'top_actor': top_actor,
        'top_actress': top_actress
    }



In [11]:
# example movie search 

searched_movies = searchMovie('Gladiator', 2000, 'Action, Drama, Adventure')
print(searched_movies)

[{'adult': False, 'backdrop_path': '/Ar7QuJ7sJEiC0oP3I8fKBKIQD9u.jpg', 'belongs_to_collection': {'id': 1069584, 'name': 'Gladiator Collection', 'poster_path': '/bk6nx2rGNdlKtBsB9XcrclVKItv.jpg', 'backdrop_path': '/1VdLvSIeHuwqCT13H9EafxCacGB.jpg'}, 'budget': 103000000, 'genres': [{'id': 28, 'name': 'Action'}, {'id': 18, 'name': 'Drama'}, {'id': 12, 'name': 'Adventure'}], 'homepage': '', 'id': 98, 'imdb_id': 'tt0172495', 'origin_country': ['US'], 'original_language': 'en', 'original_title': 'Gladiator', 'overview': "After the death of Emperor Marcus Aurelius, his devious son takes power and demotes Maximus, one of Rome's most capable generals who Marcus preferred. Eventually, Maximus is forced to become a gladiator and battle to the death against other men for the amusement of paying audiences.", 'popularity': 58.496, 'poster_path': '/ty8TGRuvJLPUmAR1H1nRIsgwvim.jpg', 'production_companies': [{'id': 33, 'logo_path': '/3wwjVpkZtnog6lSKzWDjvw2Yi00.png', 'name': 'Universal Pictures', 'orig

In [12]:
# example of getting movie info

movie = tmdb.Movies(550)
movie.info()

{'adult': False,
 'backdrop_path': '/hZkgoQYus5vegHoetLkCJzb17zJ.jpg',
 'belongs_to_collection': None,
 'budget': 63000000,
 'genres': [{'id': 18, 'name': 'Drama'}],
 'homepage': 'http://www.foxmovies.com/movies/fight-club',
 'id': 550,
 'imdb_id': 'tt0137523',
 'origin_country': ['US'],
 'original_language': 'en',
 'original_title': 'Fight Club',
 'overview': 'A ticking-time-bomb insomniac and a slippery soap salesman channel primal male aggression into a shocking new form of therapy. Their concept catches on, with underground "fight clubs" forming in every town, until an eccentric gets in the way and ignites an out-of-control spiral toward oblivion.',
 'popularity': 52.681,
 'poster_path': '/pB8BM7pdSp6B6Ih7QZ4DrQ3PmJK.jpg',
 'production_companies': [{'id': 711,
   'logo_path': '/tEiIH5QesdheJmDAqQwvtN60727.png',
   'name': 'Fox 2000 Pictures',
   'origin_country': 'US'},
  {'id': 508,
   'logo_path': '/7cxRWzi4LsVm4Utfpr1hfARNurT.png',
   'name': 'Regency Enterprises',
   'origin_co

In [13]:
# example of getting movie crew 

movie.credits().get('crew')[0:5]
# getGenreIDs('Action, Drama, Adventure')

[{'adult': False,
  'gender': 2,
  'id': 7474,
  'known_for_department': 'Production',
  'name': 'Ross Grayson Bell',
  'original_name': 'Ross Grayson Bell',
  'popularity': 0.051,
  'profile_path': None,
  'credit_id': '52fe4250c3a36847f8014a05',
  'department': 'Production',
  'job': 'Producer'},
 {'adult': False,
  'gender': 1,
  'id': 7475,
  'known_for_department': 'Production',
  'name': 'Ceán Chaffin',
  'original_name': 'Ceán Chaffin',
  'popularity': 1.759,
  'profile_path': '/rk8DPo1CAMTWL0YjRsMFIy3gRmT.jpg',
  'credit_id': '52fe4250c3a36847f8014a0b',
  'department': 'Production',
  'job': 'Producer'},
 {'adult': False,
  'gender': 2,
  'id': 1254,
  'known_for_department': 'Production',
  'name': 'Art Linson',
  'original_name': 'Art Linson',
  'popularity': 2.158,
  'profile_path': '/p9JPMQt4RRHYdQgvnrmFNiZ4eW6.jpg',
  'credit_id': '52fe4250c3a36847f8014a11',
  'department': 'Production',
  'job': 'Producer'},
 {'adult': False,
  'gender': 2,
  'id': 7477,
  'known_for_depa

In [14]:
# start of pipeline

# 1. convert all searched movies into a pandas dataframe
movies_df = convertToDF(searched_movies)

# 2. extract features from the dataframe
movies_df_extracted = extract_features(movies_df, 'release_date', country='US')

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
print(f'Df columns: {movies_df_extracted.columns}\n')
print(f'Dataframe with features:\n{movies_df_extracted.head()}')
pd.reset_option('display.max_columns')

Df columns: Index(['overview', 'vote_count', 'original_title', 'production_companies', 'video', 'tagline', 'poster_path', 'vote_average', 'revenue', 'genres', 'status', 'original_language', 'release_date', 'budget', 'popularity', 'origin_country', 'belongs_to_collection', 'spoken_languages', 'runtime', 'adult', 'imdb_id', 'production_countries', 'backdrop_path', 'homepage', 'title', 'movie_id', 'genre_ids', 'is_weekend', 'is_public_holiday', 'day_of_week_str', 'month_str', 'crew_size', 'cast_size', 'total_crew_popularity', 'total_cast_popularity', 'top_director', 'top_producer', 'top_actor', 'top_actress'], dtype='object')

Dataframe with features:
                                            overview  vote_count original_title                               production_companies  video                                 tagline                       poster_path  vote_average    revenue                      genres    status original_language release_date     budget  popularity origin_country