In [9]:
import pandas as pd
import os
import re
import requests
import json

# Extract and clean BFI box office data

In [10]:
def film_dataframe_prep(folder_path, file_name):
  """Load one box office report file and return its cleaned film rows.

  Args:
    folder_path: Directory that contains the report file.
    file_name: Name of the report file. It must contain a date written as
      YYYY-MM-DD (e.g. 'bfi-weekend-box-office-report-2024-08-09-11.xls').

  Returns:
    A dataframe with one row per ranked film, every 'Unnamed:' column
    removed, and two added columns: 'Source File' (the file name) and
    'Weekend Commencing' (the date string extracted from the file name).

  Raises:
    ValueError: If file_name does not contain a YYYY-MM-DD date.
  """
  # Extract the date using regular expressions. Do this before reading the
  # file so a badly named file fails fast with a clear message rather than
  # an AttributeError on a None match.
  date_match = re.search(r'(\d{4}-\d{2}-\d{2})', file_name)
  if date_match is None:
    raise ValueError(f"No YYYY-MM-DD date found in file name: {file_name!r}")

  file_path = os.path.join(folder_path, file_name)

  # Read xls file; ignore first row which contains irrelevant text
  df = pd.read_excel(file_path, skiprows=1)

  # Remove rows for which Rank is null, i.e., rows without box office information
  films = df[df['Rank'].notna()].reset_index(drop=True)

  # Remove empty columns and columns containing non-box office information.
  # str(col) guards against non-string column labels (e.g. numeric headers).
  unnamed_cols = [col for col in films.columns if str(col).startswith('Unnamed:')]
  films = films.drop(columns=unnamed_cols)

  # Add Source File column
  films['Source File'] = file_name

  # Add the weekend date taken from the file name
  films['Weekend Commencing'] = date_match.group(1)

  return films

In [11]:
# Collect one cleaned dataframe per box office report file
film_dfs = []
# The path for the folder where the box office figure xls files are stored
folder_path = '/workspaces/box-office-figures/data'

# Iterate through the files in the folder. os.listdir returns entries in
# arbitrary order and includes any non-report files, so sort the names for a
# reproducible row order and keep only Excel workbooks.
for file_name in sorted(os.listdir(folder_path)):
  if not file_name.lower().endswith(('.xls', '.xlsx')):
    continue
  film_dfs.append(film_dataframe_prep(folder_path, file_name))

FileNotFoundError: [Errno 2] No such file or directory: '/workspaces/box-office-figures/data'

In [None]:
# Stack the per-file dataframes into a single frame with a fresh row index
all_films = pd.concat(objs=film_dfs, ignore_index=True)

The all_films dataframe contains data from all of the files uploaded to the data folder. This would allow an analyst to view how a film has performed at the box office over a period of time.

In [None]:
# Display the combined box office dataframe built from all report files
all_films

Unnamed: 0,Rank,Film,Country of Origin,Weekend Gross,Distributor,% change on last week,Weeks on release,Number of cinemas,Site average,Total Gross to date,Source File,Weekend Commencing
0,1.0,It Ends With Us,USA,4516760,Sony Pictures,-,1.0,625.0,7227.0,4516760.0,bfi-weekend-box-office-report-2024-08-09-11.xls,2024-08-09
1,2.0,Deadpool & Wolverine,UK/USA,4083378,Disney,-0.49,3.0,713.0,5727.0,42986728.0,bfi-weekend-box-office-report-2024-08-09-11.xls,2024-08-09
2,3.0,Despicable Me 4,USA,1528941,Universal,-0.4,5.0,718.0,2129.0,35552993.0,bfi-weekend-box-office-report-2024-08-09-11.xls,2024-08-09
3,4.0,Trap,USA,1141334,Warner Bros,-,1.0,526.0,2170.0,1141334.0,bfi-weekend-box-office-report-2024-08-09-11.xls,2024-08-09
4,5.0,Borderlands,USA,843159,Lionsgate,-,1.0,540.0,1561.0,843159.0,bfi-weekend-box-office-report-2024-08-09-11.xls,2024-08-09
...,...,...,...,...,...,...,...,...,...,...,...,...
122,23.0,Mr. Bachchan,Ind,12316,Dreamz Entertainment,,1.0,16.0,770.0,12316.0,bfi-weekend-box-office-report-2024-08-16-18.xls,2024-08-16
123,27.0,Nunakuzhi (Ireland),Ind,10067,2G Entertainments,,1.0,7.0,1438.0,10067.0,bfi-weekend-box-office-report-2024-08-16-18.xls,2024-08-16
124,29.0,Lone Star (4K Restoration),USA,9731,Park Circus,,1.0,18.0,541.0,9731.0,bfi-weekend-box-office-report-2024-08-16-18.xls,2024-08-16
125,33.0,Ryan's World The Movie: Titan Universe Adventure,USA,8828,Vue Entertainment,,1.0,85.0,104.0,8828.0,bfi-weekend-box-office-report-2024-08-16-18.xls,2024-08-16


# Extract genre information using the TMDB API

This section of the notebook will not work unless you have an access token generated for the TMDB API. Please see README.md for more information.

## Extract genre list

TMDB search results identify genres by numeric ID rather than by name, so we first need to fetch the mapping from genre ID to genre name.

In [None]:
# Read access token for the API from the TMDB_READ_ACCESS_TOKEN environment
# variable; this raises KeyError if the variable is not set
api_rat = os.environ['TMDB_READ_ACCESS_TOKEN']

In [None]:
# Request the movie genre list from TMDB
url = "https://api.themoviedb.org/3/genre/movie/list?language=en"

headers = {
    "accept": "application/json",
    "Authorization": "Bearer " + api_rat
}

# Time out rather than hang indefinitely if TMDB is unreachable
genres = requests.get(url, headers=headers, timeout=10)
# Fail here on an HTTP error (e.g. a rejected token) instead of later,
# when the response body is parsed
genres.raise_for_status()

print(genres.text)

{"genres":[{"id":28,"name":"Action"},{"id":12,"name":"Adventure"},{"id":16,"name":"Animation"},{"id":35,"name":"Comedy"},{"id":80,"name":"Crime"},{"id":99,"name":"Documentary"},{"id":18,"name":"Drama"},{"id":10751,"name":"Family"},{"id":14,"name":"Fantasy"},{"id":36,"name":"History"},{"id":27,"name":"Horror"},{"id":10402,"name":"Music"},{"id":9648,"name":"Mystery"},{"id":10749,"name":"Romance"},{"id":878,"name":"Science Fiction"},{"id":10770,"name":"TV Movie"},{"id":53,"name":"Thriller"},{"id":10752,"name":"War"},{"id":37,"name":"Western"}]}


In [None]:
# Parse the JSON body returned by the genre list request
genres_json = json.loads(genres.text)
# The id/name records sit under the top-level 'genres' key
genres_list = genres_json['genres']
# Build the lookup table and index it by genre id so names can be looked up by id
genres_frame = pd.DataFrame(genres_list)
genres_df = genres_frame.set_index('id')
genres_df

Unnamed: 0_level_0,name
id,Unnamed: 1_level_1
28,Action
12,Adventure
16,Animation
35,Comedy
80,Crime
99,Documentary
18,Drama
10751,Family
14,Fantasy
36,History


## Add genre column to all_films dataframe

In [None]:
def tmdb_film_search(film):
  """Search TMDB for a film by title and return the raw HTTP response.

  Uses the module-level api_rat token for the Authorization header.

  Args:
    film: Film title to search for. Non-string values are converted with str().

  Returns:
    The response object returned by requests.get for the search request.
  """
  url = "https://api.themoviedb.org/3/search/movie"

  # Pass the query through params so requests URL-encodes it. Concatenating
  # the raw title into the URL breaks titles containing reserved characters:
  # "Deadpool & Wolverine" would be cut off at the "&".
  params = {
      "query": str(film),
      "include_adult": "false",
      "language": "en-US",
      "page": 1
  }

  headers = {
      "accept": "application/json",
      "Authorization": "Bearer " + api_rat
  }

  # Time out rather than hang indefinitely if TMDB is unreachable
  response = requests.get(url, params=params, headers=headers, timeout=10)
  return response

In [None]:
def find_genre(api_response, genres_df):
  """Return the genre name(s) of the first TMDB search result as a string.

  Args:
    api_response: Response object whose .text attribute holds the JSON body
      of a TMDB movie search.
    genres_df: Dataframe indexed by genre id with a 'name' column.

  Returns:
    The genre names of the first search result joined with ", ", or an empty
    string if the search returned zero results. Genre ids that are not
    present in genres_df are skipped.
  """
  response_json = json.loads(api_response.text)

  # Returns an empty string if the TMDB search returns zero results
  if response_json['total_results'] == 0:
    return ""

  genre_id_list = response_json['results'][0]['genre_ids']

  # Look up each genre name by id (.loc, label-based), skipping any id that
  # is missing from the lookup table so it cannot raise a KeyError
  genre_names = genres_df['name']
  genres_str_list = [
      genre_names.loc[genre_id]
      for genre_id in genre_id_list
      if genre_id in genre_names.index
  ]
  return ", ".join(genres_str_list)

In [12]:
# Search TMDB for every film title, then convert each response to a genre string.
# args must be a tuple, hence the trailing comma: (genres_df) without it is
# just the dataframe itself, not a one-element tuple.
all_films['Genre'] = all_films.Film.apply(tmdb_film_search).apply(find_genre, args=(genres_df,))

NameError: name 'all_films' is not defined