In [1]:
import pandas as pd
import os
import re

In [4]:
def film_dataframe_prep(folder_path, file_name):
  """Load one weekend box office report and return its ranked film rows.

  The workbook is read with its first row skipped, rows with a null ``Rank``
  are discarded, and every column whose label starts with ``Unnamed:`` is
  removed. Two columns are then appended: ``Source File`` (the file name) and
  ``Weekend Commencing`` (the first ``YYYY-MM-DD`` date found in the file
  name, kept as a string).

  Args:
    folder_path: Folder that contains the report file.
    file_name: Name of the report file; must contain a ``YYYY-MM-DD`` date.

  Returns:
    A new DataFrame with a fresh 0-based index.

  Raises:
    ValueError: If ``file_name`` does not contain a ``YYYY-MM-DD`` date.
    KeyError: If the workbook has no ``Rank`` column.
  """
  file_path = os.path.join(folder_path, file_name)

  # Read xls file; ignore first row which contains irrelevant text
  df = pd.read_excel(file_path, skiprows=1)

  # Keep only rows that have a Rank, i.e., rows with box office information
  films = df[df['Rank'].notna()].reset_index(drop=True)

  # Remove empty columns and columns containing non-box office information.
  # Labels are passed through str() because column labels are not guaranteed
  # to be strings (a numeric header cell yields a numeric label).
  unnamed_columns = [col for col in films.columns
                     if str(col).startswith('Unnamed:')]
  films = films.drop(columns=unnamed_columns)

  # Extract the date using regular expressions; fail with a clear message
  # instead of an AttributeError on None when the name carries no date
  date_match = re.search(r'(\d{4}-\d{2}-\d{2})', file_name)
  if date_match is None:
    raise ValueError(
        f"No YYYY-MM-DD date found in file name: {file_name!r}")

  # Add Source File and Weekend Commencing columns
  films['Source File'] = file_name
  films['Weekend Commencing'] = date_match.group(1)

  return films

In [6]:
# Collect one prepared DataFrame per box office report file
film_dfs = []
# The path for the folder where the box office figure xls files are stored
folder_path = '/workspaces/box-office-figures/data'

# os.listdir returns every entry in the folder in arbitrary order, so keep
# only Excel workbooks (skipping hidden files, sub-folders, etc.) and sort the
# names so the combined table has the same row order on every run
report_file_names = sorted(
  name for name in os.listdir(folder_path)
  if name.lower().endswith(('.xls', '.xlsx'))
)

# Iterate through the report files in the folder
for file_name in report_file_names:
  film_dfs.append(film_dataframe_prep(folder_path, file_name))

In [7]:
# Stack the per-report frames into one table, renumbering rows from 0
all_films = pd.concat(objs=film_dfs, ignore_index=True)

In [8]:
# Show the combined table; a bare final expression is rendered as the cell output
all_films

Unnamed: 0,Rank,Film,Country of Origin,Weekend Gross,Distributor,% change on last week,Weeks on release,Number of cinemas,Site average,Total Gross to date,Source File,Weekend Commencing
0,1.0,It Ends With Us,USA,4516760,Sony Pictures,-,1.0,625.0,7227.0,4516760.0,bfi-weekend-box-office-report-2024-08-09-11.xls,2024-08-09
1,2.0,Deadpool & Wolverine,UK/USA,4083378,Disney,-0.49,3.0,713.0,5727.0,42986728.0,bfi-weekend-box-office-report-2024-08-09-11.xls,2024-08-09
2,3.0,Despicable Me 4,USA,1528941,Universal,-0.4,5.0,718.0,2129.0,35552993.0,bfi-weekend-box-office-report-2024-08-09-11.xls,2024-08-09
3,4.0,Trap,USA,1141334,Warner Bros,-,1.0,526.0,2170.0,1141334.0,bfi-weekend-box-office-report-2024-08-09-11.xls,2024-08-09
4,5.0,Borderlands,USA,843159,Lionsgate,-,1.0,540.0,1561.0,843159.0,bfi-weekend-box-office-report-2024-08-09-11.xls,2024-08-09
...,...,...,...,...,...,...,...,...,...,...,...,...
122,23.0,Mr. Bachchan,Ind,12316,Dreamz Entertainment,,1.0,16.0,770.0,12316.0,bfi-weekend-box-office-report-2024-08-16-18.xls,2024-08-16
123,27.0,Nunakuzhi (Ireland),Ind,10067,2G Entertainments,,1.0,7.0,1438.0,10067.0,bfi-weekend-box-office-report-2024-08-16-18.xls,2024-08-16
124,29.0,Lone Star (4K Restoration),USA,9731,Park Circus,,1.0,18.0,541.0,9731.0,bfi-weekend-box-office-report-2024-08-16-18.xls,2024-08-16
125,33.0,Ryan's World The Movie: Titan Universe Adventure,USA,8828,Vue Entertainment,,1.0,85.0,104.0,8828.0,bfi-weekend-box-office-report-2024-08-16-18.xls,2024-08-16
