# Drake Davis
# Project Proposal Jupyter Notebook

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
import random
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from datetime import datetime, timedelta


# Box Office Scraping

In [None]:
# Setting up selenium
driver = webdriver.Chrome()

In [None]:
# Here is where im setting up my URL and setting it to pull the last 100 days

base_url = "https://www.the-numbers.com/box-office-chart/daily/"
today = datetime.today()
daily_urls = []
for i in range(100):
    date = today - timedelta(days=i)
    formatted_date = date.strftime("%Y/%m/%d")
    full_url = base_url + formatted_date
    daily_urls.append(full_url)


In [None]:
# To compare to stock data, I need to make sure the producer/studio has a publicly traded stock. This is limiting the scraping to my selected studios

target_studios = {
    "Disney", "Netflix", "AMC", "Sony", "Warner", "Paramount", "Lionsgate",
    "Amazon", "MGM", "Apple"
}
all_movies = []


In [None]:
for url in daily_urls:
    driver.get(url)
    time.sleep(3)  

    date = "/".join(url.split("/")[-3:])  
    rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")

    for row in rows:
        cells = row.find_elements(By.TAG_NAME, "td")
        if len(cells) >= 11:
            try:
                title = cells[2].text.strip()
                studio = cells[3].text.strip()

                # Daily Gross
                gross_str = cells[4].text.strip().replace('$', '').replace(',', '')
                daily_gross = float(gross_str) if gross_str else 0

                # % Change
                pct_change_str = cells[5].text.strip().replace('%', '').replace(',', '').replace('−', '-')
                percent_change = float(pct_change_str) if pct_change_str.replace('.', '', 1).lstrip('-').isdigit() else None

                # Theaters
                theaters_str = cells[7].text.strip().replace(',', '')
                theaters = int(theaters_str) if theaters_str.isdigit() else None

                # Per Theater
                per_theater_str = cells[8].text.strip().replace('$', '').replace(',', '')
                per_theater = float(per_theater_str) if per_theater_str else None

                # Total Gross
                total_gross_str = cells[9].text.strip().replace('$', '').replace(',', '')
                total_gross = float(total_gross_str) if total_gross_str else None

                # Days in Release
                days_str = cells[10].text.strip().replace(',', '')
                days_in_release = int(days_str) if days_str.isdigit() else None

                # Only include the studios I want
                if any(target.lower() in studio.lower() or studio.lower() in target.lower() for target in target_studios):
                    all_movies.append({
                        "date": date,
                        "title": title,
                        "studio": studio,
                        "daily_gross": daily_gross,
                        "percent_change": percent_change,
                        "theaters": theaters,
                        "per_theater": per_theater,
                        "total_gross": total_gross,
                        "days_in_release": days_in_release
                    })
# having issues, need to see the error message
            except Exception as e:
                print(f"Skipping row {e}")

driver.quit()

In [11]:
df = pd.DataFrame(all_movies)
print(df)
df.to_excel("Box_office_data.xlsx", index=False)

            date                       title         studio  daily_gross  \
0     2025/04/07           A Minecraft Movie   Warner Bros.   10000000.0   
1     2025/04/07               A Working Man  Amazon MGM S…     591163.0   
2     2025/04/07         Disney’s Snow White    Walt Disney     409310.0   
3     2025/04/07  Captain America: Brave Ne…    Walt Disney     127731.0   
4     2025/04/07                   Mickey 17   Warner Bros.      72000.0   
...          ...                         ...            ...          ...   
1429  2024/12/30                 Nickel Boys  Amazon MGM S…      11791.0   
1430  2024/12/30          The Room Next Door  Sony Picture…      11131.0   
1431  2024/12/30                  Better Man  Paramount Pi…       6189.0   
1432  2024/12/30                 September 5  Paramount Pi…       5990.0   
1433  2024/12/30       Venom: The Last Dance  Sony Pictures       2936.0   

      percent_change  theaters  per_theater  total_gross  days_in_release  
0          

# Stock Data Collection

In [1]:
import yfinance as yf

In [None]:
# Weird scenerio where MGM is owned by Amazon occurs

studio_tickers = {
    "Disney": "DIS",
    "Netflix": "NFLX",
    "AMC": "AMC",
    "Sony": "SONY",          
    "Warner": "WBD",          
    "Paramount": "PARA",
    "Lionsgate": "LGF-A",     
    "Amazon": "AMZN",
    "MGM": "AMZN",            
    "Apple": "AAPL"
}

In [None]:
end_date = datetime.today()
start_date = end_date - timedelta(days=140)  

In [5]:
all_data = []


In [None]:
# Grabbing data, print statements to help look for problems I am having

for studio, ticker in studio_tickers.items():
    try:
        df = yf.download(ticker, start=start_date.strftime('%Y-%m-%d'), end=end_date.strftime('%Y-%m-%d'))
        if df.empty:
            print("Problem here")
            continue
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = [col[0] for col in df.columns]
        required_cols = {'Open', 'High', 'Low', 'Close', 'Volume'}
        if not required_cols.issubset(df.columns):
            print(f"Problem with my columns")
            continue
        df = df.dropna(subset=['Close'])
        df.reset_index(inplace=True)
        df['Studio'] = studio
        df['Ticker'] = ticker
        all_data.append(df[['Date', 'Studio', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Volume']])
    except Exception as e:
        print(f"error for {studio}: {e}")

if all_data:
    stock_df = pd.concat(all_data, ignore_index=True)
    stock_df.to_excel("Studio_Stock_Prices.xlsx", index=False)
else:
    print("No stock data")

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed



# Cleaning/Merging/Integration

In [47]:
# Loading in the files
stock_df = pd.read_excel("Studio_Stock_Prices.xlsx")
box_df = pd.read_excel("Box_office_data.xlsx")

In [48]:
# Convert date columns to datetime
stock_df["Date"] = pd.to_datetime(stock_df["Date"])
box_df["date"] = pd.to_datetime(box_df["date"])

In [None]:
# mapping based on ownership, synonyms

studio_map = {
    "Walt Disney Studios": "Disney",
    "Buena Vista": "Disney",
    "Sony Pictures": "Sony",
    "Warner Bros.": "Warner",
    "Amazon Studios": "Amazon",
    "Apple TV+": "Apple",
    "MGM": "Amazon"
}

In [50]:
box_df["studio"] = box_df["studio"].replace(studio_map)

In [51]:
# Merging the dataframes on date and studio
merged_df = pd.merge(
    box_df,
    stock_df,
    how="inner",
    left_on=["date", "studio"],
    right_on=["Date", "Studio"]
)

In [None]:
# Dropping duplicate merge columns
merged_df.drop(columns=["Date", "Studio"], inplace=True)



KeyError: "['Date', 'Studio'] not found in axis"

In [55]:
deduplicated_df = merged_df.drop_duplicates()

In [57]:
deduplicated_df.to_excel("Merged_BoxOffice_Stock.xlsx", index=False)
deduplicated_df.head()

Unnamed: 0,date,title,studio,daily_gross,percent_change,theaters,per_theater,total_gross,days_in_release,Ticker,Open,High,Low,Close,Volume
0,2025-04-07,A Minecraft Movie,Warner,10000000,-78.0,4263.0,2346.0,172753003,4.0,WBD,7.8,8.79,7.65,8.09,58591500
3,2025-04-07,Mickey 17,Warner,72000,-66.0,643.0,112.0,45295175,32.0,WBD,7.8,8.79,7.65,8.09,58591500
6,2025-04-07,Paddington in Peru,Sony,12836,-83.0,402.0,32.0,45634908,53.0,SONY,20.42,22.190001,20.42,21.24,10610300
9,2025-04-04,A Minecraft Movie,Warner,57113324,,4263.0,13397.0,57113324,1.0,WBD,8.85,8.86,8.03,8.07,57367600
12,2025-04-04,Mickey 17,Warner,244816,,643.0,381.0,44687933,29.0,WBD,8.85,8.86,8.03,8.07,57367600
