# Notebook for GPT API requests to retrieve domestic and international box office values

In [None]:
import os

# Read the OpenAI key from the environment so the secret never has to be
# written into (and committed with) the notebook. The original placeholder is
# kept as the fallback, so pasting a key here by hand still works.
API_KEY = os.environ.get("OPENAI_API_KEY", "token here")

In [2]:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
import pandas as pd
import ast

In [3]:
# Load the merged movie table (path is relative to this notebook) and
# preview its first rows.
df = pd.read_csv(filepath_or_buffer="../../data/CMU_Other_dataset.csv")
df.head(5)

Unnamed: 0,item,title_x,pub_date,originLabel,worldwide,Year,Genre_x,title_year,Wikipedia_movie_ID,Freebase_movie_ID,title_y,Runtime,Languages,Genre_y
0,http://www.wikidata.org/entity/Q106982223,#dogpoopgirl,2021-01-01T00:00:00Z,Romania,20470.0,2022.0,Comedy,#dogpoopgirl2022,,,,,,
1,http://www.wikidata.org/entity/Q4004245,"'night, Mother",1986-01-01T00:00:00Z,United States of America,441863.0,1986.0,,"'night,mother1986",,,,,,
2,http://www.wikidata.org/entity/Q232106,'Til There Was You,1997-01-01T00:00:00Z,United States of America,3525125.0,1997.0,,'tiltherewasyou1997,17841580.0,/m/047dvdb,'Til There Was You,114.0,['English Language'],"['Romantic comedy', 'Romance Film', 'Comedy']"
3,,,,United States of America,60722734.0,2009.0,,(500)daysofsummer2009,18057739.0,/m/047msdk,(500) Days of Summer,95.0,"['French Language', 'Swedish Language', 'Engli...","['Romantic comedy', 'Indie', 'Comedy-drama', '..."
4,http://www.wikidata.org/entity/Q72096489,(Nie)znajomi,2019-01-01T00:00:00Z,Poland,1975617.0,2019.0,Comedy,(nie)znajomi2019,,,,,,


In [285]:
# Countries of origin whose movies are kept for the box office lookup.
countries = [
    'United States of America',
    'Canada',
    'Germany',
    'United Kingdom',
    'France',
    'Australia',
    'New Zealand',
    'Japan',
    'China',
    'Hong Kong',
    'India',
    'South Korea',
    'South Africa',
    'Mexico',
    'Nigeria',
    'Argentina',
]

In [286]:
# Keep only movies that
#   * originate from one of the analysed countries,
#   * were released after 1960, and
#   * have a Wikidata item identifier (used later as the join key).
keep_mask = (
    df['originLabel'].isin(countries)
    & (df['Year'] > 1960)
    & df['item'].notna()
)
# .copy() gives an independent frame, so adding the 'title' column below
# does not write into a slice of the original (SettingWithCopyWarning).
df = df.loc[keep_mask].copy()

# Single title column: prefer title_x and fall back to title_y when it is missing.
df['title'] = df['title_x'].fillna(df['title_y'])

# Drop the two source columns and keep the first row for every distinct title.
df = (
    df
    .drop(columns=['title_x', 'title_y'])
    .drop_duplicates(subset=['title'], keep='first')
)
df.head(1)

Unnamed: 0,item,pub_date,originLabel,worldwide,Year,Genre_x,title_year,Wikipedia_movie_ID,Freebase_movie_ID,Runtime,Languages,Genre_y,title
1,http://www.wikidata.org/entity/Q4004245,1986-01-01T00:00:00Z,United States of America,441863.0,1986.0,,"'night,mother1986",,,,,,"'night, Mother"


In [287]:
# Prompt template sent to the model for every movie. The {title} and {year}
# placeholders are filled with str.format() inside Model.request. The reply is
# requested as a Python-style list literal, "[domestic, international]", so it
# can be parsed with ast.literal_eval.
question = """Locate the domestic and international box office values for the movie: {title} released in {year}.
Your response should be in the exact format:
[<domestic box office value>, <international box office value>]

Instructions:
1. **Find and extract only the numeric box office values** for both domestic and international totals.
2. **Exclude currency symbols and commas** and convert values into integers for readability in Python.
3. Format your answer exactly like this example: [10000000, 20000000] for domestic and international totals of $10,000,000 and $20,000,000.
4. **Do not add any extra text** or explanation outside the format. Your answer must be strictly numeric for further Python processing."""

In [288]:
class Model:
    """Minimal wrapper that sends one templated question to an OpenAI chat model."""

    def __init__(self, api_key, model_name):
        """Create the prompt template and the chat client.

        Args:
            api_key: OpenAI API key handed to ``ChatOpenAI``.
            model_name: Name of the chat model to query.
        """
        # The complete question text is injected through a single placeholder.
        self.template = """Question: {question}"""
        self.prompt = ChatPromptTemplate.from_template(self.template)

        # temperature=0 is passed so that sampling is as deterministic as possible.
        client_options = {
            "model": model_name,
            "openai_api_key": api_key,
            "temperature": 0,
        }
        self.model = ChatOpenAI(**client_options)

    def request(self, question, title, year):
        """Fill ``question`` with the movie details and send it to the model.

        Args:
            question: Format string containing ``{title}`` and ``{year}`` fields.
            title: Movie title substituted into ``question``.
            year: Release year substituted into ``question``.

        Returns:
            Whatever ``self.model.invoke`` returns for the formatted messages.
        """
        filled_question = question.format(title=title, year=year)
        messages = self.prompt.format_prompt(question=filled_question).to_messages()
        return self.model.invoke(messages)

# Single shared client for all box office requests: gpt-4o, authenticated with API_KEY.
model = Model(api_key=API_KEY, model_name="gpt-4o")

In [289]:
def _parse_box_office(raw):
    """Parse a model reply of the form "[domestic, international]".

    Args:
        raw: Text content of the model reply.

    Returns:
        A ``(domestic, international)`` tuple taken from the first two elements
        of the parsed list, or ``None`` when the reply is not a list/tuple
        literal with at least two elements.
    """
    if not isinstance(raw, str):
        return None
    try:
        parsed = ast.literal_eval(raw.strip())
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return None
    # Reject strings, dicts, scalars, ...: indexing those would silently store garbage.
    if not isinstance(parsed, (list, tuple)) or len(parsed) < 2:
        return None
    return parsed[0], parsed[1]


# Parallel result lists; position i of each list describes the same movie.
items = []
domestic = []
international = []
# (item, reason) for every movie that could not be processed, so failures are
# visible afterwards instead of being silently dropped.
failures = []

for _, row in df.iterrows():
    item = row["item"]
    try:
        # Year is stored as a float (e.g. 1986.0); send a clean integer year.
        resp = model.request(question, row["title"], int(row["Year"]))
        # .content is the documented accessor for the reply text.
        box_office = _parse_box_office(resp.content)
    except Exception as exc:
        # Best-effort: one failing request must not abort the whole run.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel
        # interrupt working during this long loop.
        failures.append((item, repr(exc)))
        continue

    if box_office is None:
        failures.append((item, "unparseable response"))
        continue

    dom_value, intl_value = box_office
    # [0, 0] is skipped, as before — presumably the model's way of saying
    # "no data found"; TODO confirm.
    if dom_value == 0 and intl_value == 0:
        continue

    items.append(item)
    domestic.append(dom_value)
    international.append(intl_value)

print(f"collected {len(items)} movies, {len(failures)} failed requests/parses")

In [290]:
# Assemble the collected results into a table, write it to disk as CSV
# (without the index column) and report its dimensions.
filename = 'dataset_with_domestic_boxoffice.csv'

data = dict(
    items=items,
    domestic=domestic,
    international=international,
)
df_new = pd.DataFrame(data)
df_new.to_csv(filename, index=False)

print(df_new.shape)

(9445, 3)
