In [2]:
from langchain_openai import ChatOpenAI
import pandas as pd
import asyncio
import json
import os
from tqdm import tqdm
import nest_asyncio

nest_asyncio.apply()

class GPTAnalyzer:
    def __init__(self, api_keys_path='MovieSummaries/api_keys.json'):
        with open(api_keys_path, 'r') as f:
            api_keys = json.load(f)
            
        self.gpt4_model = ChatOpenAI(
            model="gpt-4o-mini",
            api_key=api_keys['openai_key'],
            temperature=0
        )
        
        self.temp_file = 'MovieSummaries/temp_gpt_results.tsv'
        self.final_file = 'MovieSummaries/character_metadata_with_gpt.tsv'
        self.plot_file = 'MovieSummaries/plot_summaries.txt'
        self.plot_summaries = self._load_plot_summaries()
    
    def _load_plot_summaries(self):
        summaries = {}
        with open(self.plot_file, 'r', encoding='utf-8') as f:
            for line in f:
                wiki_id, summary = line.strip().split('\t', 1)
                summaries[wiki_id] = summary
        return summaries
    
    def _load_existing_results(self):
        if os.path.exists(self.temp_file):
            return pd.read_csv(self.temp_file, sep='\t')
        return None
    
    def _create_prompt(self, movie_name, plot_summary, all_actors, target_actor):
        prompt = f"""
        Movie: {movie_name}
        Plot Summary: {plot_summary}
        All actors in the movie: {', '.join(all_actors)}
        
        Is the actor "{target_actor}" a main character in this movie?
        Please only answer with "True" if they are a main character, or "False" if they are not.
        ANSWER WITH ONLY TRUE OR FALSE.
        """
        return prompt
    
    def extract_decision(self, response):
        response = response.lower()
        has_true = 'true' in response
        has_false = 'false' in response
        
        if (has_true and has_false) or (not has_true and not has_false):
            return None
        return has_true
    
    async def process_single_character(self, row, movie_name, plot_summary, all_actors):
        try:
            prompt = self._create_prompt(movie_name, plot_summary, all_actors, row['Actor name'])
            gpt_response = await asyncio.to_thread(self.gpt4_model.predict, prompt)
            gpt_response = gpt_response.strip().lower()
            gpt_decision = self.extract_decision(gpt_response)
            return row.name, gpt_decision
            
        except Exception as e:
            print(f"Error processing {row['Actor name']} in {movie_name}: {str(e)}")
            return row.name, None

    async def process_movie_batch(self, movie_group, wiki_id_str, batch_semaphore):
        async with batch_semaphore:
            movie_name = movie_group['Movie name'].iloc[0]
            plot_summary = self.plot_summaries[wiki_id_str]
            all_actors = movie_group['Actor name'].tolist()
            
            tasks = []
            for _, row in movie_group.iterrows():
                if pd.isna(row['gpt_decision']):
                    tasks.append(self.process_single_character(row, movie_name, plot_summary, all_actors))
            
            if tasks:
                return await asyncio.gather(*tasks)
            return []

    async def analyze_characters_async(self, character_data, max_concurrent=32, batch_size=100):
        existing_results = self._load_existing_results()
        if existing_results is not None:
            character_data = pd.concat([
                character_data,
                existing_results[['Wikipedia movie ID', 'Actor name', 'gpt_decision']]
            ]).drop_duplicates(subset=['Wikipedia movie ID', 'Actor name'], keep='last')
        
        if 'gpt_decision' not in character_data.columns:
            character_data['gpt_decision'] = None
        
        movie_groups = character_data.groupby('Wikipedia movie ID')
        batch_semaphore = asyncio.Semaphore(max_concurrent)
        
        progress_bar = tqdm(total=len(character_data))
        count = 0
        
        for wiki_id, group in movie_groups:
            wiki_id_str = str(wiki_id)
            if wiki_id_str not in self.plot_summaries:
                continue
            
            results = await self.process_movie_batch(group, wiki_id_str, batch_semaphore)
            
            for idx, decision in results:
                character_data.loc[idx, 'gpt_decision'] = decision
                count += 1
                
                if count % batch_size == 0:
                    character_data.to_csv(self.temp_file, sep='\t', index=False)
            
            progress_bar.update(len(group))
        
        progress_bar.close()
        character_data.to_csv(self.final_file, sep='\t', index=False)
        return character_data

def main():
    character_data = pd.read_csv('MovieSummaries/character_metadata_with_movies.tsv', sep='\t')
    analyzer = GPTAnalyzer()
    
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    
    result_data = loop.run_until_complete(analyzer.analyze_characters_async(character_data))
    
    print("Analysis completed!")
    print(f"Total characters processed: {len(result_data)}")
    print(f"Characters with GPT decision: {result_data['gpt_decision'].notna().sum()}")

if __name__ == "__main__":
    main()

 32%|███▏      | 33080/103071 [00:03<00:10, 6874.60it/s]

Error processing Margarita Terekhova in nan: sequence item 1: expected str instance, float found
Error processing nan in nan: sequence item 1: expected str instance, float found
Error processing Valentin Smirnitsky in nan: sequence item 1: expected str instance, float found
Error processing Irina Alfyorova in nan: sequence item 1: expected str instance, float found
Error processing Lev Durov in nan: sequence item 1: expected str instance, float found
Error processing Veniamin Smekhov in nan: sequence item 1: expected str instance, float found
Error processing Alisa Freindlich in nan: sequence item 1: expected str instance, float found
Error processing Igor Starygin in nan: sequence item 1: expected str instance, float found
Error processing Oleg Tabakov in nan: sequence item 1: expected str instance, float found


 76%|███████▌  | 78274/103071 [00:11<00:03, 6525.08it/s]


Analysis completed!
Total characters processed: 103071
Characters with GPT decision: 78265
