In [1]:
import pandas as pd
from src.utils.helpers import *
from utils.constants import *
import pickle
import glob

We used GPT 4o mini to compute 6 new columns for the dataset. The columns are:

- `cold_war_side` - What side of the cold war the country belongs to if any
- `character_western_bloc_representation` - How the character representing the western block is portrayed 
- `character_eastern_bloc_representation` - How the character representing the eastern block is portrayed
- `eastern_bloc_values` - The values represented by the eastern block
- `western_bloc_values` - The values represented by the western block
- `theme` - The theme and keywords of the movie

This will be very useful later on to grasp the context of the movies and to understand the values and themes they represent.  

In [2]:
import prompt_engineering.prompt_engineering

How we queried the model for each row of the csv `merged_movies.csv` is in the python script above.  
Let's load the output and incorporate it into the dataset.

In [3]:
output_4o = pickle.load(open(DATA_FOLDER_PREPROCESSED + "output4o.pkl", "rb"))

parsed_4o = []

def parse_gpt_answer(answer):

    def split(line):
        line = re.sub(r"^[^:]*:", "", line)
        line = line.replace(", ", ",")
        # remove lasst character if " "
        if line[-1] == " ":
            line = line[:-1]
        return re.split(r",|\n", line)

    parsed_answer = {}
    
    answer = answer.replace(" \n", "\n")
    answer = answer.replace("\n\n", "\n")
    
    # Split the answer
    answer = answer.split("\n")
    
    
    try:
        # Parse the Cold War side
        parsed_answer["cold_war_side"] = answer[0]
        # Parse the Western bloc representation
        parsed_answer["character_western_bloc_representation"] = split(answer[1])
        # Parse the Eastern bloc representation
        parsed_answer["character_eastern_bloc_representation"] = split(answer[2])
        # Parse the Western bloc values;
        parsed_answer["western_bloc_values"] = split(answer[3])
        # Parse the Eastern bloc values
        parsed_answer["eastern_bloc_values"] = split(answer[4])
        # Parse the theme of the movie
        parsed_answer["theme"] = split(answer[5])
    except:
        parsed_answer = {"cold_war_side": 'None', 
                         "character_western_bloc_representation": 'None', 
                         "character_eastern_bloc_representation": 'None', 
                         "western_bloc_values": 'None', 
                         "eastern_bloc_values": 'None', 
                         "theme": 'None'}

    return parsed_answer

for answer in output_4o:
    parsed_4o.append(parse_gpt_answer(answer))

parsed_4o[0]

{'cold_war_side': 'Western ',
 'character_western_bloc_representation': ['Joe Collins',
  'American bank security consultant',
  'resourceful',
  'clever',
  'hero archetype'],
 'character_eastern_bloc_representation': ['Dawn Divine',
  'hooker with a heart of gold',
  'cunning',
  'loyal',
  'anti-heroine archetype'],
 'western_bloc_values': ['None'],
 'eastern_bloc_values': ['Resourcefulness',
  'cleverness',
  'individualism',
  'heroism'],
 'theme': ['None']}

In [4]:
movies_df = pd.read_csv(DATA_FOLDER_PREPROCESSED + "merged_movies.csv")
movies_df = movies_df.assign(**{key: [d[key] for d in parsed_4o] for key in parsed_4o[0].keys()})
movies_df

Unnamed: 0,wikipedia_id,freebase_id,title,languages,countries,genres,keywords,release_date,runtime,plot_summary,cold_war_side,character_western_bloc_representation,character_eastern_bloc_representation,western_bloc_values,eastern_bloc_values,theme
0,4213160.0,/m/0bq8q8,$,,['Soviet Union'],"['Comedy', 'Crime', 'Drama']",,1971,119.0,"Set in Hamburg, West Germany, several criminal...",Western,"[Joe Collins, American bank security consultan...","[Dawn Divine, hooker with a heart of gold, cun...",[None],"[Resourcefulness, cleverness, individualism, h...",[None]
1,,,"$1,000 on the Black","['Italiano', 'Deutsch']","['Italy', 'Germany']",['Western'],,1966,104.0,Johnny Liston has just been released from pris...,Eastern,[None],"[Sartana, villainous, oppressive, cruel, arche...","[Johnny Liston, justice, determination, resili...","[Justice, revenge, oppressed vs. oppressor, re...","[Terror, betrayal, familial conflict, crime, r..."
2,,,"$10,000 Blood Money",,['Soviet Union'],"['Western', 'Drama']",,1967,,Hired by a Mexican landowner to rescue his dau...,,[None],[None],[None],[None],"[crime, betrayal, revenge, bounty hunter, heis..."
3,,,"$100,000 for Ringo",['Italiano'],['Italy'],"['Western', 'Drama']","['spaghetti western', 'whipping']",1965,98.0,A stranger rides into Rainbow Valley where he'...,,[None],[None],[None],[None],"[Western, Civil War, mistaken identity, treasu..."
4,,,'Anna' i wampir,,['Soviet Union'],['Crime'],,1982,,"Silesia in Poland, late 60s. Bodies of vicious...",Eastern,[None],[None],[None],[None],"[murder mystery, horror, fog, Poland, 1960s]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25616,,,Şaban Oğlu Şaban,['Türkçe'],['Turkey'],['Comedy'],,1977,90.0,No overview found.,,[None],[None],[None],[None],[None ]
25617,28379533.0,/m/0crc6s_,Šíleně smutná princezna,['Český'],['Czech Republic'],"['Comedy', 'Music', 'Family']","['fairy tale', 'wedding', 'aristocrat', 'pohád...",1968,86.0,No overview found.,,[None],[None],[None],[None],[None ]
25618,,,Убить дракона,['Pусский'],"['Russia', 'Germany']","['Drama', 'Fantasy']",,1988,123.0,"Dragon is a bloody dictator, who kills every o...",Eastern,[None],"[Lancelot, heroism, freedom, knight archetype]",[None],"[Fear, oppression, totalitarianism, resistance...","[Themes of rebellion, freedom vs. oppression, ..."
25619,,,’Round Midnight,"['English', 'Français', 'Deutsch']","['France', 'United States of America']",['Drama'],"['jazz', 'musical', 'biography']",1986,133.0,Inside the Blue Note nightclub one night in 19...,,[None],[None],[None],[None],"[Jazz, Friendship, Paris, Music, Creativity]"


In [5]:
import re

global count

count = 0

def preprocess_side(row):
    # remove all non alphanumeric characters
    try:
        row['cold_war_side'] = re.sub(r'\W+', '', row['cold_war_side'])
    except:
        row['cold_war_side'] = "None"
    
    if row["character_western_bloc_representation"] is np.nan:
        row["character_western_bloc_representation"] = ['None']
    if row["character_eastern_bloc_representation"] is np.nan:
        row["character_eastern_bloc_representation"] = ['None']
    if row["western_bloc_values"] is np.nan:
        row["western_bloc_values"] = ['None']
    if row["eastern_bloc_values"] is np.nan:
        row["eastern_bloc_values"] = ['None']
    if row["theme"] is np.nan:
        row["theme"] = ['None']
        
    # print(row["western_bloc_values"][0])
    
    if (row["character_western_bloc_representation"][0] == "None" and row["character_eastern_bloc_representation"][0] == "None") and row['cold_war_side'] != "None":
    # if (row["western_bloc_values"][0] == "None" and row["eastern_bloc_values"][0] == "None") and row['cold_war_side'] != "None":
        # print(row)
        # print("-----------------")
        global count
        count += 1
        row['cold_war_side'] = "None"
    
    return row

In [10]:
movies_df2 = movies_df.apply(preprocess_side, axis=1)
movies_df2["cold_war_side"] = movies_df2["cold_war_side"].apply(lambda x: f'"{x}"')


In [11]:
movies_df2.value_counts("cold_war_side")

cold_war_side
"None"       19560
"Western"     3142
"Eastern"     2919
Name: count, dtype: int64

In [12]:
# save data in a csv
movies_df2.to_csv(DATA_FOLDER_PREPROCESSED + "v2_movies_cleaned.csv", index=False)