### Loading data

In [1]:
import sys
import os

# Put the directory two levels above the working directory on the import path
# (once only) so that `src.*` packages can be imported from this notebook.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
# Relative path from the notebook's working directory to the project root;
# used as the prefix of every data file path opened in this notebook.
PATH_TO_ROOT = '../..'

In [5]:
%load_ext autoreload
%autoreload 2

import json
import pandas as pd

# Source: https://pralav.github.io/emnlp_personas/ (Character Trope Description Dataset)
with open(f'{PATH_TO_ROOT}/src/data/trope2characters.json', 'r') as file:
    t2c = json.load(file)

# Each trope maps to a list of JSON-encoded strings; decode every entry so that
# the values become lists of character dicts.
t2c = {
    trope: [json.loads(encoded) for encoded in encoded_characters]
    for trope, encoded_characters in t2c.items()
}


# Headerless TSV, so column names are supplied by hand. Only `idx1` (used below
# as the key into the plot summaries) and `char_idx` (matched against the trope
# dataset's character id) are read later in this notebook.
# NOTE(review): the remaining names ('idk', 'idk2', ...) are placeholders —
# confirm their meaning against the MovieSummaries README before relying on them.
character_meta_columns = [
    'idx1', 'idx2', 'date', 'name', 'idk', 'gender', 'idk2',
    'idk4', 'name2', 'idk3', 'char_idx', 'idk5', 'idk6',
]
with open(f'{PATH_TO_ROOT}/src/data/MovieSummaries/character.metadata.tsv', 'r') as file:
    character_meta_df = pd.read_csv(file, sep='\t', names=character_meta_columns)


# Map integer movie id -> plot summary. Each line of the file has the form
# "<id>\t<summary>"; only the first tab separates the two fields.
idx_to_summary = {}
with open(f'{PATH_TO_ROOT}/src/data/MovieSummaries/plot_summaries.txt', 'r') as file:
    for line in file:
        raw_id, summary_text = line.split('\t', 1)
        idx_to_summary[int(raw_id)] = summary_text

# Collect one record per (persona, character) pair, then build the frame once.
persona_rows = []

for persona, characters in t2c.items():
    for character in characters:
        # Locate this character's metadata row; `.item()` raises unless exactly
        # one row matches, so missing or ambiguous ids fail loudly.
        is_this_character = character_meta_df.char_idx == character['id']
        ds_idx = character_meta_df.loc[is_this_character, 'idx1'].item()
        persona_rows.append({
            'character_name': character['char'].strip(),
            'movie_name': character['movie'].strip(),
            'persona': persona.strip(),
            'movie_summary': idx_to_summary[ds_idx].strip(),
        })

persona_identification_df = pd.DataFrame(persona_rows)
persona_identification_df

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Unnamed: 0,character_name,movie_name,persona,movie_summary
0,Jessica Rabbit,Who Framed Roger Rabbit,chanteuse,"In 1947, cartoon characters, commonly called ""..."
1,Tina Carlyle,The Mask,chanteuse,Shy and luckless clerk Stanley Ipkiss works at...
2,Susie Diamond,The Fabulous Baker Boys,chanteuse,"The Fabulous Baker Boys, Jack and Frank , are..."
3,Sugar Kane Kowalczyk,Some Like It Hot,chanteuse,It is February 1929 in the city of Chicago. Tw...
4,Dorothy Vallens,Blue Velvet,chanteuse,Jeffrey Beaumont returns to his logging home ...
...,...,...,...,...
496,Lt. Commander Ron Hunter,Crimson Tide,father_to_his_men,"In post-Soviet Russia, military units loyal to..."
497,Christopher Pike,Star Trek,father_to_his_men,"In 2233, the Federation starship USS Kelvin is..."
498,Zordon,Mighty Morphin' Power Rangers: The Movie,father_to_his_men,"The Power Rangers, Adam , Kimberly , Billy , A..."
499,Wolfstan,Black Death,father_to_his_men,The story takes place in 1348 in plague-ridden...


In [6]:
from sklearn.model_selection import train_test_split

# Keep only personas with more than two characters so that every class is
# large enough for the stratified split below.
persona_counts = persona_identification_df.groupby('persona').size()
frequent_personas = persona_counts[persona_counts > 2].index
persona_identification_df = persona_identification_df[persona_identification_df.persona.isin(frequent_personas)]

# Fixed seed so that re-running the notebook reproduces the same splits.
SPLIT_SEED = 42

# 70% train (stratified by persona), 30% held out.
train_df, holdout_df = train_test_split(
    persona_identification_df,
    stratify=persona_identification_df.persona,
    test_size=0.3,
    random_state=SPLIT_SEED,
)

# Split the held-out rows — not the full frame — into validation and test.
# Re-splitting the full frame would make val/test overlap the training rows.
val_df, test_df = train_test_split(holdout_df, test_size=0.25, random_state=SPLIT_SEED)

In [7]:
# Persist the splits to the CSV files that the "Actual training" section reads back.
# Presumably left commented out so a re-run does not overwrite the saved splits;
# uncomment to regenerate the files.
#test_df.to_csv('persona_identification_testset.csv')
#train_df.to_csv('persona_identification_train.csv')
#val_df.to_csv('persona_identification_val.csv')

### Actual training

In [65]:
# Reload the previously saved test / train / validation splits from disk.
test_df, train_df, val_df = map(pd.read_csv, (
    'persona_identification_testset.csv',
    'persona_identification_train.csv',
    'persona_identification_val.csv',
))

In [67]:
from src.models.persona_identification import persona_capitalize

# Display-form label for every persona present in the training split,
# in order of first appearance.
unique_personas = [persona_capitalize(raw_name) for raw_name in train_df.persona.unique()]
unique_personas[:4]

['Stupid Crooks', 'Egomaniac Hunter', 'Stoner', 'Byronic Hero']

In [74]:
from src.models.persona_identification import PersonaIdentification
from langchain_core.prompts import PromptTemplate

# Instruction header shared by every example; the archetype list is appended below.
SYSTEM_PROMPT = """You are a movie expert specializing in character archetypes. You are tasked with identifying the most appropriate archetype for the provided character. 
Choose from the list of the following archetypes:

"""
# One archetype per line, followed by a final newline.
SYSTEM_PROMPT = SYSTEM_PROMPT + '\n'.join(unique_personas) + '\n'

# User-turn template: movie name, movie summary, then the question about one character.
template = (
    "\nMovie name:\n{movie_name}\n"
    "\nMovie summary:\n{movie_summary}\n"
    "\nQuestion:\nWith which of the provided archetypes does \"{character_name}\" align best?\n"
)
prompt_template = PromptTemplate(
    template=template,
    input_variables=["movie_name", "movie_summary", "character_name"],
)

# Render one prompt per row of the training split.
train_records = train_df.to_dict(orient='records')
prompts = prompt_template.batch(train_records)

def make_train(df):
    """Build supervised fine-tuning examples from a split.

    The prompts are rendered from ``df`` itself, so the function is correct for
    any split (train, validation, test). Reading a globally pre-rendered prompt
    list by position would pair inputs from one split with labels from another.

    Args:
        df: DataFrame with the columns ``movie_name``, ``movie_summary``,
            ``character_name`` and ``persona``.

    Returns:
        A list with one dict per row of ``df``, in row order, with the keys
        ``system_prompt``, ``text_input`` and ``output``.
    """
    records = df.to_dict(orient='records')
    # Render the user-turn text for exactly these rows.
    rendered_prompts = prompt_template.batch(records)

    return [
        {
            'system_prompt': SYSTEM_PROMPT,
            'text_input': rendered.text,
            'output': persona_capitalize(record['persona']),
        }
        for rendered, record in zip(rendered_prompts, records)
    ]

training_data = make_train(train_df)

# Preview the first example exactly as its three parts would be concatenated.
first_example = training_data[0]
for field in ('system_prompt', 'text_input', 'output'):
    print(first_example[field], end='')

You are a movie expert specializing in character archetypes. You are tasked with identifying the most appropriate archetype for the provided character. 
Choose from the list of the following archetypes:

Stupid Crooks
Egomaniac Hunter
Stoner
Byronic Hero
Crazy Jealous Guy
The Chief
Charmer
Valley Girl
Prima Donna
Corrupt Corporate Executive
Heartbroken Badass
Trickster
Retired Outlaw
Revenge
Arrogant Kungfu Guy
Father To His Men
Consummate Professional
Bruiser With A Soft Center
Playful Hacker
Hitman With A Heart
Fastest Gun In The West
Casanova
Henpecked Husband
Big Man On Campus
Dumb Blonde
Surfer Dude
Granola Person
Hardboiled Detective
Warrior Poet
Grumpy Old Man
Bully
Master Swordsman
Bromantic Foil
Gentleman Thief
Brainless Beauty
Jerk Jock
Final Girl
Broken Bird
Ditz
Evil Prince
Psycho For Hire
Slacker
Ophelia
Eccentric Mentor
Drill Sargeant Nasty
Cultured Badass
Doormat
Tranquil Fury
Romantic Runnerup
Coward
Loser Protagonist
Adventurer Archaeologist
Gadgeteer Genius
Chanteuse


In [76]:
import json

def change_data_format(example):
    """Convert one training example into the nested chat-style record.

    Args:
        example: dict with the keys ``system_prompt``, ``text_input`` and
            ``output``.

    Returns:
        A dict with ``systemInstruction`` (role ``system``) and ``contents``
        (a ``user`` turn followed by a ``model`` turn); each entry carries its
        text in a single-element ``parts`` list.
    """
    def _message(role, text):
        # Every entry shares the same {'role', 'parts': [{'text'}]} shape.
        return {'role': role, 'parts': [{'text': text}]}

    return {
        "systemInstruction": _message("system", example['system_prompt']),
        'contents': [
            _message('user', example['text_input']),
            _message('model', example['output']),
        ],
    }

# Convert both splits into the nested record format expected by the JSONL dump.
train_data = list(map(change_data_format, training_data))
val_data = list(map(change_data_format, make_train(val_df)))

def dump_jsonl(data, path):
    """Write ``data`` to ``path`` as JSON Lines.

    Args:
        data: iterable of JSON-serialisable objects.
        path: destination file; created if missing, truncated if it exists.

    Each object is serialised with ``json.dumps`` onto its own line, and every
    line (including the last) ends with a newline.
    """
    with open(path, 'w+') as sink:
        sink.writelines(json.dumps(record) + '\n' for record in data)

# Write both splits as JSONL; the output is uploaded to Google Cloud's Vertex AI
# for training in the web interface.
for split_records, split_path in (
    (train_data, 'persona_train.jsonl'),
    (val_data, 'persona_val.jsonl'),
):
    dump_jsonl(split_records, split_path)