In [None]:
import os, json
import pandas as pd, numpy as np 

# Make the project root the working directory so the relative paths below resolve.
# NOTE(review): hard-coded absolute path — adjust when running outside this environment.
os.chdir('/home/jovyan/work/')
# Directory (relative to the working directory) that the cells below read data from and write data to.
indatadir = os.path.join('personas_code', 'data_ext')

### language detection

In [None]:
!pip install langdetect
from langdetect import detect_langs

load personas

In [None]:
# Load the persona table; later cells read its 'persona' and 'personaId' columns.
# pd.read_pickle unpickles arbitrary objects — only load files from a trusted source.
pd_personas = pd.read_pickle(os.path.join(indatadir, 'personas.pkl'))

run langdetect

In [None]:
%%time
all_langs, first_lang, first_conf = [], [], []

for i,row in pd_personas.iterrows():
    try:
        all_langs.append(detect_langs(row['persona']))
    except:
        all_langs.append(None)

extract predicted languages and probability scores

In [None]:
# Top-ranked candidate per persona (None where detection produced nothing).
best_guess = [langs[0] if langs else None for langs in all_langs]
first_lang = [None if guess is None else guess.lang for guess in best_guess]
first_conf = [None if guess is None else guess.prob for guess in best_guess]

# Attach the top language and its probability to the persona table.
pd_personas['lang'], pd_personas['conf'] = first_lang, first_conf

# Keep the full ranked candidate list per persona as well.
pd_personas.loc[:, 'all_langs'] = pd.Series(all_langs, index=pd_personas.index, dtype=object)

In [None]:
# Persist the personas together with the lang / conf / all_langs columns added above.
pd_personas.to_pickle(os.path.join(indatadir, 'persona_languages.pkl'))

manually explore predicted and actual languages

In [None]:
# Reload the frame written by the to_pickle cell above (presumably so exploration can
# start here without re-running detection). Only unpickle files from a trusted source.
pd_personas = pd.read_pickle(os.path.join(indatadir, 'persona_languages.pkl'))

languages classified as english with low confidence are not a problem; even if the confidence is lower than 0.5 and the first classified language is english, the samples are still english

In [None]:
# Number of personas whose top language is English with probability below 0.5.
pd_personas.loc[pd_personas['lang'].eq('en') & pd_personas['conf'].lt(0.5)].shape[0]

In [None]:
# Peek at the first few low-confidence English personas.
pd_personas.loc[pd_personas['lang'].eq('en') & pd_personas['conf'].lt(0.5)].head()

personas classified as non-english are a problem: some english personas are among them

In [None]:
# Everything whose top detected language is not English.
pd_foreign = pd_personas.loc[pd_personas['lang'].ne('en')]

In [None]:
# Row count of the non-English candidates.
pd_foreign.shape[0]

all personas that are zh-cn, ko, ru, zh-tw, ja and th are correctly identified as clearly non-english

In [None]:
# Personas detected as zh-cn.
int(pd_foreign['lang'].eq('zh-cn').sum())

In [None]:
# Personas detected as ko.
int(pd_foreign['lang'].eq('ko').sum())

In [None]:
# Personas detected as ru.
int(pd_foreign['lang'].eq('ru').sum())

In [None]:
# Personas detected as zh-tw.
int(pd_foreign['lang'].eq('zh-tw').sum())

In [None]:
# Personas detected as ja.
int(pd_foreign['lang'].eq('ja').sum())

In [None]:
# Personas detected as th.
int(pd_foreign['lang'].eq('th').sum())

In [None]:
# Set aside the languages checked above; only the remaining rows need closer inspection.
pd_foreign = pd_foreign.loc[
    ~pd_foreign['lang'].isin(['zh-cn', 'zh-tw', 'ko', 'ru', 'ja', 'th'])
]

2000 candidates that may have been falsely flagged as non-english are left

In [None]:
# Candidates still under review.
pd_foreign.shape[0]

all those with language ro are english

In [None]:
# Personas detected as ro.
int(pd_foreign['lang'].eq('ro').sum())

In [None]:
# Spot-check five random personas detected as ro.
pd_foreign.loc[pd_foreign['lang'].eq('ro')].sample(n=5)

In [None]:
# Empty starting point for the rows to keep.
# NOTE(review): the next cell rebinds pd_keep without reading this value, so this looks redundant.
pd_keep = pd.DataFrame()

In [None]:
# Start the keep-list with every persona detected as ro.
pd_keep = pd_foreign.loc[pd_foreign['lang'].eq('ro')]

In [None]:
# ...and take those rows out of the candidates still under review.
pd_foreign = pd_foreign.loc[pd_foreign['lang'].ne('ro')]

In [None]:
# Size of the keep-list so far.
pd_keep.shape[0]

In [None]:
# Candidates still under review.
pd_foreign.shape[0]

all those with confidence lower than 0.7 are english

In [None]:
# Candidates whose top-language probability is below 0.7.
int(pd_foreign['conf'].lt(0.7).sum())

In [None]:
# Spot-check five random low-confidence candidates.
pd_foreign.loc[pd_foreign['conf'].lt(0.7)].sample(n=5)

In [None]:
# Append the low-confidence candidates to the keep-list.
# Note: re-running this cell appends the same rows again.
pd_keep = pd.concat([pd_keep, pd_foreign.loc[pd_foreign['conf'].lt(0.7)]])

In [None]:
# Continue only with candidates at or above the 0.7 probability cut-off.
pd_foreign = pd_foreign.loc[pd_foreign['conf'].ge(0.7)]

In [None]:
# Size of the keep-list so far.
pd_keep.shape[0]

In [None]:
# Candidates still under review.
pd_foreign.shape[0]

all those with english as second detected language are english

In [None]:
# Second-ranked language per persona (None when langdetect returned a single candidate).
# .assign builds a new frame instead of writing a column into a filtered slice of the
# earlier frame, which is what triggers pandas' SettingWithCopyWarning.
pd_foreign = pd_foreign.assign(
    second=[langs[1].lang if len(langs) > 1 else None for langs in pd_foreign['all_langs']]
)

In [None]:
# Candidates whose second-ranked language is English.
int(pd_foreign['second'].eq('en').sum())

In [None]:
# Spot-check five random candidates with English as the runner-up language.
pd_foreign.loc[pd_foreign['second'].eq('en')].sample(n=5)

In [None]:
# Append the English-runner-up rows to the keep-list, without the helper column.
pd_keep = pd.concat(
    [pd_keep, pd_foreign.loc[pd_foreign['second'].eq('en')].drop(columns='second')]
)

In [None]:
# Continue only with candidates whose runner-up language is not English.
pd_foreign = pd_foreign.loc[pd_foreign['second'].ne('en')]

In [None]:
# Size of the keep-list so far.
pd_keep.shape[0]

In [None]:
# Candidates left for manual annotation.
pd_foreign.shape[0]

export remaining 965 instances for manual annotation

In [None]:
# Lowest-confidence rows first (sort_values is ascending by default).
pd_foreign = pd_foreign.sort_values(by='conf')

In [None]:
# Write the remaining candidates to an Excel sheet in the data directory for manual review.
pd_foreign.to_excel(os.path.join(indatadir, 'manual_check_lang.xlsx'))

import annotated data

In [None]:
# Read the manually annotated sheet back in. index_col belongs to read_excel, not to
# os.path.join (which rejects keyword arguments and raised a TypeError here).
# NOTE(review): assumes the annotated file sits where the export cell wrote it (indatadir) — confirm.
pd_foreign = pd.read_excel(os.path.join(indatadir, 'manual_check_lang.xlsx'), index_col=0)

In [None]:
# Rows in the annotated sheet.
pd_foreign.shape[0]

In [None]:
# Rows the annotation marked with keep == 1.
int(pd_foreign['keep'].eq(1).sum())

In [None]:
# Append the rows marked keep == 1 to the keep-list.
pd_keep = pd.concat([pd_keep, pd_foreign.loc[pd_foreign['keep'].eq(1)]])

In [None]:
# Final size of the keep-list.
pd_keep.shape[0]

re-initialize pd_foreign and drop all in pd_keep

In [None]:
# Start again from every persona whose top detected language is not English.
pd_foreign = pd_personas.loc[pd_personas['lang'].ne('en')]

In [None]:
# How many of those personas are on the keep-list (matched by personaId).
int(pd_foreign['personaId'].isin(pd_keep['personaId']).sum())

remove those that are to be kept

In [None]:
# Non-English candidates that are not on the keep-list get deleted.
pd_to_delete = pd_foreign.loc[~pd_foreign['personaId'].isin(pd_keep['personaId'])]

In [None]:
# Number of personas scheduled for deletion.
pd_to_delete.shape[0]

In [None]:
# Final sanity check on 25 random rows that will be removed.
pd_to_delete.sample(n=25)

remove non-english instances from pd_personas

In [None]:
# Reload the original persona pickle (the file read at the top of the notebook),
# replacing the frame that carries the language-detection columns.
# Only unpickle files from a trusted source.
pd_personas = pd.read_pickle(os.path.join(indatadir, 'personas.pkl'))

In [None]:
# Drop every persona whose id is on the delete-list.
pd_personas = pd_personas.loc[~pd_personas['personaId'].isin(pd_to_delete['personaId'])]

In [None]:
# Personas remaining after the language clean-up.
pd_personas.shape[0]

In [None]:
# Preserve the pre-filter row label as 'personaIndex', renumber the rows, keep the three
# columns of interest and store both id columns as integers.
# Written as one chain so no column is assigned onto the filtered slice produced by the
# previous cell (which triggers pandas' SettingWithCopyWarning).
pd_personas = (
    pd_personas
    .assign(personaIndex=lambda frame: frame.index)
    .reset_index(drop=True)
    .loc[:, ['personaId', 'personaIndex', 'persona']]
    .astype({'personaId': int, 'personaIndex': int})
)
pd_personas.head()

In [None]:
# Persist the cleaned persona table (personaId, personaIndex, persona).
pd_personas.to_pickle(os.path.join(indatadir, 'pd_personas_cleaned.pkl'))

In [None]:
from llama_cpp import Llama
import outlines
import gc

In [None]:
def create_prompt(persona):
    """Build the binary (human / non-human) classification prompt for one persona.

    Args:
        persona: Persona description; interpolated after the ``<PERSONA>`` tag.

    Returns:
        The full prompt string, ending with the ``<RESPONSE>`` tag and a trailing space.
    """
    return (
        '<TASK> Your task is to decide whether the persona described in the '
        'following text is human or non-human, e.g., an animal or an object. '
        'Respond with "human" if the described persona is human and '
        '"non-human" if the described persona is non-human. '
        f'<PERSONA> {persona}. <RESPONSE> '
    )

In [None]:
def create_long_prompt(persona):
    """Build the five-way classification prompt for one persona.

    The prompt offers the labels "human", "animal", "object",
    "institution or group" and "other".

    Args:
        persona: Persona description; interpolated after the ``<PERSONA>`` tag.

    Returns:
        The full prompt string, ending with the ``<RESPONSE>`` tag and a trailing space.
    """
    return (
        '<TASK> Your task is to decide whether the persona described in the '
        'following text is an individual human or something else. '
        'Respond with "human" if the described persona is human, '
        '"animal" if the described persona is an animal, '
        '"object" if the described persona is an object, '
        '"institution or group" if the described persona is an institution or a group of individuals, '
        'and "other" if the described persona is any other type of non-human entity. '
        f'<PERSONA> {persona}. <RESPONSE> '
    )

In [None]:
def get_prediction(prompt, generator, verbose = False):
    """Run ``generator`` on ``prompt`` and return its answer.

    Args:
        prompt: Prompt passed unchanged to ``generator``.
        generator: Callable taking the prompt and returning the answer.
        verbose: When truthy, print the prompt and the answer.

    Returns:
        Whatever ``generator(prompt)`` returns.
    """
    answer = generator(prompt)
    if verbose:
        print(prompt)
        print(answer)
    return answer

def load_llm(model_, choices):
    """Load a llama.cpp model and wrap it in a choice-constrained outlines generator.

    Args:
        model_: Sequence whose element at index 1 is the model file name; the file
            is looked up under ``models/``.
        choices: Answer options handed to ``outlines.generate.choice``.

    Returns:
        The generator produced by ``outlines.generate.choice``.
    """
    backend = Llama(
        model_path='models/' + model_[1],
        n_gpu_layers=-1,
        # n_ctx is not set here (an explicit n_ctx = 2048 was previously commented out).
        logits_all=False,
        verbose=False,
    )
    return outlines.generate.choice(outlines.models.LlamaCpp(backend), choices)

def annotate_data(prompts, generator):
    """Label every prompt with ``generator``.

    Args:
        prompts: Iterable of prompts.
        generator: Callable forwarded to ``get_prediction`` for each prompt.

    Returns:
        List of answers, in the same order as ``prompts``.
    """
    return [get_prediction(prompt, generator) for prompt in prompts]

In [None]:
# Candidate label sets for constrained generation.
choices_binary = ['TRUE', 'FALSE']
choices_scale = ['1', '2', '3', '4', '5']
choices_human = ['non-human', 'human']
choices_other = ['human', 'animal', 'object', 'institution or group', 'other']

# (display name, model file name); load_llm reads the file name from position 1.
model_name = 'Mistral7B'
model_dir = 'ggml-mistral-model-f16.gguf'
model_ = (model_name, model_dir)

# Despite its name, this generator is restricted to the five-way label set.
generator_human = load_llm(model_, choices_other)
# generator_scale = load_llm(model_, choices_scale)

In [None]:
# Plain Python list of all persona texts.
list_personas = pd_personas['persona'].tolist()

In [None]:
%time list_verdicts = [get_prediction(create_long_prompt(persona), generator_human) for persona in list_personas[:1000]]

In [None]:
# Test frame holding the first 1000 personas.
pd_test = pd.DataFrame({'persona': list_personas[:1000]})

In [None]:
# Attach the model's label to each persona.
pd_test.loc[:, 'verdict'] = list_verdicts

In [None]:
# Export personas and verdicts; the relative path resolves against the current working directory.
pd_test.to_excel('HUMAN_TEST_NEW.xlsx')

In [None]:
# Display the personas next to the model's verdicts.
pd_test

In [None]:
# generator_human is constrained to the five labels named in create_long_prompt,
# so probe it with that prompt rather than the binary human / non-human one.
get_prediction(create_long_prompt('a biology teacher at the local high school'), generator_human)

In [None]:
# generator_human is constrained to the five labels named in create_long_prompt,
# so probe it with that prompt rather than the binary human / non-human one.
get_prediction(create_long_prompt('a bot-like researcher working on automation and autonomy'), generator_human)

In [None]:
# generator_human is constrained to the five labels named in create_long_prompt,
# so probe it with that prompt rather than the binary human / non-human one.
get_prediction(create_long_prompt('a brazilian jumping spider afraid of rats'), generator_human)

In [None]:
# generator_human is constrained to the five labels named in create_long_prompt,
# so probe it with that prompt rather than the binary human / non-human one.
get_prediction(create_long_prompt('a bi-sexual unicorn with a peanut kink'), generator_human)

In [None]:
# generator_human is constrained to the five labels named in create_long_prompt,
# so probe it with that prompt rather than the binary human / non-human one.
get_prediction(create_long_prompt('a intelligent bot able to converse with humans'), generator_human)