In [12]:
import nltk
import pandas as pd
from wordfreq import zipf_frequency

# Make sure the POS-tagger model required by the nltk tagging step further down
# is available locally before it is used.
# NOTE(review): newer NLTK releases may expect the resource name
# 'averaged_perceptron_tagger_eng' instead — confirm against the installed version.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/zara/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [17]:
# Name table: provides (at least) the 'firstname' and 'Group' columns used when
# the sentence dataset is assembled later on.
names = pd.read_csv('./data/global-bias/name_groups.csv')

# Warmth/competence lexicon: tab-separated text file whose first row is the header.
wow = pd.read_csv('data/words-of-warmth/NRC-WCST-Lexicon-v1.0.txt', sep='\t', header=0)

# Normalise the header names so columns can be addressed as plain identifiers.
wow.columns = (wow.columns
                .str.strip()                              # remove leading/trailing spaces
                .str.replace(r' \(.+\)', '', regex=True)  # drop the parentheses + their contents
                .str.replace(' ', '_', regex=False)       # literal space -> underscore (default of `regex` differs across pandas versions)
                .str.lower())                             # all-lowercase

In [18]:
# 1) Discard rows without a term and trim surrounding whitespace from the rest.
wow = wow.dropna(subset=['term']).copy()
wow['term'] = wow['term'].astype(str).str.strip()

# nltk.pos_tag() constructs a new PerceptronTagger (re-reading the model from
# disk) on every call. Build the tagger once and reuse it for every term; the
# tags produced are the same ones nltk.pos_tag() returns for English input.
pos_tagger = nltk.tag.PerceptronTagger()

# 2) Tag + score + filter in one go
wow_common = (
    wow
    .assign(
        # Each term is tagged on its own, as a one-token sentence.
        pos=lambda df: df['term'].apply(lambda w: pos_tagger.tag([w])[0][1]),
        # English word frequency on the Zipf scale (higher = more frequent).
        zipf=lambda df: df['term'].apply(lambda w: zipf_frequency(w, 'en'))
    )
    # Keep adjectives (tags beginning with 'JJ') whose Zipf frequency is >= 4.0.
    .query("pos.str.startswith('JJ') and zipf >= 4.0", engine='python')
    # The helper columns were only needed for filtering.
    .drop(columns=['pos','zipf'])
)

In [20]:
# Competence extremes: the 20 highest- and the 20 lowest-scoring common adjectives.
top20_competence, bottom20_competence = (
    wow_common.nlargest(20, 'competence'),
    wow_common.nsmallest(20, 'competence'),
)

# Warmth extremes, selected the same way.
top20_warmth, bottom20_warmth = (
    wow_common.nlargest(20, 'warmth'),
    wow_common.nsmallest(20, 'warmth'),
)

In [21]:
# Display the name table as the cell output (pandas abbreviates the middle rows).
names

Unnamed: 0.1,Unnamed: 0,firstname,Cluster,Ethnicity,Ethnicity Probability,Gender,Group
0,0,Abdourahamane,105,AFRICAN,65.905,M,"('AFRICAN', 'M')"
1,1,Adebayo,43,AFRICAN,93.061,M,"('AFRICAN', 'M')"
2,2,Chuka,43,AFRICAN,46.488,M,"('AFRICAN', 'M')"
3,3,Abdoul,105,AFRICAN,59.780,M,"('AFRICAN', 'M')"
4,4,Mohamadou,105,AFRICAN,86.458,M,"('AFRICAN', 'M')"
...,...,...,...,...,...,...,...
395,395,Drew,90,ENGLISH,99.020,M,"('ENGLISH', 'M')"
396,396,Jordon,52,ENGLISH,98.925,M,"('ENGLISH', 'M')"
397,397,Seymour,90,ENGLISH,95.605,M,"('ENGLISH', 'M')"
398,398,Halsey,90,ENGLISH,99.393,M,"('ENGLISH', 'M')"


In [None]:
def build_name_templates(descriptor_sets, template_patterns):
    """
    Expand every (category, descriptor, template) combination into one row.

    descriptor_sets: dict mapping category names to list-of-terms, e.g.
      {
        'high_competence': top20_competence['term'].tolist(),
        'low_competence':  bottom20_competence['term'].tolist(),
        'high_warmth':     top20_warmth['term'].tolist(),
        'low_warmth':      bottom20_warmth['term'].tolist()
      }
    template_patterns: list of str, each expected to contain the placeholders
      "{name}" and "{descriptor}". The patterns are stored as-is; they are
      neither validated nor formatted here.

    Returns a DataFrame with the columns 'template', 'descriptor' and
    'word_category', ordered by category, then descriptor, then template.
    The three columns are present even when there are no combinations, so
    callers can always select them by name.
    """
    columns = ['template', 'descriptor', 'word_category']
    rows = [
        (tpl, descriptor, category)
        for category, terms in descriptor_sets.items()
        for descriptor in terms
        for tpl in template_patterns
    ]
    # Passing `columns` explicitly keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=columns)

# Category label -> list of descriptor terms, taken from the 'term' column of
# each of the four selected frames.
descriptor_sets = {
    label: frame['term'].tolist()
    for label, frame in (
        ('high_competence', top20_competence),
        ('low_competence', bottom20_competence),
        ('high_warmth', top20_warmth),
        ('low_warmth', bottom20_warmth),
    )
}

# Sentence frames; "{name}" and "{descriptor}" are filled in later.
template_patterns = [
    "{name} is {descriptor}.",
    "Everyone says {name} is {descriptor}.",
    "People think {name} is {descriptor}.",
    "I find {name} quite {descriptor}.",
]

# One row per (category, descriptor, template) combination.
templates_df = build_name_templates(
    descriptor_sets=descriptor_sets,
    template_patterns=template_patterns,
)

templates_df


Unnamed: 0,template,descriptor,word_category
0,{name} is {descriptor}.,powerful,high_competence
1,Everyone says {name} is {descriptor}.,powerful,high_competence
2,People think {name} is {descriptor}.,powerful,high_competence
3,I find {name} quite {descriptor}.,powerful,high_competence
4,{name} is {descriptor}.,influential,high_competence
...,...,...,...
315,I find {name} quite {descriptor}.,isolated,low_warmth
316,{name} is {descriptor}.,depressed,low_warmth
317,Everyone says {name} is {descriptor}.,depressed,low_warmth
318,People think {name} is {descriptor}.,depressed,low_warmth


In [None]:

# Cross-join: pair every row of `names` with every row of `templates_df`.
# how='cross' does this without adding a helper '_key' column to either input
# frame, so both stay unchanged and the cell can be re-run safely.
dataset = pd.merge(names, templates_df, how='cross')

# Populate the templates: replace {name} and {descriptor}
dataset['text'] = [
    template.format(name=firstname, descriptor=descriptor)
    for template, firstname, descriptor in zip(
        dataset['template'], dataset['firstname'], dataset['descriptor']
    )
]

# Reorder columns for clarity
cols = ['text', 'word_category', 'firstname', 'Group', 'descriptor']
dataset = dataset[cols]

# NOTE(review): the row index is written as an unnamed first column; pass
# index=False here if consumers of the file do not need it.
dataset.to_csv("./data/wow-gb-dataset/template_dataset.csv")
dataset.tail(10)



Unnamed: 0,text,word_category,firstname,Group,descriptor
127990,People think Nate is guilty.,low_warmth,Nate,"('ENGLISH', 'M')",guilty
127991,I find Nate quite guilty.,low_warmth,Nate,"('ENGLISH', 'M')",guilty
127992,Nate is isolated.,low_warmth,Nate,"('ENGLISH', 'M')",isolated
127993,Everyone says Nate is isolated.,low_warmth,Nate,"('ENGLISH', 'M')",isolated
127994,People think Nate is isolated.,low_warmth,Nate,"('ENGLISH', 'M')",isolated
127995,I find Nate quite isolated.,low_warmth,Nate,"('ENGLISH', 'M')",isolated
127996,Nate is depressed.,low_warmth,Nate,"('ENGLISH', 'M')",depressed
127997,Everyone says Nate is depressed.,low_warmth,Nate,"('ENGLISH', 'M')",depressed
127998,People think Nate is depressed.,low_warmth,Nate,"('ENGLISH', 'M')",depressed
127999,I find Nate quite depressed.,low_warmth,Nate,"('ENGLISH', 'M')",depressed
