In [1]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Embedding matrices: a dict with one DataFrame per vector type.
# (An earlier run read "data/all_vec_dfs.pickle" instead.)
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("data/all_vector_dfs.pickle", "rb") as vectors_file:
    all_vec_dfs = pickle.load(vectors_file)

# Human judgment ratings: a dict keyed by judgment dimension name.
with open("data/dict_of_ys.pickle", "rb") as ratings_file:
    dict_of_ys = pickle.load(ratings_file)

Sudeep suggested we remove SWOW_RW because it's not derived from text, and the title of our paper refers to semantic representations from text.

In [3]:
# Remove the SWOW-RW space if it is present; do nothing when it is already gone.
if 'swow_rw' in all_vec_dfs:
    del all_vec_dfs['swow_rw']

In [4]:
# Iterating a dict yields its keys, so this lists the remaining embedding names.
vector_types = [vec_name for vec_name in all_vec_dfs]
vector_types

['word2vec',
 'fasttext',
 'glove',
 'elmo_decontext',
 'elmo_context',
 'paragram',
 'glove_postspec',
 'bert_decontext',
 'bert_context']

In [5]:
# Peek at the first five word2vec rows (items on the index, one column per dimension).
all_vec_dfs['word2vec'].head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
capable,-0.021387,0.035702,0.065195,-0.027423,-0.109693,0.072439,0.030873,0.003234,-0.03139,0.041049,...,0.002619,0.020438,-0.041393,-0.03001,-0.136598,0.100034,-0.080027,-0.039496,-0.028113,0.043291
daring,0.033601,0.023445,0.010584,0.008685,0.065684,-0.134405,0.029235,0.03455,0.043853,0.054673,...,-0.041575,0.019743,-0.090363,-0.00032,-0.083149,-0.094919,0.044802,0.024299,-0.022876,0.120737
sad,0.065318,0.015824,0.023063,-0.015403,0.061614,0.006776,0.119861,-0.008501,0.140063,0.036531,...,-0.050167,0.011532,0.000164,0.050503,-0.050503,0.0117,-0.018097,-0.02138,0.02845,-0.008796
cheerful,0.039474,0.085966,-0.012719,0.034562,-0.058597,-0.024562,0.101054,0.08737,0.045264,0.067369,...,-0.026141,0.004057,0.010921,-0.075791,-0.033158,0.033685,0.040702,-0.043685,0.117195,0.023158
committed,-0.064982,0.133554,0.133554,-0.030516,-0.061751,0.046672,0.087241,0.000875,0.06211,-0.019297,...,-0.033927,0.084728,-0.073598,-0.048467,-0.097652,-0.000718,-0.001806,0.049544,0.084728,0.015079


In [6]:
# Report (n_rows, n_columns) for every embedding space.
for vec_type in all_vec_dfs:
    vec_df = all_vec_dfs[vec_type]
    print(vec_type, vec_df.shape)

word2vec (2716, 300)
fasttext (2164, 300)
glove (2166, 300)
elmo_decontext (1400, 1024)
elmo_context (1400, 1024)
paragram (1059, 300)
glove_postspec (982, 300)
bert_decontext (1400, 768)
bert_context (1400, 768)


In [7]:
# Keep every second judgment series (positions 0, 2, 4, ...).
# NOTE(review): presumably dict_of_ys stores the two dimensions of each domain
# back to back, so this yields one series per domain -- confirm against its keys.
all_judgment_series = list(dict_of_ys.values())
one_ser_per_dom = all_judgment_series[0::2]
len(one_ser_per_dom)

7

In [8]:
# Domain order that is paired element-wise with one_ser_per_dom when counting items.
old_domain_order = 'brand product trait food occupation risk people'.split()

In [9]:
# Table of how many rated items have a vector, per (domain, embedding) pair.
item_count_df = pd.DataFrame(index=old_domain_order, columns=all_vec_dfs.keys())

# Item names are lowercased before lookup for these embeddings only.
lowercased_vec_types = ('paragram', 'glove_postspec')

for domain, y_series in zip(old_domain_order, one_ser_per_dom):
    # Vector indices use underscores where the rated item names have spaces.
    underscored_items = [item.replace(' ', '_') for item in y_series.index]

    for vec_type, vec_df in all_vec_dfs.items():
        if vec_type in lowercased_vec_types:
            needed_items = pd.Series([item.lower() for item in underscored_items])
        else:
            needed_items = pd.Series(underscored_items)

        # Contextual embeddings are stored per domain, so select the domain's
        # slice first; every other embedding has a single shared vocabulary.
        if '_context' in vec_type:
            vocabulary = vec_df.loc[domain].index
        else:
            vocabulary = vec_df.index

        item_count_df.loc[domain, vec_type] = needed_items.isin(vocabulary).sum()

item_count_df

Unnamed: 0,word2vec,fasttext,glove,elmo_decontext,elmo_context,paragram,glove_postspec,bert_decontext,bert_context
brand,199,135,135,199,199,136,89,199,199
product,200,191,191,200,200,191,185,200,200
trait,200,199,199,200,200,199,197,200,200
food,162,162,162,162,162,162,152,162,162
occupation,200,178,179,200,200,179,169,200,200
risk,200,200,200,200,200,200,198,200,200
people,197,17,17,197,197,17,17,197,197


word2vec doesn't have 200 in every domain because we've already dropped some items according to pre-reg criteria.

So we should be able to make at least decent predictions for all domains and embeddings, except perhaps brands and especially people.

For brands, we can retain the items in the glove_postspec vocabularies.

For people, since we are only modeling word2vec and the ELMo and BERT models (the only spaces with vectors for nearly all people), don't throw any items out.

For the rest, retain the items in the swow_rw vocabularies (now that swow_rw has been removed, the glove_postspec vocabularies are used instead).

May want to reduce all the spaces so that p == n (number of features == number of items)??

In [10]:
# The run that still included swow_rw wrote 'data/item_counts_per_space.xlsx';
# this run writes to a separate file so that one is not overwritten.
item_counts_path = 'data/item_counts_per_space_no_swow_rw.xlsx'
item_count_df.to_excel(item_counts_path)

# Restrict items

In [11]:
# One row per domain: (domain, its two judgment dimensions, the embedding whose
# vocabulary decides which of the domain's items are kept).
# With swow_rw available the restricting embeddings were:
# ['swow_rw', 'swow_rw', 'word2vec', 'swow_rw', 'swow_rw', 'glove_postspec', 'swow_rw']
domain_specs = [
    ('trait',      ('masculine', 'feminine'),        'glove_postspec'),
    ('risk',       ('dread-inducing', 'unknowable'), 'glove_postspec'),
    ('people',     ('warm', 'competent'),            'word2vec'),
    ('food',       ('tasty', 'nutritious'),          'glove_postspec'),
    ('occupation', ('significance', 'autonomy'),     'glove_postspec'),
    ('brand',      ('sincere', 'exciting'),          'glove_postspec'),
    ('product',    ('hedonic', 'utilitarian'),       'glove_postspec'),
]

# Unzip the table into the three parallel lists the rest of the notebook uses.
new_domain_order = [spec[0] for spec in domain_specs]
dims = [spec[1] for spec in domain_specs]
restricting_vec_types = [spec[2] for spec in domain_specs]  # without swow_rw

In [12]:
# For each domain, collect the rated items that have a vector in that domain's
# restricting embedding.
restricting_items_dict = {}

for domain, dim_pair, vec_type in zip(new_domain_order, dims, restricting_vec_types):
    first_dim = dim_pair[0]  # only one dimension per domain is consulted for item names
    item_names = [name.replace(' ', '_') for name in dict_of_ys[first_dim].index]

    # Item names are lowercased before lookup for these embeddings only.
    if vec_type in ('paragram', 'glove_postspec'):
        item_names = [name.lower() for name in item_names]

    curr_domain_items = pd.Series(item_names)
    has_vector = curr_domain_items.isin(all_vec_dfs[vec_type].index)
    restricting_items_dict[domain] = curr_domain_items[has_vector].values

In [13]:
# Domains keep different numbers of items; wrapping each array in a Series lets
# pandas pad the shorter columns with NaN.
restricting_items_df = pd.DataFrame(
    {domain: pd.Series(items) for domain, items in restricting_items_dict.items()}
)
restricting_items_df.head(10)

Unnamed: 0,trait,risk,people,food,occupation,brand,product
0,capable,car,Frank_Sinatra,stuffing,mercenary,bose,hat
1,daring,weapons,Confucius,peas,janitor,nbc,nightgown
2,sad,plane,Eric_Clapton,tripe,babysitter,pfizer,skirt
3,cheerful,scammers,Judy_Garland,semolina,psychologist,firestone,ink
4,committed,tobacco,Julia_Child,lemons,groundskeeper,facebook,ski
5,grateful,lightning,Roald_Dahl,avocado,dancer,palmolive,photography
6,depressed,gun,Lyndon_B._Johnson,oranges,programmer,samsung,apple
7,arrogant,riot,Frida_Kahlo,goulash,baker,toyota,paperclip
8,dedicated,pollution,Abraham_Lincoln,flapjacks,professor,pontiac,rowboat
9,dignified,gas,Marie_Antoinette,bacon,priest,kotex,tylenol


In [14]:
# Print the whole brand column, including the NaN padding at the end.
print(restricting_items_df['brand'].tolist())

['bose', 'nbc', 'pfizer', 'firestone', 'facebook', 'palmolive', 'samsung', 'toyota', 'pontiac', 'kotex', 'hollister', 'aol', 'nabisco', 'gillette', 'nestle', 'volvo', 'honda', 'chanel', 'espn', 'netflix', 'downy', 'meijer', 'nickelodeon', 'ajax', 'huggies', 'fresca', 'charmin', 'fanta', 'cheerios', 'jergens', 'mazda', 'volkswagen', 'maytag', 'reebok', 'google', 'subaru', 'walmart', 'merck', 'oshkosh', 'heinz', 'jello', 'hyundai', 'cheetos', 'ferrari', 'pbs', 'costco', 'ihop', 'lexus', 'advil', 'clinique', 'pampers', 'sony', 'dell', 'nike', 'lego', 'microsoft', 'disney', 'audi', 'cbs', 'prada', 'nba', 'rca', 'chevrolet', 'clorox', 'kmart', 'sephora', 'bbc', 'yahoo', 'gerber', 'purina', 'gatorade', 'nascar', 'msn', 'armani', 'wii', 'snickers', 'itunes', 'hp', 'lipton', 'iphone', 'suzuki', 'visa', 'pepsi', 'sears', 'maybelline', 'tampax', 'smirnoff', 'ipod', 'hbo', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 

In [15]:
# Number of retained (non-NaN) items per domain.
restricting_items_df.notna().sum()

trait         197
risk          198
people        197
food          152
occupation    169
brand          89
product       185
dtype: int64

# Modeling

In [16]:
import warnings
from multiprocessing import Pool
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt
import numpy as np
from sklearn.preprocessing import normalize

# Show each distinct warning only the first time it occurs, to keep the long
# modeling loops from flooding the output.
warnings.simplefilter("once")

In [17]:
# Reminder: this order was only used to pair domains with one_ser_per_dom for the item counts.
old_domain_order

['brand', 'product', 'trait', 'food', 'occupation', 'risk', 'people']

In [18]:
# Domain order used by the modeling loops; it is zipped element-wise with dims.
new_domain_order

['trait', 'risk', 'people', 'food', 'occupation', 'brand', 'product']

In [19]:
# One pair of judgment dimensions per domain, aligned with new_domain_order.
dims

[('masculine', 'feminine'),
 ('dread-inducing', 'unknowable'),
 ('warm', 'competent'),
 ('tasty', 'nutritious'),
 ('significance', 'autonomy'),
 ('sincere', 'exciting'),
 ('hedonic', 'utilitarian')]

In [20]:
# Preview of the scikit-learn normalization that the modeling loops apply to each
# restricted embedding. The result is displayed but not assigned, so all_vec_dfs
# itself is not changed by this cell.
normalize(all_vec_dfs['word2vec'])

array([[-0.0213866 ,  0.0357019 ,  0.06519471, ..., -0.0394963 ,
        -0.0281131 ,  0.0432907 ],
       [ 0.0336013 ,  0.023445  ,  0.0105835 , ...,  0.0242993 ,
        -0.0228755 ,  0.1207369 ],
       [ 0.0653178 ,  0.0158244 ,  0.0230632 , ..., -0.0213798 ,
         0.0284503 , -0.008796  ],
       ...,
       [ 0.0357937 ,  0.0180345 ,  0.0202372 , ...,  0.0564439 ,
        -0.14647891, -0.0007185 ],
       [-0.063179  , -0.0072393 , -0.0418423 , ..., -0.0220295 ,
         0.0689982 , -0.003966  ],
       [-0.0638335 , -0.0517655 ,  0.0330283 , ...,  0.0209603 ,
         0.0527182 ,  0.0276294 ]])

In [21]:
# Flatten the per-domain pairs into one list of dimension names, keeping order.
flattened_dims = []
for dim_pair in dims:
    flattened_dims.extend(dim_pair)
print(flattened_dims)

['masculine', 'feminine', 'dread-inducing', 'unknowable', 'warm', 'competent', 'tasty', 'nutritious', 'significance', 'autonomy', 'sincere', 'exciting', 'hedonic', 'utilitarian']


In [22]:
# First ten retained items per domain.
restricting_items_df.iloc[:10]

Unnamed: 0,trait,risk,people,food,occupation,brand,product
0,capable,car,Frank_Sinatra,stuffing,mercenary,bose,hat
1,daring,weapons,Confucius,peas,janitor,nbc,nightgown
2,sad,plane,Eric_Clapton,tripe,babysitter,pfizer,skirt
3,cheerful,scammers,Judy_Garland,semolina,psychologist,firestone,ink
4,committed,tobacco,Julia_Child,lemons,groundskeeper,facebook,ski
5,grateful,lightning,Roald_Dahl,avocado,dancer,palmolive,photography
6,depressed,gun,Lyndon_B._Johnson,oranges,programmer,samsung,apple
7,arrogant,riot,Frida_Kahlo,goulash,baker,toyota,paperclip
8,dedicated,pollution,Abraham_Lincoln,flapjacks,professor,pontiac,rowboat
9,dignified,gas,Marie_Antoinette,bacon,priest,kotex,tylenol


In [23]:
# Ridge hyperparameter grid: alpha = 10**-2, 10**-1, ..., 10**7 (ten settings).
list_of_params = []
for exponent in range(-2, 8):
    list_of_params.append({'alpha': 10 ** exponent})

In [24]:
# Number of random train/test splits that many_test_train_splits averages over
# for each hyperparameter setting.
n_test_train_splits = 25 # TO TEST THIS CODE MORE QUICKLY, SET THIS VALUE CLOSER TO 5 OR 10

In [25]:
def rmse_score(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return sqrt(mse)

In [26]:
def many_test_train_splits(parameters, data=None, n_splits=None, random_state=None):
    """Average Ridge test-set R^2 and RMSE over repeated random 90/10 splits.

    Parameters
    ----------
    parameters : dict
        Keyword arguments forwarded to ``Ridge`` (e.g. ``{'alpha': 10}``).
    data : pandas.DataFrame, optional
        Feature columns plus a ``'judgment'`` target column. When omitted, the
        module-level ``vecs_and_judgment`` is used; this is what callers that
        pass only ``parameters`` (such as ``Pool.map``) rely on.
    n_splits : int, optional
        Number of random splits to average over. Defaults to the module-level
        ``n_test_train_splits``.
    random_state : int or numpy.random.RandomState, optional
        Seed or generator for the shuffles and splits. ``None`` (the default)
        passes ``random_state=None`` through, so the calls draw from NumPy's
        global random state exactly as before.

    Returns
    -------
    tuple
        ``(mean R^2, mean RMSE)`` across the splits.
    """
    frame = vecs_and_judgment if data is None else data
    n_repeats = n_test_train_splits if n_splits is None else n_splits

    # A single generator object is shared by every draw so that a fixed seed
    # still produces a different shuffle/split on each repetition.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)
    # NOTE(review): with the default global state, forked Pool workers presumably
    # all start from the same inherited NumPy state -- confirm if that matters.

    rsquareds = []
    rmses = []
    # DataFrame.sample returns a new frame, so the caller's data is never
    # modified and no defensive deep copy is needed.
    shuffled = frame
    for _ in range(n_repeats):
        shuffled = shuffled.sample(frac=1, random_state=rng)
        # Select features by name instead of assuming 'judgment' is the last column.
        X = shuffled.drop(columns='judgment')
        y = shuffled['judgment']
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.1, random_state=rng
        )

        regression = Ridge(**parameters)
        regression.fit(X=X_train, y=y_train)
        y_pred = regression.predict(X=X_test)

        rsquareds.append(r2_score(y_test, y_pred))
        rmses.append(rmse_score(y_test, y_pred))

    mean_rsquared = np.mean(rsquareds)
    mean_rmses = np.mean(rmses)
    return mean_rsquared, mean_rmses

In [27]:
# all_vec_dfs.keys()

In [28]:
# all_vec_dfs = {'bert_decontext': all_vec_dfs['bert_decontext'],
#                'bert_context':   all_vec_dfs['bert_context'],
#                }

In [None]:
# Repeated-split cross-validation of Ridge models for every embedding, domain and
# judgment dimension. Writes one R^2 table and one RMSE table per embedding
# (rows = hyperparameter settings, columns = judgment dimensions).
for vec_type, vec_df in all_vec_dfs.items():
    # De-duplicate into a new frame instead of mutating the one stored in
    # all_vec_dfs, so re-running earlier cells still sees the original data.
    vec_df = vec_df.drop_duplicates()  # I really should do this when we *get* the vectors....
    print(vec_type)
    model_results = []
    for domain, dim_pair in zip(new_domain_order, dims):
        print('\t', domain)
        if ((vec_type == 'swow_rw' and domain in ['brand', 'people']) or
           (vec_type not in ['word2vec', 'elmo_context', 'elmo_decontext', 'bert_context', 'bert_decontext'] and domain == 'people')):
            # need to just return some place-holder for the eventual dataframe
            # since there's not enough data to make it worth building models in these cases
            not_enough_place_holder = [('not_enough_vectors', 'not_enough_vectors')] * len(list_of_params)
            model_results.append(not_enough_place_holder)
            model_results.append(not_enough_place_holder)  # must put in one place holder for each dimension in this domain
            continue

        # Contextual embeddings are stored per domain; select this domain's slice.
        if '_context' in vec_type:
            modifiable_vec_df = vec_df.loc[domain].copy()
        else:
            modifiable_vec_df = vec_df.copy()

        curr_restricted_items = restricting_items_df[domain].dropna().apply(str.lower).values
        modifiable_vec_df.index = modifiable_vec_df.index.map(str.lower)
        # Select with a boolean mask rather than .loc[list]: .loc raises KeyError
        # in current pandas when any requested item has no vector, whereas the
        # intent here is simply to keep the items that do. Row order follows the
        # embedding rather than the item list; rows are shuffled before fitting.
        has_restricted_item = modifiable_vec_df.index.isin(curr_restricted_items)
        restricted_vec_df = modifiable_vec_df.loc[has_restricted_item].dropna()
        restricted_vec_df = pd.DataFrame(normalize(restricted_vec_df), index=restricted_vec_df.index, columns=restricted_vec_df.columns)

        for dim in dim_pair:
            # Copy before relabelling: assigning to .index on the Series held in
            # dict_of_ys would permanently lowercase it, which breaks the earlier
            # cells that look items up by their original-case names.
            ys = dict_of_ys[dim].copy()
            ys.index = ys.index.map(lambda x: x.lower().replace(' ', '_'))
            ys = ys.to_frame()
            # many_test_train_splits reads this module-level frame, so the pool
            # must be created only after it has been rebuilt for this dimension.
            vecs_and_judgment = pd.merge(left=restricted_vec_df, right=ys, left_index=True, right_index=True)
            with Pool() as p:
                scores_by_hyperparam = p.map(many_test_train_splits, list_of_params)
            model_results.append(scores_by_hyperparam)

    # Each cell holds a (mean R^2, mean RMSE) tuple, or a pair of placeholder strings.
    model_results = pd.DataFrame(data=model_results, index=flattened_dims, columns=list_of_params).T
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; use whichever exists.
    elementwise = model_results.map if hasattr(model_results, 'map') else model_results.applymap
    rsquared_df = elementwise(lambda scores: scores[0])
    rmse_df     = elementwise(lambda scores: scores[1])

    rsquared_df.to_csv(f'results/preregistered_models_diff_embeddings/rsquared/{vec_type}_all_judgments.csv', float_format='%.2f')
    rmse_df.to_csv(    f'results/preregistered_models_diff_embeddings/rmse/{vec_type}_all_judgments.csv',     float_format='%.2f')

# Leave-one-out predicted vs actual correlations

In [29]:
from ast import literal_eval
from sklearn.model_selection import LeaveOneOut
from scipy.stats import pearsonr

In [30]:
# all_vec_dfs = {'fasttext': all_vec_dfs['fasttext']
#                }

In [31]:
# Skip people and brands now.
# NOTE: this rebinds new_domain_order, dims and flattened_dims, so re-running the
# cross-validation cell after this one would use the reduced set of domains.
loo_domain_specs = [
    ('trait',      ('masculine', 'feminine')),
    ('risk',       ('dread-inducing', 'unknowable')),
    ('food',       ('tasty', 'nutritious')),
    ('occupation', ('significance', 'autonomy')),
    ('product',    ('hedonic', 'utilitarian')),
]

new_domain_order = [spec[0] for spec in loo_domain_specs]
dims = [spec[1] for spec in loo_domain_specs]

flattened_dims = []
for loo_dim_pair in dims:
    flattened_dims.extend(loo_dim_pair)

In [32]:
# Leave-one-out evaluation: for every embedding and judgment dimension, predict
# each item from a Ridge model fit on all other items, then correlate the
# predictions with the actual ratings.
all_vec_type_results = []

for vec_type, vec_df in all_vec_dfs.items():
    prereg_cv_results = pd.read_csv(f'results/preregistered_models_diff_embeddings/rsquared/{vec_type}_all_judgments.csv', index_col=0)
    # Columns for skipped domains contain the 'not_enough_vectors' placeholder
    # string; coerce them to NaN so the row mean covers only the real scores
    # (averaging string columns raises in current pandas).
    numeric_cv_results = prereg_cv_results.apply(pd.to_numeric, errors='coerce')
    # The row label is the repr of the parameter dict, e.g. "{'alpha': 10}".
    best_hyperparam = numeric_cv_results.mean(axis=1).idxmax()
    best_ridge_params = literal_eval(best_hyperparam)

    # De-duplicate into a new frame instead of mutating the one in all_vec_dfs.
    vec_df = vec_df.drop_duplicates()  # I really should do this when we *get* the vectors....
    print(vec_type)
    curr_vec_type_results = []
    for domain, dim_pair in zip(new_domain_order, dims):

        # Contextual embeddings are stored per domain; select this domain's slice.
        if '_context' in vec_type:
            modifiable_vec_df = vec_df.loc[domain].copy()
        else:
            modifiable_vec_df = vec_df.copy()

        curr_restricted_items = restricting_items_df[domain].dropna().apply(str.lower).values
        modifiable_vec_df.index = modifiable_vec_df.index.map(str.lower)
        # Boolean mask instead of .loc[list]: .loc raises KeyError in current
        # pandas when any requested item has no vector in this embedding.
        has_restricted_item = modifiable_vec_df.index.isin(curr_restricted_items)
        restricted_vec_df = modifiable_vec_df.loc[has_restricted_item].dropna()
        restricted_vec_df = pd.DataFrame(normalize(restricted_vec_df), index=restricted_vec_df.index, columns=restricted_vec_df.columns)

        for dim in dim_pair:
            # Copy before relabelling so the Series stored in dict_of_ys keeps
            # its original-case index for the earlier cells.
            ys = dict_of_ys[dim].copy()
            ys.index = ys.index.map(lambda x: x.lower().replace(' ', '_'))
            ys = ys.to_frame()
            vecs_and_judgment = pd.merge(left=restricted_vec_df, right=ys, left_index=True, right_index=True)

            ridge = Ridge(**best_ridge_params)
            loo = LeaveOneOut()

            X = vecs_and_judgment.drop('judgment', axis='columns').values
            # Use a plain array: LeaveOneOut yields integer positions, and
            # indexing a label-indexed Series by position is deprecated.
            y = vecs_and_judgment['judgment'].values

            y_preds = np.zeros(shape=len(y))
            for train_index, test_index in loo.split(X):
                ridge.fit(X=X[train_index], y=y[train_index])
                y_preds[test_index] = ridge.predict(X=X[test_index])[0]

            # Index the result rather than unpacking into `_`, which would
            # overwrite IPython's last-output variable.
            r = pearsonr(y_preds, y)[0]
            curr_vec_type_results.append(r)
    all_vec_type_results.append(curr_vec_type_results)

# Rows = embeddings, columns = judgment dimensions.
all_vec_type_results = pd.DataFrame(all_vec_type_results, columns=flattened_dims, index=all_vec_dfs.keys())

word2vec
fasttext
glove
elmo_decontext
elmo_context
paragram
glove_postspec
bert_decontext
bert_context


In [33]:
# Leave-one-out predicted-vs-actual Pearson correlations: one row per embedding,
# one column per judgment dimension.
all_vec_type_results

Unnamed: 0,masculine,feminine,dread-inducing,unknowable,tasty,nutritious,significance,autonomy,hedonic,utilitarian
word2vec,0.744364,0.803668,0.88618,0.869653,0.670392,0.843523,0.830329,0.81612,0.836901,0.777157
fasttext,0.76117,0.815241,0.875286,0.835785,0.71677,0.865361,0.793513,0.837047,0.809798,0.755734
glove,0.803073,0.85151,0.882312,0.853384,0.706478,0.87848,0.83289,0.805618,0.832507,0.784699
elmo_decontext,0.72466,0.756377,0.90016,0.863206,0.61231,0.741526,0.82533,0.787053,0.788508,0.761295
elmo_context,0.688046,0.750063,0.878969,0.851258,0.455575,0.665412,0.831495,0.779633,0.738257,0.658475
paragram,0.789499,0.838304,0.830493,0.809434,0.663651,0.855733,0.751094,0.668672,0.763002,0.746232
glove_postspec,0.836223,0.882729,0.765378,0.757595,0.499151,0.824946,0.759834,0.71259,0.650815,0.650016
bert_decontext,0.706098,0.719789,0.816435,0.754342,0.464481,0.525528,0.708295,0.71711,0.674476,0.663566
bert_context,0.723288,0.719834,0.799203,0.720404,0.446455,0.508184,0.648374,0.730803,0.633624,0.618687


In [34]:
# all_vec_type_results.to_csv('results/preregistered_models_diff_embeddings/out-of-sample_pred-v-actual_corrs.csv')

In [35]:
# Save the correlation table for the run without swow_rw under its own file name.
loo_corrs_path = 'results/preregistered_models_diff_embeddings/out-of-sample_pred-v-actual_corrs_no_swow_rw.csv'
all_vec_type_results.to_csv(loo_corrs_path)