In [1]:
import pandas as pd
import numpy as np
from itertools import combinations
from nlp_funcs import load_glove, tokenizer, wv_centroid, wv_dis


## Import Data 

In [4]:
# Load the full definitions table (cp1252-encoded CSV); the string 'nan' is
# treated as a missing value.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs as-is on the original author's machine.
df = pd.read_csv('/Users/lidanyang/Desktop/intensional-and-extensional-category-convergence/data/final_def_data_FULL.csv', encoding='cp1252', na_values=['nan'])
# Preview the first rows.
df.head()

Unnamed: 0,N,instance_id,label,node_id,total_success,success_rate,label_freq_rank,definition,raw_image_breadth,norm_image_breadth,num_imgs,min_img,max_img,num_adopters,prop_adopters
0,2,400,space,15245,44,1.0,1,Gap,16030,16030,22,996,1393,2,1.0
1,2,400,space,15414,44,1.0,1,A gap within a continuous object,16030,16030,22,996,1393,2,1.0
2,2,400,wide,15245,16,0.888889,3,Lengthy,31482,31482,8,61,685,2,1.0
3,2,400,wide,15414,16,0.888889,3,A shape that spans furthest left to right.,31482,31482,8,61,685,2,1.0
4,2,400,x,15245,20,0.833333,2,Cross,162606,86604,10,31,909,2,1.0


In [3]:
# Summary statistics for the loaded table.
df.describe()

Unnamed: 0,N,instance_id,node_id,total_success,success_rate,label_freq_rank,raw_image_breadth,norm_image_breadth,num_imgs,min_img,max_img,num_adopters,prop_adopters
count,1134.0,1134.0,1134.0,1134.0,1134.0,1134.0,1134.0,1134.0,1134.0,1134.0,1134.0,1134.0,1134.0
mean,12.751323,520.986772,37235.179894,70.643739,0.906012,2.0,20782.041446,8526.291887,32.259259,524.970018,924.404762,11.560847,0.925705
std,8.295081,144.731546,24887.843865,49.026085,0.065958,0.816857,58777.547346,11859.383334,20.576505,324.392412,312.027789,7.405974,0.104294
min,2.0,392.0,13451.0,8.0,0.647059,1.0,232.0,232.0,4.0,1.0,142.0,2.0,0.541667
25%,6.0,413.0,14732.0,36.0,0.873016,1.0,3432.0,2094.0,17.0,229.0,751.0,6.0,0.875
50%,8.0,429.5,15839.5,52.0,0.916667,2.0,5588.0,3541.0,24.0,518.0,864.0,8.0,1.0
75%,24.0,716.0,65913.0,86.0,0.947368,3.0,9247.0,8112.75,41.0,736.0,1173.0,20.0,1.0
max,24.0,728.0,72888.0,244.0,1.0,3.0,431587.0,86604.0,104.0,1351.0,1499.0,24.0,1.0


In [4]:
# unique value of a column (helper cal_unique is defined in this cell)
# Keep handles to the column labels and the row index of df; neither name is
# referenced again in the visible part of this notebook.
columns_name = df.columns
indexs_name = df.index
def cal_unique(df, column_name):
    """Return the sorted unique values of one column and their frequencies.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    column_name : str
        Name of the column whose distinct values are wanted.

    Returns
    -------
    tuple
        ``(labels, counts)`` as produced by ``np.unique(..., return_counts=True)``.
    """
    column_values = df[column_name]
    unique_with_counts = np.unique(column_values, return_counts=True)
    return unique_with_counts[0], unique_with_counts[1]

In [5]:
def extract_def(df, labels):
    """Map each label to the array of definitions recorded for it in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns ``label`` and ``definition``.
    labels : iterable
        Candidate labels to look up, in the order the keys should appear.

    Returns
    -------
    dict
        ``{label: array of definitions}``; labels with no rows in ``df`` are
        left out.
    """
    definitions_by_label = (
        (name, df[df['label'] == name].definition.values) for name in labels
    )
    return {name: defs for name, defs in definitions_by_label if len(defs) > 0}

In [6]:
def get_data(pwd):
    """Load the definitions CSV and nest its definitions by group and label.

    Parameters
    ----------
    pwd : str
        Path of the CSV file (read with ``cp1252`` encoding).

    Returns
    -------
    dict
        ``{instance_id: {label: array of definitions}}``. Each inner dict only
        holds the labels that have at least one row for that ``instance_id``.
    """
    frame = pd.read_csv(pwd, encoding='cp1252')
    # Distinct groups, then the distinct labels across the whole file.
    instance_ids, _ = cal_unique(frame, 'instance_id')
    all_labels, _ = cal_unique(frame, 'label')
    # One sub-frame per instance_id, reduced to its label -> definitions mapping.
    return {
        instance_id: extract_def(frame[frame['instance_id'] == instance_id], all_labels)
        for instance_id in instance_ids
    }

## Calculate Distance

In [7]:
# For each label within each social group (instance_id): turn every definition
# into the centroid of its word embeddings, then use the mean pairwise distance
# between those centroids as the label's distance score for that group.

def mean_pairwise_distance(vectors, distance_fn):
    """Return the mean of ``distance_fn`` over every unordered pair in ``vectors``.

    Parameters
    ----------
    vectors : sequence
        Items to compare; every unordered pair is visited exactly once.
    distance_fn : callable
        Called as ``distance_fn(a, b)`` for each pair.

    Returns
    -------
    float
        The mean pairwise distance, or ``nan`` when fewer than two items are
        given (no pair exists, so the mean is undefined).
    """
    pair_distances = [distance_fn(a, b) for a, b in combinations(vectors, 2)]
    if not pair_distances:
        # Dividing by a zero pair count would raise ZeroDivisionError.
        return np.nan
    return sum(pair_distances) / len(pair_distances)


if __name__ == '__main__':
    # data_dir = sys.argv[1]
    data_dir = '/Users/lidanyang/Desktop/intensional-and-extensional-category-convergence/data/final_def_data_FULL.csv'
    df_dict = get_data(data_dir)
    # glove_dir = sys.argv[2]
    glove_dir = '/Users/lidanyang/Desktop/intensional-and-extensional-category-convergence/data'
    gloves = load_glove(glove_dir)

    summay = []
    for instance_id, df_instance_id in df_dict.items():
        for label, label_defs in df_instance_id.items():
            vec = []
            for label_def in label_defs:
                # Skip missing (NaN) definitions explicitly.
                if pd.isna(label_def):
                    continue
                # Best effort: a definition that cannot be tokenized or embedded
                # is left out. Only Exception is caught so that
                # KeyboardInterrupt / SystemExit still stop the run.
                try:
                    good_words = tokenizer(label_def)
                    centroid_vector = wv_centroid(good_words, gloves)
                except Exception:
                    continue
                vec.append(centroid_vector)
            # NaN marks labels with fewer than two usable definitions.
            avg_dis = mean_pairwise_distance(vec, wv_dis)
            # One row per (instance_id, label); the distance is a per-label value.
            summay.append((instance_id, label, avg_dis))

    df_dis = pd.DataFrame(summay, columns=['instance_id', 'label', 'distance'])


  return v_sum / count


In [39]:
# Per-(instance_id, label) covariates from the raw table, one row per distinct combination.
dfa = df.loc[
    :,
    ['instance_id','label','success_rate','total_success','raw_image_breadth','norm_image_breadth','num_imgs','N'],
].drop_duplicates()
# Distances computed above, de-duplicated the same way.
dfb = df_dis.drop_duplicates()
# Attach the distance to the covariates (inner join on group and label), then export.
df2 = dfa.merge(dfb, on=['instance_id','label'])
df2.to_csv('FULL_dis.csv', index=True)
df2

                             

Unnamed: 0,instance_id,label,success_rate,total_success,raw_image_breadth,norm_image_breadth,num_imgs,N,distance
0,400,space,1.000000,44,16030,16030,22,2,4.398674
1,400,wide,0.888889,16,31482,31482,8,2,
2,400,x,0.833333,20,162606,86604,10,2,6.553489
3,401,anthea,1.000000,16,3452,3452,8,2,1.890510
4,401,crab,1.000000,16,1954,1954,8,2,1.970918
...,...,...,...,...,...,...,...,...,...
144,432,crab,0.948718,74,14408,12625,36,8,4.696151
145,432,kiss,0.791667,38,11246,11246,19,8,4.349132
146,433,crab,0.833333,50,2492,2492,25,8,6.726773
147,433,flat,0.880000,44,4211,4211,20,8,


In [37]:
# Rows whose distance is missing.
df2.loc[df2['distance'].isna()]

Unnamed: 0,instance_id,label,success_rate,total_success,raw_image_breadth,norm_image_breadth,num_imgs,N,distance
1,400,wide,0.888889,16,31482,31482,8,2,
20,407,kickb,1.0,16,3755,3755,8,2,
23,409,woman,1.0,10,574,574,5,2,
27,716,bunny,0.767857,86,44816,44816,41,24,
29,716,frog,0.916667,110,5758,2235,52,24,
34,718,frog,0.929577,132,4152,1494,59,24,
35,718,rabbit,0.928571,52,2469,2469,24,24,
56,414,lift,0.821429,46,8366,2301,23,6,
70,419,crab,0.970588,66,8755,5838,31,6,
127,426,kissin,0.888889,64,8498,8498,31,8,


## Correlation

Variables: average pairwise definition distance (the inverse of intensional consensus), average coordination success, and overall diversity of the related images.

In [44]:
# Keep only the columns that enter the correlation analysis, then show their
# pairwise correlation matrix.
df_corr = df2.loc[
    :,
    ['distance', 'success_rate','total_success','raw_image_breadth','norm_image_breadth','num_imgs','N'],
]
df_corr.corr()

Unnamed: 0,distance,success_rate,total_success,raw_image_breadth,norm_image_breadth,num_imgs,N
distance,1.0,0.049803,-0.009844,0.248103,-0.066966,-0.012041,0.008952
success_rate,0.049803,1.0,-0.085103,-0.101507,-0.296449,-0.084295,-0.175949
total_success,-0.009844,-0.085103,1.0,-0.07465,-0.095346,0.994097,0.82039
raw_image_breadth,0.248103,-0.101507,-0.07465,1.0,0.369116,-0.073101,-0.008161
norm_image_breadth,-0.066966,-0.296449,-0.095346,0.369116,1.0,-0.093374,-0.076318
num_imgs,-0.012041,-0.084295,0.994097,-0.073101,-0.093374,1.0,0.816513
N,0.008952,-0.175949,0.82039,-0.008161,-0.076318,0.816513,1.0


In [24]:
# Correlations among rows with N == 2. N is constant within this subset, so
# its row and column in the matrix come out as NaN.
df_corr_2 = df_corr.loc[df_corr['N'].eq(2)]
df_corr_2.corr()

Unnamed: 0,distance,success_rate,total_success,raw_image_breadth,norm_image_breadth,num_imgs,N
distance,1.0,0.0663,-0.140185,0.476256,0.06135,-0.140185,
success_rate,0.0663,1.0,-0.178357,-0.156217,-0.684486,-0.178357,
total_success,-0.140185,-0.178357,1.0,-0.070764,0.106544,1.0,
raw_image_breadth,0.476256,-0.156217,-0.070764,1.0,0.524619,-0.070764,
norm_image_breadth,0.06135,-0.684486,0.106544,0.524619,1.0,0.106544,
num_imgs,-0.140185,-0.178357,1.0,-0.070764,0.106544,1.0,
N,,,,,,,


In [25]:
# Correlations among rows with N == 6. N is constant within this subset, so
# its row and column in the matrix come out as NaN.
df_corr_6 = df_corr.loc[df_corr['N'].eq(6)]
df_corr_6.corr()

Unnamed: 0,distance,success_rate,total_success,raw_image_breadth,norm_image_breadth,num_imgs,N
distance,1.0,0.096524,-0.149833,0.21616,-0.142739,-0.161531,
success_rate,0.096524,1.0,0.290001,-0.244338,-0.211362,0.272363,
total_success,-0.149833,0.290001,1.0,-0.120633,-0.244675,0.992115,
raw_image_breadth,0.21616,-0.244338,-0.120633,1.0,0.267166,-0.147229,
norm_image_breadth,-0.142739,-0.211362,-0.244675,0.267166,1.0,-0.261583,
num_imgs,-0.161531,0.272363,0.992115,-0.147229,-0.261583,1.0,
N,,,,,,,


In [26]:
# Correlations among rows with N == 8. N is constant within this subset, so
# its row and column in the matrix come out as NaN.
df_corr_8 = df_corr.loc[df_corr['N'].eq(8)]
df_corr_8.corr()

Unnamed: 0,distance,success_rate,total_success,raw_image_breadth,norm_image_breadth,num_imgs,N
distance,1.0,0.010293,-0.012533,0.147998,-0.153598,0.006712,
success_rate,0.010293,1.0,0.110624,0.008537,-0.401944,0.140701,
total_success,-0.012533,0.110624,1.0,-0.094607,-0.026614,0.953074,
raw_image_breadth,0.147998,0.008537,-0.094607,1.0,0.297665,-0.07387,
norm_image_breadth,-0.153598,-0.401944,-0.026614,0.297665,1.0,0.015107,
num_imgs,0.006712,0.140701,0.953074,-0.07387,0.015107,1.0,
N,,,,,,,


In [27]:
# Correlations among rows with N == 24. N is constant within this subset, so
# its row and column in the matrix come out as NaN.
df_corr_24 = df_corr.loc[df_corr['N'].eq(24)]
df_corr_24.corr()

Unnamed: 0,distance,success_rate,total_success,raw_image_breadth,norm_image_breadth,num_imgs,N
distance,1.0,0.457499,0.105454,-0.121587,-0.487909,0.078637,
success_rate,0.457499,1.0,0.028347,-0.212164,-0.72863,-0.002706,
total_success,0.105454,0.028347,1.0,-0.228318,-0.034264,0.995301,
raw_image_breadth,-0.121587,-0.212164,-0.228318,1.0,0.441745,-0.196571,
norm_image_breadth,-0.487909,-0.72863,-0.034264,0.441745,1.0,0.006466,
num_imgs,0.078637,-0.002706,0.995301,-0.196571,0.006466,1.0,
N,,,,,,,
