In [None]:
import numpy as np
import pandas as pd
import pickle
from matplotlib import pyplot as plt
from scipy import stats
from scipy.spatial.distance import cosine as csim
from statsmodels.stats import anova
from statsmodels.formula.api import ols

In [None]:
def sim(a,b):
    """Cosine similarity between corresponding vectors of two arrays.

    The similarity is taken along the last axis, so the result has the
    shape of the inputs with the last axis removed: 2-D inputs give one
    value per row, 3-D inputs give a 2-D grid of values, and 1-D inputs
    give a single scalar.

    Parameters
    ----------
    a, b : array_like
        Arrays of identical shape with at least one dimension.

    Returns
    -------
    numpy.ndarray or numpy scalar
        dot(a, b) / (||a||_2 * ||b||_2) evaluated along the last axis.
        A zero-length vector yields nan (division by a zero norm).

    Raises
    ------
    ValueError
        If the shapes differ or the inputs are 0-dimensional.
    """
    a = np.asarray(a)
    b = np.asarray(b)

    if a.shape != b.shape:
        # Shapes are tuples: they must be formatted, not concatenated to a str
        # (concatenation raised TypeError and hid the real problem).
        raise ValueError(
            "ERROR: Input shape mismatch a:{}, b:{}".format(a.shape, b.shape))

    if a.ndim == 0:
        raise ValueError("ERROR: Input must have at least one dimension")

    # axis=-1 is axis 1 for 2-D input and axis 2 for 3-D input, and extends
    # the same definition to any other rank.
    dot = np.sum(a*b, axis=-1)
    norms = np.linalg.norm(a, 2, -1)*np.linalg.norm(b, 2, -1)
    return dot/norms

In [None]:
# Pickled vector collections, one file per ambiguity category.
# NOTE(review): the unambiguous file used to be opened from the filesystem
# root ('/unambiguous_vecs.pkl'); it is now read from ./data like the other
# two files. Confirm that this is where the file actually lives.
data_paths = {
    'h': './data/homonymy_vecs.pkl',
    'p': './data/polysemy_vecs.pkl',
    'u': './data/unambiguous_vecs.pkl',
}

# SECURITY: pickle.load can execute arbitrary code stored in the file;
# only load pickles that come from a trusted source.
data = {}
for key, path in data_paths.items():
    with open(path, 'rb') as f:
        data[key] = pickle.load(f)

# Display names for the category keys ('h', 'p', 'u') and context keys ('s', 'n').
key2word = {'h':'Homonymy', 'p':'Polysemy', 'u':'Unambiguous', 's':'Artificial', 'n':'Natural'}
# Live view of the category keys; iterated by the cells below.
keys = data.keys()
# Plot styling, indexed by context key + category key (e.g. 'nh').
key2color = {'nh':'#910616', 'sh':"#ff122a", 'np':'#0c4700', 'sp':'#10c944', 'nu':'#013485', 'su':'#2CBFF5'}
key2marker =  {'nh':"^", 'sh':"v", 'np':'P', 'sp':"X", 'nu':'s', 'su':'D'}

In [None]:
# Cosine similarities per category key (same keys as `data`):
#   stand_sim[d] : similarity between the vectors at positions 6 and 7
#   nat_sim[d]   : similarity between the mean of positions 0-2 and the
#                  mean of positions 3-5
# NOTE(review): judging by the names and key2word, positions 6-7 look like the
# "artificial" contexts and 0-5 the "natural" ones - confirm against the data.
stand_sim, nat_sim = {}, {}

# Histogram / plotting settings.
bins = int(1/.05)
rng = (.5, 1)
alpha = .5

# Grid on which fitted normal curves are evaluated.
x = np.linspace(.5, 1, 100)

for d in keys:
    # Entry names and their stacked vectors, in dict order.
    terms = list(data[d].keys())
    vecs = np.array(list(data[d].values()))

    # Averages over each group of three vectors. An earlier variant picked a
    # single random vector from each group instead of the mean.
    na = vecs[:, :3].mean(axis=1)
    nb = vecs[:, 3:6].mean(axis=1)
    sa = vecs[:, 6]
    sb = vecs[:, 7]

    stand = sim(sa, sb)
    nat = sim(na, nb)
    stand_sim[d] = stand
    nat_sim[d] = nat

    # A per-category figure comparing `stand` against `nat` (histogram scatter,
    # fitted normal curve, stats.ttest_ind) was prototyped here and is disabled.

In [None]:
ex_t = []
# Similarity sets to compare across categories, keyed by context key.
# Only "n" (nat_sim) is active; add "s": stand_sim to plot that set as well.
comp_data = {"n": nat_sim}

for d, sims_by_key in comp_data.items():
    fig, ax = plt.subplots()

    for k in keys:
        sims = sims_by_key[k]
        style_key = d + k

        # Histogram of the similarities; keep only non-empty bins and place
        # each point at the centre of its bin.
        counts, edges = np.histogram(sims, bins, rng)
        centers = (edges[:-1] + edges[1:]) / 2
        occupied = counts != 0

        ax.scatter(centers[occupied], counts[occupied],
                   color=key2color[style_key], marker=key2marker[style_key],
                   alpha=alpha, label=key2word[k][:5])

        # Normal curve with the sample mean and standard deviation.
        # NOTE(review): this is a density drawn on the same axis as raw counts.
        fitted = stats.norm.pdf(x, sims.mean(), sims.std())
        ax.plot(x, fitted, color=key2color[style_key], alpha=.95)

    ax.set_ylabel("Count")
    ax.set_xlabel("Cosine Similarity")
    ax.set_title(key2word[d]+" Averaged Context")
    ax.legend()
    fig.savefig(key2word[d]+"_between_avg.png")

In [None]:
# Long-format table with one row per similarity value, tagged with the
# category key it belongs to.
#
# Rows are gathered in a plain list and the frame is built once:
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# copies the whole frame on every iteration.
#
# To keep the comp_data key as well (when more than one set is compared),
# add a 'Context': c field to each record and to `columns`.
records = []
for c in comp_data.keys():
    for k in keys:
        records.extend({'Word': k, 'Sim': value} for value in comp_data[c][k])

df = pd.DataFrame(records, columns=["Word", "Sim"])

In [None]:
# Write the table to disk; the row index is omitted from the file.
df.to_csv(path_or_buf='sim_data_avg.csv', index=False)

In [None]:
#     ax.hist(stand, color='blue', edgecolor='black', bins=bins,\
#             range=rng, alpha=alpha, label="Art Cos Sim")
#     ax.hist(nat, color='red', edgecolor='black', bins=bins, \
#             range=rng, alpha=alpha, label="Nat Cos Sim")

#     ax.axvline(x=stand.mean(), color='blue', label="Art {}".format('\u03BC'))
#     ax.axvline(x=nat.mean(), color='red',label="Nat {}".format('\u03BC'))

#     ax.axvline(x=stand.mean()+stand.std(),linestyle='--',\
#                color='blue', alpha=alpha*1.5, label="Art {}".format(u'\u03BC\u00B1\u03C3'))
#     ax.axvline(x=stand.mean()-stand.std(),linestyle='--', color='blue', alpha=alpha*1.5)
#     ax.axvline(x=nat.mean()+nat.std(),linestyle='--', \
#                color='red', alpha=alpha*1.5,label="Nat {}".format(u'\u03BC\u00B1\u03C3'))
#     ax.axvline(x=nat.mean()-nat.std(),linestyle='--', color='red', alpha=alpha*1.5)