In [1]:
"""
calculate lexical similarity between embeddings
"""

__author__ = 'Christin Beck'
__created__ = '20.07.2023'


import re
import os

import torch

import numpy as np
import pandas as pd

from numpy.linalg import norm
from scipy import stats

In [2]:
# Pick the compute device once: the GPU when CUDA is usable, else the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Report which device was chosen.
if not use_cuda:
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 2 GPU(s) available.
We will use the GPU: NVIDIA A100-PCIE-40GB


In [3]:
embeddings_path = '../embeddings/hist-bert/'

# Embedding matrix; the cells below pair embeddings[i] with concepts[i] and
# corpora[i], so its rows must be in the same order as the rows of the meta file.
embeddings_last = embeddings_path + 'swadesh_embeddings_last.npy'
embeddings = np.load(embeddings_last)

meta_file = embeddings_path + 'swadesh_meta.tsv'

# quotechar='\0': the data contains (partly unbalanced) quotation marks that
# would otherwise be interpreted as string delimiters.
meta = pd.read_csv(meta_file, sep='\t', quotechar='\0', encoding='utf8')

# Direct column access raises a KeyError when a column is missing, instead of
# silently producing a column of NaN values.
concepts = meta['Concept'].tolist()
corpora = meta['Corpus'].tolist()

# Embeddings and metadata are matched purely by position, so a length mismatch
# would attach embeddings to the wrong concept/corpus (or fail much later).
assert len(embeddings) == len(meta), (
    'embeddings (%d rows) and meta file (%d rows) are not aligned'
    % (len(embeddings), len(meta))
)




In [4]:
# Values of the 'Corpus' column grouped by language stage (MHG, eNHG, NHG);
# used later to assign every usage to one of the three stages.
mhg = [
    'rem-corralled-20161222.tsv',
]
enhg = [
    'ref-mlu.tsv',
    'ref-rub.tsv',
    'ref-up.tsv',
]
nhg = [
    'dta_1700-1799',
    'dta_1800-1899',
    'dta_1900-1999',
]

# The target words are the distinct concepts in sorted order.
target_words = list(set(concepts))
target_words.sort()
print(target_words)


['Baum', 'Berg', 'Ei', 'Fisch', 'Frau', 'Fuß', 'Hund', 'Kopf', 'Sonne', 'Vogel']


In [5]:
# Header shared by the three inter-concept similarity matrices. It is built from
# target_words (instead of being hard-coded) so the columns cannot drift from
# the concepts that are actually present in the data.
inter_header = 'Similarities\t' + '\t'.join(target_words) + '\n'

# Files for inter-concept similarities at each stage.
# encoding='utf8' is explicit because the concept names contain non-ASCII
# characters (e.g. 'Fuß') and the platform default encoding is not guaranteed
# to be UTF-8.
# NOTE(review): the file names spell 'simliarities'; kept unchanged so existing
# paths keep resolving.
sim_mhg = open('inter_simliarities_mhg.tsv', 'w', encoding='utf8')
sim_mhg.write(inter_header)

sim_enhg = open('inter_simliarities_enhg.tsv', 'w', encoding='utf8')
sim_enhg.write(inter_header)

sim_nhg = open('inter_simliarities_nhg.tsv', 'w', encoding='utf8')
sim_nhg.write(inter_header)

# Intra-concept similarities (similarities over time):
# ME = MHG vs. eNHG, EN = eNHG vs. NHG, AVG = mean of the two.
sim_intra = open('intra_similarities_time.tsv', 'w', encoding='utf8')
sim_intra.write('Target\tME\tEN\tAVG\tt-test\tp-value\n')


32

In [6]:
def _stage_average(usages, target, stage):
    """Return the element-wise mean of the embedding rows in ``usages``.

    Raises:
        ValueError: if ``usages`` is empty. Averaging an empty list would only
            emit a warning and return NaN, which would then silently propagate
            into every similarity computed from it.
    """
    if not usages:
        raise ValueError('no usages of concept %r found in the %s corpora' % (target, stage))
    return np.average(np.array(usages), axis=0)


# One average embedding per target word (same order as target_words) and stage.
avg_embeddings_mhg = []
avg_embeddings_enhg = []
avg_embeddings_nhg = []

# calculate usage matrix for each concept at each time stage
for target in target_words:
    usage_mhg = []
    usage_enhg = []
    usage_nhg = []
    for i, word in enumerate(concepts):
        if word != target:
            continue
        # the corpus of a usage decides which stage it belongs to;
        # usages from corpora in none of the three lists are ignored
        if corpora[i] in mhg:
            usage_mhg.append(embeddings[i])
        elif corpora[i] in enhg:
            usage_enhg.append(embeddings[i])
        elif corpora[i] in nhg:
            usage_nhg.append(embeddings[i])

    # get average embedding for each concept at each stage
    avg_embeddings_mhg.append(_stage_average(usage_mhg, target, 'MHG'))
    avg_embeddings_enhg.append(_stage_average(usage_enhg, target, 'eNHG'))
    avg_embeddings_nhg.append(_stage_average(usage_nhg, target, 'NHG'))


    
def _cosine_similarity(u, v):
    """Return the cosine similarity ``u.v / (|u| * |v|)`` of two vectors."""
    return np.dot(u, v) / (norm(u) * norm(v))


# (output file, per-concept average embeddings) for each stage, in the order
# MHG, eNHG, NHG.
stage_outputs = [
    (sim_mhg, avg_embeddings_mhg),
    (sim_enhg, avg_embeddings_enhg),
    (sim_nhg, avg_embeddings_nhg),
]

try:
    for i, target in enumerate(target_words):
        # inter-concept similarity: one matrix row per stage
        inter_rows = []
        for out_file, stage_embeddings in stage_outputs:
            similarities = [
                _cosine_similarity(stage_embeddings[i], stage_embeddings[j])
                for j in range(len(target_words))
            ]
            # join() avoids a trailing tab, so every row has exactly as many
            # fields as the header line
            out_file.write('\t'.join([target] + [str(value) for value in similarities]) + '\n')
            inter_rows.append(np.array(similarities))

        # inter average distribution (mean over the three stages)
        inter_avg_distr = np.average(inter_rows, axis=0)
        # Drop the similarity of the concept with itself (always 1.0): it is
        # not an inter-concept similarity and would bias the t-test sample.
        inter_avg_others = np.delete(inter_avg_distr, i)

        # intra-concept similarity between consecutive stages
        intra_me = _cosine_similarity(avg_embeddings_mhg[i], avg_embeddings_enhg[i])
        intra_en = _cosine_similarity(avg_embeddings_enhg[i], avg_embeddings_nhg[i])

        # t-test for testing for significant differences between inter- and
        # intra-concept similarities
        intra_avg = (intra_me + intra_en) / 2
        t_inter_avg, p_inter_avg = stats.ttest_1samp(a=inter_avg_others, popmean=intra_avg)

        sim_intra.write('\t'.join([
            target,
            str(intra_me),
            str(intra_en),
            str(intra_avg),
            str(round(t_inter_avg, 8)),
            str(round(p_inter_avg, 8)),
        ]) + '\n')
finally:
    # Close (and thereby flush) the output files even if a computation fails.
    # Re-running this cell requires re-running the cell that opens the files.
    for out_file in (sim_mhg, sim_enhg, sim_nhg, sim_intra):
        out_file.close()