**Note**: 

- The results below correspond to Analysis 2 in the paper.
- *Outputs*: Spearman's R for each predictor (HBC, W2V, concreteness, valence, frequency), reported as the mean and 95% interval over 1,000 samples, controlling in turn for each of three factors: family, climate, and geography.

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import load_npz, csr_matrix
from scipy.io import loadmat
from scipy.stats import pearsonr, spearmanr
import pickle as pk
from IPython.display import clear_output

In [2]:
# Load the MATLAB file of colexification matrices. The result is indexed by
# string keys (e.g. '207') and is iterated over below to build the summed matrix.
matrices = loadmat('data/colexMats.mat')

In [10]:
def printResult(alist):
    """Print the mean of ``alist`` and a 95% interval around it.

    The interval is the mean plus/minus 1.96 standard deviations, where the
    standard deviation is ``np.std`` with its default settings. All printed
    values are rounded to three decimals.

    Parameters
    ----------
    alist : sequence of numbers
        The values to summarise (here: one correlation per sample).
    """
    center = np.mean(alist)
    half_width = 1.96*np.std(alist)
    print('mean: ', round(center, 3))
    print('95% ci: ', round(center - half_width, 3), round(center + half_width, 3))

# Extract non-zero elements

In [3]:
# Use the entry stored under '207' as the reference for "this value is a
# matrix"; values of any other type (e.g. the metadata entries that loadmat
# adds to its result) are skipped.
matrix_type = type(matrices['207'])

sumMat = [value for value in matrices.values() if type(value) == matrix_type]

# Element-wise sum of all collected matrices.
totalMat = sum(sumMat)

In [4]:
# Sparse matrix marking the pairs that are identical in English; its non-zero
# entries are removed from the set of analysed pairs in the next cell.

eng_ex = load_npz('data/eng_ex.npz')

In [5]:
# Select the pairs to analyse: entries strictly above the diagonal that are
# non-zero in totalMat and are not flagged in eng_ex.
#
# The filtering works on the non-zero coordinates directly, so no Python set of
# every upper-triangular index tuple has to be materialised, the resulting
# order is deterministic (the order returned by np.nonzero), and an empty
# selection yields two empty index arrays instead of raising.
n_concepts = eng_ex.shape[0]

rows, cols = (np.asarray(ix, dtype=np.int64) for ix in np.nonzero(totalMat))
eng_rows, eng_cols = (np.asarray(ix, dtype=np.int64) for ix in np.nonzero(eng_ex))

# rows < cols keeps the strict upper triangle (which also drops the diagonal);
# cols < n_concepts mirrors the bound the upper triangle took from eng_ex.
in_upper = (rows < cols) & (cols < n_concepts)
rows, cols = rows[in_upper], cols[in_upper]

# Encode each (row, col) pair as one integer so that membership in the English
# exclusion list can be tested in a single vectorised call. Every column index
# involved is smaller than `stride`, so distinct pairs get distinct codes.
stride = max(eng_ex.shape)
is_english = np.isin(rows * stride + cols, eng_rows * stride + eng_cols)

# Tuple of (row indices, column indices), ready for fancy-indexing a 2-D array.
final_ind = (rows[~is_english], cols[~is_english])

In [7]:
# Sparse pairwise matrices for the five predictors, loaded in a fixed order.
hbc, w2v, conc, val, freq = map(load_npz, (
    'data/hbc.npz',
    'data/w2v.npz',
    'data/conc.npz',
    'data/valence.npz',
    'data/frequency.npz',
))

In [8]:
# For each predictor, densify the matrix and keep only the values at the
# selected pairs, giving one 1-D vector per predictor (all in the same order).
hbc_array, w2v_array, conc_array, val_array, freq_array = (
    np.array(matrix.toarray()[final_ind])
    for matrix in (hbc, w2v, conc, val, freq)
)

In [9]:
# Sample collections for the three control factors; each is indexed further
# below by the sample number as a string ('0', '1', ...).
family, climate, geography = map(loadmat, (
    'data/colex_family.mat',
    'data/colex_climate.mat',
    'data/colex_geography.mat',
))

# Spearman Correlation

## Control Family

In [13]:
# Predictor vectors, in the order used by the report cells.
predictors = [hbc_array, w2v_array, conc_array, val_array, freq_array]

# Number of samples; they are looked up under the keys '0' ... '999'.
n_samples = 1000

# Family[i] collects one Spearman R per sample for predictors[i].
Family = {i: [] for i in range(len(predictors))}

for j in range(n_samples):
    clear_output(wait=True)
    print('Sample #: {}'.format(j))
    # The sample vector depends only on j, so densify it once and reuse it
    # for every predictor instead of rebuilding it once per predictor.
    ids = family[str(j)].toarray()[final_ind]
    for i, predictor in enumerate(predictors):
        # spearmanr returns (correlation, p-value); keep the correlation.
        Family[i].append(spearmanr(predictor, ids)[0])

Sample #: 999


In [15]:
# Display names for the predictors, in the same order as `predictors`.
title = ['HBC','W2V','CONCRETENESS','VALENCE','FREQUENCY']

# One banner plus summary per predictor.
for idx, name in enumerate(title[:5]):
    banner = '-------------------------------------'
    print(banner)
    print('SPEARMAN R of {}'.format(name))
    print('(Family-controlled)')
    print(banner)
    printResult(Family[idx])
    print()

-------------------------------------
SPEARMAN R of HBC
(Family-controlled)
-------------------------------------
mean:  0.274
95% ci:  0.262 0.286

-------------------------------------
SPEARMAN R of W2V
(Family-controlled)
-------------------------------------
mean:  0.225
95% ci:  0.209 0.242

-------------------------------------
SPEARMAN R of CONCRETENESS
(Family-controlled)
-------------------------------------
mean:  -0.113
95% ci:  -0.132 -0.094

-------------------------------------
SPEARMAN R of VALENCE
(Family-controlled)
-------------------------------------
mean:  -0.079
95% ci:  -0.092 -0.065

-------------------------------------
SPEARMAN R of FREQUENCY
(Family-controlled)
-------------------------------------
mean:  -0.037
95% ci:  -0.054 -0.021



## Control Climate

In [18]:
# Number of samples; they are looked up under the keys '0' ... '999'.
n_samples = 1000

# Climate[i] collects one Spearman R per sample for predictors[i]
# (`predictors` is the list built in the family-control cell).
Climate = {i: [] for i in range(len(predictors))}

for j in range(n_samples):
    clear_output(wait=True)
    print('Sample #: {}'.format(j))
    # The sample vector depends only on j, so densify it once and reuse it
    # for every predictor instead of rebuilding it once per predictor.
    ids = climate[str(j)].toarray()[final_ind]
    for i, predictor in enumerate(predictors):
        # spearmanr returns (correlation, p-value); keep the correlation.
        Climate[i].append(spearmanr(predictor, ids)[0])

Sample #: 999


In [19]:
# One banner plus summary per predictor (`title` holds the display names).
for idx, name in enumerate(title[:5]):
    banner = '-------------------------------------'
    print(banner)
    print('SPEARMAN R of {}'.format(name))
    print('(Climate-controlled)')
    print(banner)
    printResult(Climate[idx])
    print()

-------------------------------------
SPEARMAN R of HBC
(Climate-controlled)
-------------------------------------
mean:  0.26
95% ci:  0.223 0.297

-------------------------------------
SPEARMAN R of W2V
(Climate-controlled)
-------------------------------------
mean:  0.189
95% ci:  0.121 0.258

-------------------------------------
SPEARMAN R of CONCRETENESS
(Climate-controlled)
-------------------------------------
mean:  -0.079
95% ci:  -0.142 -0.017

-------------------------------------
SPEARMAN R of VALENCE
(Climate-controlled)
-------------------------------------
mean:  -0.075
95% ci:  -0.095 -0.056

-------------------------------------
SPEARMAN R of FREQUENCY
(Climate-controlled)
-------------------------------------
mean:  0.017
95% ci:  -0.087 0.121



## Control Geography

In [16]:
# Number of samples; they are looked up under the keys '0' ... '999'.
n_samples = 1000

# Geography[i] collects one Spearman R per sample for predictors[i]
# (`predictors` is the list built in the family-control cell).
Geography = {i: [] for i in range(len(predictors))}

for j in range(n_samples):
    clear_output(wait=True)
    print('Sample #: {}'.format(j))
    # The sample vector depends only on j, so densify it once and reuse it
    # for every predictor instead of rebuilding it once per predictor.
    ids = geography[str(j)].toarray()[final_ind]
    for i, predictor in enumerate(predictors):
        # spearmanr returns (correlation, p-value); keep the correlation.
        Geography[i].append(spearmanr(predictor, ids)[0])

Sample #: 999


In [17]:
# One banner plus summary per predictor (`title` holds the display names).
for idx, name in enumerate(title[:5]):
    banner = '-------------------------------------'
    print(banner)
    print('SPEARMAN R of {}'.format(name))
    print('(Geography-controlled)')
    print(banner)
    printResult(Geography[idx])
    print()

-------------------------------------
SPEARMAN R of HBC
(Geography-controlled)
-------------------------------------
mean:  0.262
95% ci:  0.236 0.287

-------------------------------------
SPEARMAN R of W2V
(Geography-controlled)
-------------------------------------
mean:  0.185
95% ci:  0.145 0.226

-------------------------------------
SPEARMAN R of CONCRETENESS
(Geography-controlled)
-------------------------------------
mean:  -0.078
95% ci:  -0.117 -0.039

-------------------------------------
SPEARMAN R of VALENCE
(Geography-controlled)
-------------------------------------
mean:  -0.07
95% ci:  -0.09 -0.05

-------------------------------------
SPEARMAN R of FREQUENCY
(Geography-controlled)
-------------------------------------
mean:  -0.01
95% ci:  -0.068 0.048

