In [None]:
**Note**: 

- The results below correspond to Analysis 2 in the paper.
- *Outputs*: Spearman R of HBC, W2V, concreteness, valence, frequency, controlled for three factors: family, geography, and climate

In [9]:
import numpy as np
import pandas as pd
from scipy.sparse import load_npz, csr_matrix
from scipy.io import loadmat
from scipy.stats import spearmanr
import pickle as pk
from IPython.display import clear_output

In [10]:
# Load the MATLAB file into a dict keyed by variable name (scipy's loadmat also
# adds '__header__'-style metadata entries alongside the stored variables).
# NOTE(review): presumably one colexification matrix per key — confirm.
matrices = loadmat('data/usf_colexMats.mat')

In [22]:
def printResult(alist):
    """Print the mean of ``alist`` and a normal-approximation 95% interval.

    The interval is ``mean ± 1.96 * std`` where ``std`` is numpy's default
    (population, ddof=0) standard deviation of the values in ``alist``.
    All printed numbers are rounded to three decimals.

    Parameters
    ----------
    alist : sequence of float
        Values to summarise (e.g. one correlation per sample).

    Returns
    -------
    None
        The summary is written to stdout only.
    """
    centre = np.mean(alist)
    # Half-width of the interval: 1.96 standard deviations on each side.
    half_width = 1.96 * np.std(alist)
    lower, upper = centre - half_width, centre + half_width

    print('mean: ', round(centre, 3))
    print('95% ci: ', round(lower, 3), round(upper, 3))

# Extract non-zero elements

In [3]:
# Keep only the entries of `matrices` whose type matches that of entry '207';
# this filters out values of any other type (such as the metadata entries that
# loadmat stores in the same dict).
sumMat = [
    entry
    for entry in matrices.values()
    if type(entry) == type(matrices['207'])
]

# Element-wise sum of all retained matrices.
totalMat = sum(sumMat)

In [4]:
# Pairs that are identical in English, loaded as a scipy sparse matrix.
# Its non-zero positions are removed from the set of analysed pairs further down.

eng_ex = load_npz('data/usf_eng_ex.npz')

In [5]:
# Build the (row, col) index pairs used throughout the analysis:
#   * non-zero in the summed matrix `totalMat`,
#   * strictly above the diagonal (row < col), so each unordered pair is
#     counted once and self-pairs are dropped,
#   * inside the index range of `eng_ex`,
#   * not flagged (non-zero) in `eng_ex`.
#
# The strict-upper-triangle test is applied directly to the non-zero pairs
# instead of materialising every upper-triangle index pair as a Python set,
# which needs n*(n+1)/2 tuples for an n x n matrix.
n_items = eng_ex.shape[0]

english = set(zip(*np.nonzero(eng_ex)))
non_zero = set(zip(*np.nonzero(totalMat)))

final_ind = {(row, col) for (row, col) in non_zero if row < col < n_items} - english

# Sort so the pair order is reproducible across runs; reshape keeps the array
# two-dimensional (and integer-typed) even when no pair survives the filters.
final_ind = np.array(sorted(final_ind), dtype=int).reshape(-1, 2)

# Tuple of (row indices, column indices), ready for NumPy fancy indexing.
final_ind = (final_ind[:,0],final_ind[:,1])

In [12]:
# Load the USF matrix (scipy sparse format, as written by save_npz).
# NOTE(review): presumably word-association strengths, given how it is used below — confirm.
usf = load_npz('data/usf.npz')

In [13]:
# Densify `usf` and pick out the values at the selected (row, col) pairs,
# giving one 1-D array entry per pair in `final_ind`.
association = np.array(usf.toarray()[final_ind])

In [19]:
# One MATLAB file per control factor. Each is read into a dict that the cells
# below index with the string keys '0' .. '999' (one matrix per sample).
family = loadmat('data/usf_colex_family.mat')
climate = loadmat('data/usf_colex_climate.mat')
geography = loadmat('data/usf_colex_geography.mat')

# Spearman Correlation

## Control Family

In [23]:
# Spearman correlation between `association` and each of the 1000
# family-controlled samples, evaluated at the selected index pairs.
spearman = []

for sample_idx in range(1000):
    # Replace the previous progress line instead of stacking 1000 lines.
    clear_output(wait=True)
    print('Sample #: {}'.format(sample_idx))
    sample_values = family[str(sample_idx)].toarray()[final_ind]
    rho, _pvalue = spearmanr(association, sample_values)
    spearman.append(rho)

Sample #: 999


In [24]:
# Banner, then the mean / interval summary of the family-controlled correlations.
print(
    '-------------------------------------',
    'SPEARMAN R of USF',
    '(Family-controlled)',
    '-------------------------------------',
    sep='\n',
)
printResult(spearman)
print()

-------------------------------------
SPEARMAN R of USF
(Family-controlled)
-------------------------------------
mean:  0.277
95% ci:  0.266 0.289



## Control Climate

In [27]:
# Spearman correlation between `association` and each of the 1000
# climate-controlled samples, evaluated at the selected index pairs.
spearman = []

for sample_idx in range(1000):
    # Replace the previous progress line instead of stacking 1000 lines.
    clear_output(wait=True)
    print('Sample #: {}'.format(sample_idx))
    sample_values = climate[str(sample_idx)].toarray()[final_ind]
    rho, _pvalue = spearmanr(association, sample_values)
    spearman.append(rho)

Sample #: 999


In [28]:
# Banner, then the mean / interval summary of the climate-controlled correlations.
print(
    '-------------------------------------',
    'SPEARMAN R of USF',
    '(Climate-controlled)',
    '-------------------------------------',
    sep='\n',
)
printResult(spearman)
print()

-------------------------------------
SPEARMAN R of USF
(Climate-controlled)
-------------------------------------
mean:  0.256
95% ci:  0.207 0.304



## Control Geography

In [25]:
# Spearman correlation between `association` and each of the 1000
# geography-controlled samples, evaluated at the selected index pairs.
spearman = []

for sample_idx in range(1000):
    # Replace the previous progress line instead of stacking 1000 lines.
    clear_output(wait=True)
    print('Sample #: {}'.format(sample_idx))
    sample_values = geography[str(sample_idx)].toarray()[final_ind]
    rho, _pvalue = spearmanr(association, sample_values)
    spearman.append(rho)

Sample #: 999


In [26]:
# Banner, then the mean / interval summary of the geography-controlled correlations.
print(
    '-------------------------------------',
    'SPEARMAN R of USF',
    '(Geography-controlled)',
    '-------------------------------------',
    sep='\n',
)
printResult(spearman)
print()

-------------------------------------
SPEARMAN R of USF
(Geography-controlled)
-------------------------------------
mean:  0.259
95% ci:  0.232 0.286

