**Note**: 

- The results below correspond to Analysis 2 in the paper.
- *Outputs*: 
    - Spearman R of multivariate regression.
    - Variable coefficients from multivariate regression.

In [2]:
import numpy as np
import pandas as pd
from scipy.sparse import load_npz, csr_matrix, save_npz
from scipy.io import loadmat
from scipy.stats import pearsonr, spearmanr
import math
from sklearn.preprocessing import scale
from sklearn import linear_model
from sklearn.metrics import r2_score
from IPython.display import clear_output
from mlxtend.evaluate import permutation_test
import matplotlib as mpl
mpl.use('TkAgg')
%matplotlib inline
import seaborn as sns

In [3]:
from pylab import rcParams

# Figure styling applied to every plot in this notebook. `rcParams` is imported
# above, but the update below goes through `mpl.pyplot.rcParams`.
params = {
    # hide the top and right axes spines
    'axes.spines.top': False,
    'axes.spines.right': False,
    # one common size for all text elements
    'axes.labelsize': 12,
    'axes.titlesize': 12,
    'font.size': 12,
    'legend.fontsize': 12,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    # tick direction, default figure size and typeface
    'ytick.direction': 'in',
    'figure.figsize': [5, 5],
    'font.family': 'serif',
}

mpl.pyplot.rcParams.update(params)

In [4]:
# Load the MATLAB file as a dict; the following cells iterate over its entries
# and sum those whose type matches the entry stored under '207'.
matrices = loadmat('data/colexMats.mat')

In [5]:
# Keep only the entries of `matrices` whose type is exactly the type of the
# entry stored under '207'; anything of another type is skipped.
sumMat = []

for key, value in matrices.items():
    reference_type = type(matrices['207'])
    if type(value) == reference_type:
        sumMat.append(value)

In [6]:
# Add up all collected matrices (built-in sum: starts at 0 and adds each in turn).
totalMat = sum(sumMat)

In [7]:
# Sparse matrix whose shape defines the index range used below and whose
# non-zero positions are excluded from the analysed index pairs.
eng_ex = load_npz('data/eng_ex.npz')

In [8]:
# Index pairs are handled as Python sets of (row, col) tuples.
upper = set(zip(*np.triu_indices(eng_ex.shape[0])))        # upper triangle, diagonal included
diagonal = {(i, i) for i in range(eng_ex.shape[0])}        # self-pairs
english = set(zip(*np.nonzero(eng_ex)))                    # non-zero positions of eng_ex
non_zero = set(zip(*np.nonzero(totalMat)))                 # non-zero positions of the summed matrix

# Upper-triangle pairs with a non-zero total, without self-pairs and
# without the pairs marked in eng_ex.
final_ind = upper.intersection(non_zero).difference(diagonal).difference(english)

# Convert to the (row_indices, col_indices) tuple form used for array indexing.
final_ind = np.array(list(final_ind))
final_ind = (final_ind[:, 0], final_ind[:, 1])

In [9]:
# Control samples: each file is later indexed with str(i) for i in range(1000)
# to obtain one matrix per sample.
family = loadmat('data/colex_family.mat')
climate = loadmat('data/colex_climate.mat')
geography = loadmat('data/colex_geography.mat')

# Import matrices

In [10]:
# Predictor matrices stored in sparse .npz format; they are densified and
# restricted to `final_ind` in the next cell.
association = load_npz('data/hbc.npz')
frequency = load_npz('data/frequency.npz')
similarity = load_npz('data/w2v.npz')
concrete = load_npz('data/conc.npz')
valence = load_npz('data/valence.npz')

In [11]:
# Densify each predictor matrix and pull out the entries at the selected
# index pairs, giving one 1-D vector per predictor.
(association_array,
 frequency_array,
 similarity_array,
 concrete_array,
 valence_array) = (
    sparse_matrix.toarray()[final_ind]
    for sparse_matrix in (association, frequency, similarity, concrete, valence)
)

In [12]:
# Standardise every predictor vector with sklearn's `scale`.
scale_asso, scale_freq, scale_sim, scale_conc, scale_val = map(
    scale,
    (association_array, frequency_array, similarity_array,
     concrete_array, valence_array),
)

In [18]:
# Dump the scaled predictors to CSV. Columns are labelled 0-4 in this order:
# association, similarity, concreteness, valence, frequency.
arrays = pd.DataFrame(data=dict(enumerate(
    [scale_asso, scale_sim, scale_conc, scale_val, scale_freq])))

arrays.to_csv('inputs.csv', index=False)

# Multivariate Regression

In [11]:
# Design matrix with one row per index pair and one column per predictor, in
# the order: association, frequency, similarity, concreteness, valence.
# np.column_stack assembles the same (n, 5) array as zipping the vectors into
# a list of row tuples, without creating one Python tuple per row.
X = np.column_stack([scale_asso, scale_freq, scale_sim, scale_conc, scale_val])

In [12]:
def r_square(X,y):
    """Fit a linear regression of ``y`` on ``X`` and score it on the same data.

    Parameters
    ----------
    X : array-like
        Predictor matrix handed to ``LinearRegression.fit`` / ``predict``.
    y : array-like
        Response values.

    Returns
    -------
    tuple
        ``(coef_, r2)``: the fitted coefficients and the in-sample
        ``r2_score`` of the model's predictions against ``y``.
    """
    # The earlier `x = np.array(X)` copy was never used; fit on X directly.
    regr = linear_model.LinearRegression()
    regr.fit(X, y)
    y_pred = regr.predict(X)
    return (regr.coef_, r2_score(y, y_pred))

def normal_r(coeffs,X,y):
    """Correlate the coefficient-weighted sum of predictors with ``y``.

    The prediction is ``sum_i X[:, i] * coeffs[i]`` over all supplied
    coefficients. No intercept is added; neither correlation changes when a
    constant is added to the prediction.

    Parameters
    ----------
    coeffs : sequence of float
        One weight per predictor column (any length, not only five).
    X : numpy.ndarray
        2-D predictor matrix with at least ``len(coeffs)`` columns.
    y : array-like
        Observed values to compare the prediction with.

    Returns
    -------
    tuple
        ``(pearson_r, spearman_r)`` between the prediction and ``y``.
    """
    pred = 0
    # Iterate over however many coefficients were supplied instead of a fixed 5.
    for i, weight in enumerate(coeffs):
        pred += X[:, i] * weight
    return (pearsonr(pred, y)[0], spearmanr(pred, y)[0])


## Control Family

In [13]:
# Fit the regression on each of the 1000 family-controlled samples and keep the
# coefficients plus the Pearson/Spearman correlation of the fitted values.
coefficients_fam_hbc = []
# `pearson` has to be initialised here: without it the append below raises
# NameError on a fresh kernel, or extends a list left over from an earlier run.
pearson = []
spearman = []


for i in range(1000):
    clear_output(wait=True)
    print('Sample #: {}'.format(i))
    fam_array = scale(family[str(i)].toarray()[final_ind])
    coeff,r2 = r_square(X,fam_array)
    pear,spear = normal_r(coeff,X,fam_array)
    coefficients_fam_hbc.append(coeff)
    pearson.append(pear)
    spearman.append(spear)

Sample #: 999


In [14]:
# For each statistic: mean over samples, then mean -/+ 1.96 standard deviations
# (presumably a 95% normal-approximation band - confirm against the paper).
sections = [('Pearson, control Family', pearson),
            ('Spearman, control Family', spearman)]

for position, (title, values) in enumerate(sections):
    if position:
        print()  # blank line between sections
    center, spread = np.mean(values), np.std(values)
    print(title)
    print(round(center, 3))
    print(round(center - 1.96 * spread, 3), round(center + 1.96 * spread, 3))

# The Spearman standard deviation is reported on its own as the last line.
print(round(np.std(spearman), 3))

Pearson, control Family
0.343
0.337 0.35

Spearman, control Family
0.254
0.238 0.27
0.008


## Control Geography

In [15]:
# Fit the regression on each of the 1000 geography-controlled samples and
# record R-square, coefficients and both correlations per sample.
r_2, coefficients_geo_hbc, pearson, spearman = [], [], [], []


for i in range(1000):
    clear_output(wait=True)
    print('Sample #: {}'.format(i))
    # `fam_array` keeps its name from the family cell; here it holds the
    # scaled geography sample.
    fam_array = scale(geography[str(i)].toarray()[final_ind])
    coeff, r2 = r_square(X, fam_array)
    pear, spear = normal_r(coeff, X, fam_array)
    coefficients_geo_hbc.append(coeff)
    r_2.append(r2)
    spearman.append(spear)
    pearson.append(pear)

Sample #: 999


In [16]:
# For each statistic: mean over samples, then mean -/+ 1.96 standard deviations
# (presumably a 95% normal-approximation band - confirm against the paper).
sections = [('R-Square, control Geography', r_2),
            ('Pearson, control Geography', pearson),
            ('Spearman, control Geography', spearman)]

for position, (title, values) in enumerate(sections):
    if position:
        print()  # blank line between sections
    center, spread = np.mean(values), np.std(values)
    print(title)
    print(round(center, 3))
    print(round(center - 1.96 * spread, 3), round(center + 1.96 * spread, 3))

# The Spearman standard deviation is reported on its own as the last line.
print(round(np.std(spearman), 3))

R-Square, control Geography
0.088
0.069 0.107

Pearson, control Geography
0.296
0.264 0.328

Spearman, control Geography
0.217
0.177 0.257
0.02


## Control Climate

In [17]:
# Fit the regression on each of the 1000 climate-controlled samples and
# record R-square, coefficients and both correlations per sample.
r_2, coefficients_clim_hbc, pearson, spearman = [], [], [], []


for i in range(1000):
    clear_output(wait=True)
    print('Sample #: {}'.format(i))
    # `fam_array` keeps its name from the family cell; here it holds the
    # scaled climate sample.
    fam_array = scale(climate[str(i)].toarray()[final_ind])
    coeff, r2 = r_square(X, fam_array)
    pear, spear = normal_r(coeff, X, fam_array)
    coefficients_clim_hbc.append(coeff)
    r_2.append(r2)
    spearman.append(spear)
    pearson.append(pear)

Sample #: 999


In [18]:
# For each statistic: mean over samples, then mean -/+ 1.96 standard deviations
# (presumably a 95% normal-approximation band - confirm against the paper).
sections = [('R-Square, control Climate', r_2),
            ('Pearson, control Climate', pearson),
            ('Spearman, control Climate', spearman)]

for position, (title, values) in enumerate(sections):
    if position:
        print()  # blank line between sections
    center, spread = np.mean(values), np.std(values)
    print(title)
    print(round(center, 3))
    print(round(center - 1.96 * spread, 3), round(center + 1.96 * spread, 3))

# The Spearman standard deviation is reported on its own as the last line.
print(round(np.std(spearman), 3))

R-Square, control Climate
0.096
0.076 0.117

Pearson, control Climate
0.31
0.277 0.343

Spearman, control Climate
0.232
0.178 0.286
0.028


# Output coefficients to CSV

In [19]:
# One row per family-controlled sample, one column per predictor coefficient
# (column order follows the column order of the design matrix X).
df1 = pd.DataFrame(
    coefficients_fam_hbc,
    columns=['hbc', 'frequency', 'w2v', 'concrete', 'valence'],
)

In [20]:
# Coefficient tables for the climate (df2) and geography (df3) controls, with
# the same column labels as the family table.
df2, df3 = (
    pd.DataFrame(data=coefficient_rows,
                 columns=['hbc', 'frequency', 'w2v', 'concrete', 'valence'])
    for coefficient_rows in (coefficients_clim_hbc, coefficients_geo_hbc)
)

In [21]:
# Write the three coefficient tables to one workbook, one sheet per control.
# The context manager saves and closes the file on exit; `ExcelWriter.save()`
# no longer exists in pandas 2.x, and the sheet name is passed by keyword
# because the positional form is deprecated.
# NOTE(review): the cell that reloads these sheets reads
# 'figures/output_coefficients.xlsx', while this writes to the working
# directory - confirm whether the file is meant to be moved by hand.
with pd.ExcelWriter('output_coefficients.xlsx') as writer:
    df1.to_excel(writer, sheet_name='hbc_family')
    df2.to_excel(writer, sheet_name='hbc_climate')
    df3.to_excel(writer, sheet_name='hbc_geography')

In [4]:
# Reload the saved coefficient tables, one DataFrame per sheet.
xls = pd.ExcelFile('figures/output_coefficients.xlsx')
fam, clim, geo = (
    pd.read_excel(xls, sheet_name)
    for sheet_name in ('hbc_family', 'hbc_climate', 'hbc_geography')
)