**Note**: 

- The results below correspond to Analysis 1 in the paper.
- Output: Predictive accuracies of association using HBC, w2v, Concreteness, Valence, Usage Frequency (reported in Figure 2A).
- The results were transferred to CombinationModel_Log.ipynb to plot Figure 2A.

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import load_npz, csr_matrix
from scipy.io import loadmat
from scipy.stats import pearsonr, spearmanr
import pickle as pk
from IPython.display import clear_output
import random
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.preprocessing import normalize
from sklearn.preprocessing import scale
from mlxtend.evaluate import permutation_test
from scipy.stats import ttest_1samp

In [2]:
# Threshold later compared against the summed binary matrix (sumMat >= min_NLang).
min_NLang = 2

# Load the colexification matrices from the MATLAB file.
matrices = loadmat('data/colexMats.mat')

# Shape of one stored matrix; reused as the shape of every array built later.
# NOTE(review): assumes all stored matrices share the shape of entry '135' -- confirm.
size = matrices['135'].shape

In [3]:
def printResult(list):
    """Print the mean of *list* and a one-standard-deviation band around it.

    Two lines are printed, with every number rounded to three decimals:
    the mean, then the lower and upper bounds ``mean - std`` and
    ``mean + std`` (``np.std`` with its default settings).

    NOTE(review): the second line is labelled "95% ci" although it is
    computed as mean +/- one standard deviation -- confirm which interval
    is intended before relying on the label.
    """
    center = np.mean(list)
    spread = np.std(list)
    lower, upper = center - spread, center + spread
    print("mean: ", round(center, 3))
    print("95% ci: ", round(lower, 3), round(upper, 3))

In [4]:
# Load the five predictor matrices (SciPy sparse .npz files), in the same
# order as the names on the left-hand side.
hbc, concreteness, valence, frequency, w2v = map(
    load_npz,
    (
        'data/hbc.npz',
        'data/conc.npz',
        'data/valence.npz',
        'data/frequency.npz',
        'data/w2v.npz',
    ),
)

# Define binary matrix

In [5]:
"""If there is one language that colexifies a pair: 
    entry = 1 
else:
    entry = 0""" 

# Build one 0/1 matrix per stored matrix: 1 wherever the stored matrix has a
# non-zero entry, 0 elsewhere.
individual_binarize = {}

# The loaded dict also holds non-matrix entries (loadmat adds metadata keys
# such as '__header__'); keep only values of the same type as a known data
# matrix. The reference type is looked up once instead of on every iteration.
matrix_type = type(matrices['207'])

for key, value in matrices.items():
    if isinstance(value, matrix_type):
        mat = np.zeros(size)
        mat[np.nonzero(value)] = 1
        individual_binarize[key] = mat

In [6]:
# Element-wise total of all binarized matrices, accumulated starting from 0
# (the same way the built-in sum() does).
sumMat = 0
for binary_mat in individual_binarize.values():
    sumMat = sumMat + binary_mat

In [7]:
# Pairs that are identical in English

eng_ex = load_npz('data/eng_ex.npz')

# Collect the (row, col) position of every non-zero entry as a set of tuples.
eng_rows, eng_cols = np.nonzero(eng_ex)
english = set(zip(eng_rows, eng_cols))

In [8]:
# Index pairs whose summed value reaches the min_NLang threshold.
colexified = np.argwhere(sumMat >= min_NLang)

# Make each [row, col] row hashable, then drop the pairs listed in `english`.
colex_set = set(map(tuple, colexified)) - english

In [9]:
# Re-express the set of pairs as a (row_indices, col_indices) tuple so it can
# be used directly as a NumPy fancy index.
colex_pairs = np.array(list(colex_set))
colex_ind = (colex_pairs[:, 0], colex_pairs[:, 1])

In [10]:
# Every (row, col) index pair of an array with shape `size`, in row-major order.
# reshape(2, -1) lets NumPy infer the flattened length, so this works for any
# shape; a hard-coded size[0]*size[0] is only valid for square matrices.
all_ind = list(zip(*np.indices(size).reshape(2, -1)))

In [11]:
# Every index pair that is neither in colex_set nor in the English set.
zero_set = set(all_ind).difference(colex_set, english)

In [12]:
def n_fold(matrix, target, n_splits=20, test_size=0.1):
    """Return the mean held-out accuracy of a one-feature logistic regression.

    The single feature is the value of *matrix* at the index pairs held in
    the module-level ``colex_ind`` followed by those in the module-level
    ``sample_zero``; both globals must be assigned before this is called.
    The model is fitted and scored on ``n_splits`` independent random
    train/test splits (repeated random sub-sampling, not a partition into
    disjoint folds).

    Args:
        matrix: Sparse matrix exposing ``toarray()``.
        target: Class labels, one per index pair, ordered as ``colex_ind``
            then ``sample_zero``.
        n_splits: Number of random train/test splits to average over.
        test_size: Fraction of the samples held out for scoring in each split.

    Returns:
        The mean of the ``n_splits`` test-set accuracies.
    """
    # Densify once and reuse it for both index sets.
    dense = matrix.toarray()
    data = np.concatenate([dense[colex_ind], dense[sample_zero]]).reshape(-1, 1)

    # scikit-learn expects a 1-D label vector; a column vector triggers a
    # DataConversionWarning on every fit.
    labels = np.asarray(target).ravel()

    acc = []  # Accuracy of each split
    for _ in range(n_splits):
        X_train, X_test, y_train, y_test = train_test_split(
            data, labels, test_size=test_size)

        logistic = linear_model.LogisticRegression()
        acc.append(logistic.fit(X_train, y_train).score(X_test, y_test))
    return np.mean(acc)

In [13]:
# Perform logistic regression

hbc_res = []
conc_res = []
val_res = []
freq_res = []
w2v_res = []

inputs = [hbc, concreteness, valence,frequency, w2v]
outputs = [hbc_res, conc_res, val_res, freq_res, w2v_res]

# Loop invariants: materialise the candidate list and the label vector once
# rather than rebuilding them on each of the 100 repetitions.
zero_candidates = list(zero_set)
n_positive = len(colex_set)
target = np.array([1] * n_positive + [0] * n_positive)

for rep in range(100):
    clear_output(wait=True)
    print('Sample #: {}'.format(rep))

    # Draw as many index pairs from zero_set as there are pairs in colex_set.
    # n_fold reads the module-level name `sample_zero`, so it must keep this
    # exact name and be a (row_indices, col_indices) tuple.
    sample_zero = np.array(random.sample(zero_candidates, n_positive))
    sample_zero = (sample_zero[:, 0], sample_zero[:, 1])

    # Score each predictor on this sample; n_fold already returns a mean.
    for predictor, results in zip(inputs, outputs):
        results.append(n_fold(predictor, target))

Sample #: 99


In [14]:
title = ['HBC','CONCRETENESS','VALENCE','FREQUENCY','W2V']

# Print a framed header and the summary statistics for each predictor,
# pairing every label with its list of per-repetition accuracies.
rule = '-----------------------------------'
for label, scores in zip(title, outputs):
    print(rule)
    print('PREDICTIVE ACCURACY of {}'.format(label))
    print(rule)
    printResult(scores)
    print()

-----------------------------------
PREDICTIVE ACCURACY of HBC
-----------------------------------
mean:  0.667
95% ci:  0.663 0.67

-----------------------------------
PREDICTIVE ACCURACY of CONCRETENESS
-----------------------------------
mean:  0.609
95% ci:  0.605 0.614

-----------------------------------
PREDICTIVE ACCURACY of VALENCE
-----------------------------------
mean:  0.582
95% ci:  0.578 0.587

-----------------------------------
PREDICTIVE ACCURACY of FREQUENCY
-----------------------------------
mean:  0.501
95% ci:  0.495 0.507

-----------------------------------
PREDICTIVE ACCURACY of W2V
-----------------------------------
mean:  0.781
95% ci:  0.778 0.785

