**Note**: 

- The result below corresponds to Analysis 1 in the paper.
- Output: Predictive accuracy of association using USF (reported in Figure 2A).
- This result was transferred to the file CombinationModel_Log.ipynb for plotting Figure 2A.

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import load_npz, csr_matrix
from scipy.io import loadmat
from scipy.stats import pearsonr, spearmanr
import pickle as pk
from IPython.display import clear_output
import random
from sklearn.model_selection import train_test_split
from sklearn import linear_model

In [2]:
# Load the colexification matrices from the .mat file; `loadmat` returns a
# dict whose keys are the variable names stored in the file.
matrices = loadmat('data/usf_colexMats.mat')
# Shape of the '135' entry. Later cells allocate every per-entry binary matrix
# with this shape, so all matrices are assumed to share it — TODO confirm.
size = matrices['135'].shape
# Threshold applied to the summed binary matrices when selecting colexified
# pairs (presumably the minimum number of languages — verify against the paper).
min_NLang = 2

In [3]:
def printResult(list):
    """Print the mean of ``list`` and a mean +/- 1.96 * std interval.

    Parameters
    ----------
    list : sequence of numbers
        Values to summarise. The parameter name shadows the builtin ``list``;
        it is kept unchanged so existing keyword calls keep working.

    Notes
    -----
    The interval printed under the label '95% ci' is built from the standard
    deviation of the values (``np.std`` default, i.e. ddof=0), not from a
    standard error. Both lines are rounded to three decimals.
    """
    center = np.mean(list)
    half_width = 1.96 * np.std(list)
    print('mean: ', round(center, 3))
    print('95% ci: ', round(center - half_width, 3), round(center + half_width, 3))

# Define binary matrix

In [4]:
"""If there is one language that colexifies a pair: 
    entry = 1 
else:
    entry = 0""" 

# Type of a known matrix entry; used to skip the other items stored in the
# dict returned by loadmat.
matrix_type = type(matrices['207'])

individual_binarize = {}

for key, value in matrices.items():
    if type(value) != matrix_type:
        continue
    # Start from all zeros and flag every position where the source matrix
    # holds a nonzero value.
    binary = np.zeros(size)
    binary[np.nonzero(value)] = 1
    individual_binarize[key] = binary

In [5]:
# Element-wise sum of the per-entry 0/1 matrices: each cell counts how many of
# those matrices have a nonzero value at that position.
sumMat = sum(individual_binarize.values())

In [6]:
# Pairs that are identical in English
eng_ex = load_npz('data/usf_eng_ex.npz')
# Set of (row, col) positions of the nonzero entries of eng_ex; these pairs
# are removed from both the positive and the negative sets in later cells.
english = set(zip(*np.nonzero(eng_ex)))

In [7]:
# Positions whose summed count reaches the min_NLang threshold, as an (n, 2)
# array of [row, col] indices.
colexified = np.argwhere(sumMat >= min_NLang)
# Positive set: those positions as tuples, minus the English-identical pairs.
colex_set = {tuple(pair) for pair in colexified} - english

In [8]:
# Turn the set of (row, col) tuples into a pair of index arrays that can be
# used directly for NumPy fancy indexing.
colex_pairs = np.array(list(colex_set))
colex_ind = (colex_pairs[:, 0], colex_pairs[:, 1])

In [9]:
# Enumerate every (row, col) index pair of a matrix with shape `size`.
# np.indices(size) has shape (2, rows, cols); reshaping with -1 flattens it to
# (2, rows * cols). The previous hard-coded size[0]*size[0] was only correct
# for square matrices and raised ValueError otherwise.
all_ind = list(zip(*np.indices(size).reshape(2, -1)))

In [10]:
# Negative candidates: every index pair that is neither in colex_set nor in
# english. Pairs whose summed count is nonzero but below min_NLang are not in
# colex_set, so they remain in this set.
zero_set = set(all_ind) - colex_set - english

In [11]:
# Sparse matrix loaded from data/usf.npz; its entries are the single predictor
# fed to the logistic regression in the next cell (presumably USF association
# strengths, going by the file name — confirm).
mean_hbc_trim = load_npz('data/usf.npz')

In [12]:
# Perform Logistic Regression
#
# For each of N_REPS repetitions: draw as many negative pairs from zero_set as
# there are positive pairs in colex_set, then fit a one-feature logistic
# regression on N_SPLITS random 90/10 train/test splits and store the mean
# test accuracy of those splits in `overall`.
SEED = 0        # fixed seed so a fresh Restart & Run All reproduces the numbers
N_REPS = 100    # number of balanced negative re-samplings
N_SPLITS = 20   # random train/test splits per re-sampling

# Dedicated generators instead of the global random state. Passing the same
# RandomState instance to every train_test_split call advances it each time,
# so the splits differ from one another yet are reproducible as a sequence.
# NOTE: seeding changes which samples are drawn, so the third decimal may
# differ from numbers produced by earlier unseeded runs.
sampler = random.Random(SEED)
split_rng = np.random.RandomState(SEED)

# Loop-invariant work hoisted out of the repetition loop: the dense conversion
# and the list copy of zero_set were previously redone on every repetition.
dense_scores = mean_hbc_trim.toarray()
zero_pool = list(zero_set)
n_pos = len(colex_set)
pos_scores = dense_scores[colex_ind]

# 1-D label vector (positives first, then negatives). A column vector here
# makes scikit-learn emit a DataConversionWarning on every fit.
target = np.array([1] * n_pos + [0] * n_pos)

overall = []

for rep in range(N_REPS):
    clear_output(wait=True)
    print('Sample #: {}'.format(rep))

    # Balanced negative sample, converted to a (rows, cols) index pair.
    sample_zero = np.array(sampler.sample(zero_pool, n_pos))
    sample_zero = (sample_zero[:, 0], sample_zero[:, 1])

    # Single feature column: scores of the positives followed by the negatives.
    data = np.concatenate([pos_scores, dense_scores[sample_zero]]).reshape(-1, 1)

    acc = []  # Accuracy

    for fold in range(N_SPLITS):
        X_train, X_test, y_train, y_test = train_test_split(
            data, target, test_size=0.1, random_state=split_rng)

        logistic = linear_model.LogisticRegression()
        acc.append(logistic.fit(X_train, y_train).score(X_test, y_test))

    overall.append(np.mean(acc))

Sample #: 99


In [13]:
# Banner followed by the mean and interval of the per-repetition accuracies.
banner = '--------------------------'
print(banner, 'PREDICTIVE ACCURACY of USF', banner, sep='\n')
printResult(overall)

--------------------------
PREDICTIVE ACCURACY of USF
--------------------------
mean:  0.66
95% ci:  0.654 0.666
