In [17]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import tqdm
import json
from collections import defaultdict

import Support.support_data as load
import Support.support_aggregation as aggr
import Support.support_prediction as pred

%matplotlib inline
import matplotlib.pyplot as plt

# Read Data

In [2]:
# Input files consumed by the loading pipeline.
glasgow_file = 'Data/words_glasgow.csv'        # features from the Glasgow Norms
plets_file = 'Data/4plets'                     # free associations from SWoW
freq_age_file = 'Data/LogFreqandAgeofAcq.txt'  # age of acquisition from Kuperman
freq_file = 'Data/word_freqs.txt'              # frequency from OpenSubtitles

# read_pipeline hands back six objects; unpack them in the order it yields them.
(
    filter_words,
    ordered_plets,
    feature_dicts,
    feature_names,
    new_norm_plets,
    freq_plets,
) = load.read_pipeline(glasgow_file, plets_file, freq_age_file, freq_file)

100%|███████████████████████████████████████████████████████████████████| 1070760/1070760 [00:00<00:00, 2126162.45it/s]
100%|█████████████████████████████████████████████████████████████████████████| 69817/69817 [00:00<00:00, 81684.21it/s]


In [4]:
# Sanity check: show the feature record stored for one word.
feature_dicts['dog']

{'length': [3],
 'arousal': [6.147],
 'valence': [7.067],
 'dominance': [6.242],
 'concreteness': [6.833],
 'semsize': [3.273],
 'gender': [4.606],
 'familiarity': [6.84],
 'polysemy': [8],
 'frequency': [10.3702985908827],
 'aoa': [2.8]}

In [6]:
# Alphabetically ordered list of every word that has a feature record.
nodes = sorted(feature_dicts)

# Graph and Hypergraph Creation

In [7]:
# Available values for `strategy`:
#   'all'  -> all words in the response are connected
#   'g1'   -> cue is connected to the first response
#   'g123' -> cue is connected to all the responses
g = load.create_graph(filter_words, ordered_plets, feature_dicts, strategy='g123')

# Keep only the nodes that belong to the filtered vocabulary.
to_rem = [word for word in g.nodes() if word not in filter_words]
g.remove_nodes_from(to_rem)

print('Graph')
print('N:', g.number_of_nodes(), 'L:', g.number_of_edges())

100%|████████████████████████████████████████████████████████████████████| 1129795/1129795 [00:05<00:00, 217793.36it/s]


Graph
N: 3586 L: 165690


In [8]:
# Build the hypergraph, then the neighbour lookup derived from it.
h = load.create_hypergraph(
    filter_words,
    new_norm_plets,
    feature_dicts,
    feature_names,
    freq_plets,
    glasgow_attrs=True,
)
nodes_in_he = load.hypergraph_neighbors(h)

# Report the number of nodes (N) and hyperedges (L).
print('Hypergraph')
print('N:', len(h.get_node_set()), 'L:', len(h.get_hyperedge_id_set()))

100%|█████████████████████████████████████████████████████████████████████████| 67600/67600 [00:00<00:00, 95677.22it/s]


Hypergraph
N: 3586 L: 67600


# Aggregation Strategies

In [9]:
# No aggregation
# Baseline table: presumably each word's own feature values, with no network
# information mixed in (the 'Non Network' row of the example below) -- confirm
# against aggr.no_aggr.
to_df_attr = aggr.no_aggr(nodes, feature_names, feature_dicts)

In [10]:
# Graph Ego-Network
# The same helper builds both tables; `w` selects the unweighted (False) or
# weighted (True) variant.
to_df_graph = aggr.graph_ego_net(g, feature_names, w=False)

# Weighted Graph Ego-Networks
to_df_graph_w = aggr.graph_ego_net(g, feature_names, w=True)

In [11]:
# Graph Community: Louvain and EVA
#to_df_evas = aggr.louv_eva(g, feature_names, feature_dicts)

In [12]:
# Graph Community: Lemon
# The community detection itself is left commented out so it is not repeated
# on every run. Uncomment the run_lemon line once to (presumably) create
# 'Data/lemon_plets.txt'; the lines after the guard only read that file back.
##### ONLY ONCE ####
# aggr.run_lemon(g, savefile='Data/lemon_plets.txt')
####################
lemon_plets = load.read_lemon_plets('Data/lemon_plets.txt')
to_df_lemon = aggr.lemon(g, lemon_plets, feature_names) 

100%|████████████████████████████████████████████████████████████████████████████| 3586/3586 [00:01<00:00, 1969.98it/s]


In [13]:
# Hypergraph Ego-Network
# Aggregation table computed on the hypergraph `h` instead of the graph `g`.
to_df_hyper = aggr.hypergraph_ego_net(h, feature_names)

### Aggregation Example

In [14]:
example = 'dog'
variable = 'length'
ind = nodes.index(example)  # position of the example word in `nodes`

# One (label, aggregation table) pair per strategy, printed in this order.
for label, table in [
    ('Non Network: ', to_df_attr),
    ('Graph Ego-Network: ', to_df_graph),
    ('Weighted Graph Ego-Network: ', to_df_graph_w),
    # ('Graph Community -- Louvain: ', to_df_evas[0]),
    # ('Graph Community -- EVA: ', to_df_evas[1]),
    ('Graph Community -- Lemon: ', to_df_lemon),
    ('Hypergraph Ego-Network: ', to_df_hyper),
]:
    print(label + str(table[variable][ind]))

Non Network: 3
Graph Ego-Network: 5.656934306569343
Weighted Graph Ego-Network: 1.870456204379562
Graph Community -- Lemon: 4.4375
Hypergraph Ego-Network: 4.50291757840992


# Prediction

In [15]:
# Arguments are positional. The two empty lists stand in for the Louvain and
# EVA tables, whose computation is commented out earlier in the notebook.
dfs, dfs_names = pred.format_dataframes (
    nodes, # list of words
    to_df_attr, # non-network
    to_df_graph_w, # weighted graph ego-network
    [], # placeholder for graph louvain (disabled)
    [], # placeholder for graph eva (disabled)
    #to_df_evas[0], # graph louvain
    #to_df_evas[1], # graph eva
    to_df_lemon, # graph lemon
    to_df_hyper # hypergraph ego-network
)

In [16]:
# Names returned alongside the formatted tables, one per aggregation strategy.
dfs_names

['attr', 'egonet', 'lemon', 'hego']

In [None]:
# Values the notebook lists for the `which` argument:
#   rf     -> random forest regressor
#   linear -> linear regression
#   ada    -> adaboost regressor
#   svr    -> support vector regressor

res_all = defaultdict(lambda: defaultdict(dict))

# Replace with `feature_names` to run the prediction for every feature.
targets = ['concreteness']
for to_pred in tqdm.tqdm(targets):
    print(to_pred)
    res = pred.ml_cv_pipeline(dfs, dfs_names, to_pred, which='rf', n_cv=10)
    res_all[to_pred] = res
    print(res)

In [None]:
#with open('Data/Predictions.json', 'w') as outfile:
#    json.dump(res_all, outfile)

# Visualization

In [None]:
# Draw one bar chart per feature comparing the stored prediction error of each
# aggregation strategy, and save it under Figures/bar_preds/.

# Number of cross-validation folds behind each stored STD. Keep this equal to
# the `n_cv` passed to pred.ml_cv_pipeline, otherwise the standard error drawn
# as the error bar is wrong.
n_folds = 10
evals = 'RMSE'

# (key in Predictions.json, tick label) for every aggregation strategy.
methods = [
    ('attr', 'Non-Net'),
    ('egonet', 'G:Ego-Net'),
    ('louv', 'G:Louvain'),
    ('eva', 'G:EVA'),
    ('lemon', 'G:Lemon'),
    ('hego', 'Hypergraph'),
]

# Context manager so the file handle is closed as soon as parsing is done.
with open('Data/Predictions.json') as infile:
    res_pred = json.load(infile)

for variable in feature_names:
    scores = res_pred.get(variable, {})

    # Plot only the strategies actually stored for this feature. The prediction
    # step may have been run without some of them (e.g. Louvain/EVA disabled),
    # and indexing a missing key would abort the whole loop with a KeyError.
    available = [(key, label) for key, label in methods if key in scores]
    if not available:
        continue  # nothing stored for this feature, so there is nothing to draw

    to_pl_m = [abs(scores[key][evals]['M']) for key, _ in available]
    # Standard error of the mean across the cross-validation folds.
    to_pl_std = [scores[key][evals]['STD'] / np.sqrt(n_folds) for key, _ in available]
    tick_labels = [label for _, label in available]
    positions = range(len(to_pl_m))

    plt.figure(figsize=(7,4.5))
    width = 0.52
    plt.bar(positions, to_pl_m, width, color='blue', alpha=0.5, )
    plt.errorbar(positions, to_pl_m, elinewidth=2, yerr=to_pl_std,
                 fmt='none',
                 color='k', alpha=0.8)

    # Print the rounded value just above each bar.
    for i, v in enumerate(to_pl_m):
        plt.text(i-0.18, round(v,2)+0.05, str(round(v,2)), fontsize=16)

    plt.ylim(0,max(to_pl_m)+0.2)
    plt.xticks(positions, tick_labels, rotation=30, fontsize=16)
    plt.yticks(fontsize=17)
    plt.ylabel(evals, fontsize=20)
    #plt.title(variable, fontsize=20)
    plt.tight_layout()

    plt.savefig('Figures/bar_preds/'+str(evals)+'_'+variable+'.png', bbox_inches='tight')

    #plt.show()

In [None]:
#_, _, _, _, df = load.load_glasgow_attributes(glasgow_file)

In [None]:
#import seaborn as sns

In [None]:
#sns.heatmap(df.corr(), annot=True)