In [1]:
import pandas as pd
import numpy as np

In [2]:
cra_data_raw = pd.read_csv('/Users/dlurie/Dropbox/Coursework/computational_models/final_project/144_cleaned.csv')

In [3]:
# Specify items to exclude based on specific experimental concerns.
exclude = [0,19]

In [4]:
cra_data_keep = cra_data_raw.drop(cra_data_raw.index[exclude])

In [5]:
cra_data_keep = cra_data_keep.dropna(axis=0)

In [6]:
cra_data_keep['in_lexicon'] = 'NaN'

In [7]:
sn_data = pd.read_csv('/Users/dlurie/Dropbox/Coursework/computational_models/final_project/nelson_fa_norms_nx_node_metrics.csv')

In [8]:
lexicon = pd.DataFrame(list(sn_data.Label.values), columns=['Label'])

In [9]:
lexicon['dtype'] = [type(x) for x in lexicon.Label]

In [10]:
lexicon[lexicon.dtype == float]

Unnamed: 0,Label,dtype
5521,,<type 'float'>


In [11]:
lexicon = lexicon.drop(lexicon.index[[5521]])

In [12]:
lexicon_list = [x.lower() for x in lexicon.Label.values]

In [13]:
for i in cra_data_keep.index:
    cra_data_keep.loc[i, 'in_lexicon'] = all([x in lexicon_list for x in cra_data_keep.loc[i][0:4].values])

In [14]:
np.unique(cra_data_keep.in_lexicon)

array([False, True], dtype=object)

In [15]:
cra_data = cra_data_keep[cra_data_keep['in_lexicon'] == True]

In [16]:
len(cra_data)

118

In [17]:
metrics = ['page_rank', 'closeness', 'betweenness', 'in_strength', 'out_strength', 'in_dc', 'out_dc', 'degree', 'strength']

In [18]:
new_columns = []
for col in ['cue1', 'cue2', 'cue3', 'solution']:
    for metric in metrics:
        col_name = '_'.join([col, metric])
        new_columns.append(col_name)

In [21]:
for i in new_columns:
    cra_data.loc[:,i] = 'NaN'

In [22]:
for i in cra_data.index:
    for col in ['cue1', 'cue2', 'cue3', 'solution']:
        word = cra_data.loc[i][col]
        word_data = sn_data[sn_data['Label'] == word.upper()]
        cra_data.loc[i, '_'.join([col, 'page_rank'])] = word_data['page_rank'].item()
        cra_data.loc[i, '_'.join([col, 'closeness'])] = word_data['closeness_centrality'].item()
        cra_data.loc[i, '_'.join([col, 'betweenness'])] = word_data['betweenness_centrality'].item()
        cra_data.loc[i, '_'.join([col, 'in_strength'])] = word_data['in_strength'].item()
        cra_data.loc[i, '_'.join([col, 'out_strength'])] = word_data['out_strength'].item()
        cra_data.loc[i, '_'.join([col, 'in_dc'])] = word_data['in_degree_centrality'].item()
        cra_data.loc[i, '_'.join([col, 'out_dc'])] = word_data['out_degree_centrality'].item()
        cra_data.loc[i, '_'.join([col, 'degree'])] = word_data['degree'].item()
        cra_data.loc[i, '_'.join([col, 'strength'])] = word_data['strength'].item()   

In [83]:
cra_data.to_csv('/Users/dlurie/Dropbox/Coursework/computational_models/final_project/cra_data_with_node_metrics.csv', index=False)

In [84]:
nodal_data = pd.read_csv('/Users/dlurie/Dropbox/Coursework/computational_models/final_project/cra_data_with_node_metrics.csv')

In [77]:
data_cols = ['2sec_pct_solving','7sec_pct_solving', '7sec_mean_time','15sec_pct_solving', '15sec_mean_time',
             '30sec_pct_solving', '30sec_mean_time','2sec_pct_solving','7sec_pct_solving', '7sec_mean_time', 
             '15sec_pct_solving', '15sec_mean_time','cue1_page_rank', 'cue1_closeness', 'cue1_betweenness',
             'cue1_in_strength', 'cue1_out_strength','cue1_in_dc', 'cue1_out_dc', 'cue1_degree', 
             'cue1_strength','cue2_page_rank', 'cue2_closeness','cue2_betweenness', 'cue2_in_strength',
             'cue2_out_strength','cue2_in_dc', 'cue2_out_dc',
             'cue2_degree', 'cue2_strength', 'cue3_page_rank', 'cue3_closeness','cue3_betweenness',
             'cue3_in_strength', 'cue3_out_strength', 'cue3_in_dc', 'cue3_out_dc', 'cue3_degree',
             'cue3_strength', 'solution_page_rank', 'solution_closeness','solution_betweenness', 
             'solution_in_strength','solution_out_strength', 'solution_in_dc', 'solution_out_dc',
             'solution_degree', 'solution_strength']

In [80]:
from scipy import stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("darkgrid")

In [None]:
f, ax = plt.subplots(figsize=(22, 22))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.corrplot(nodal_data[data_cols], annot=True, sig_stars=True, sig_corr=False,
             diag_names=False, cmap=cmap, ax=ax, cbar=False)
f.tight_layout()
f.savefig('/Users/dlurie/Dropbox/Coursework/computational_models/final_project/corr_mat_full.png')

###Calculate Path Lengths

In [24]:
import networkx as nx

In [36]:
fa_network = nx.read_graphml('/Users/dlurie/Dropbox/Coursework/computational_models/final_project/nelson_fa_norms_nx.graphml')

In [38]:
labels = nx.get_node_attributes(fa_network, 'label')

In [39]:
fa_relabel = nx.relabel_nodes(fa_network, labels)

In [63]:
path_length_cols = ['cue1_solution_dijkstra_spl', 'cue2_solution_dijkstra_spl', 'cue3_solution_dijkstra_spl',
                   'cue1_solution_standard_spl', 'cue2_solution_standard_spl', 'cue3_solution_standard_spl']

In [64]:
for i in path_length_cols:
    cra_data.loc[:,i] = 'NaN'

In [70]:
for i in cra_data.index:
    src = cra_data.loc[i, 'cue1']
    trgt = cra_data.loc[i, 'solution']
    try:
        dijkstra_spl = nx.dijkstra_path_length(fa_relabel, src.upper(), trgt.upper(), weight='weight')
        cra_data.loc[i, 'cue1_solution_dijkstra_spl'] = dijkstra_spl
    except:
        pass
    try:
        standard_spl = nx.shortest_path_length(fa_relabel, src.upper(), trgt.upper())
        cra_data.loc[i, 'cue1_solution_standard_spl'] = standard_spl
    except:
        pass

In [72]:
for i in cra_data.index:
    src = cra_data.loc[i, 'cue2']
    trgt = cra_data.loc[i, 'solution']
    try:
        dijkstra_spl = nx.dijkstra_path_length(fa_relabel, src.upper(), trgt.upper(), weight='weight')
        cra_data.loc[i, 'cue2_solution_dijkstra_spl'] = dijkstra_spl
    except:
        pass
    try:
        standard_spl = nx.shortest_path_length(fa_relabel, src.upper(), trgt.upper())
        cra_data.loc[i, 'cue2_solution_standard_spl'] = standard_spl
    except:
        pass

In [74]:
for i in cra_data.index:
    src = cra_data.loc[i, 'cue3']
    trgt = cra_data.loc[i, 'solution']
    try:
        dijkstra_spl = nx.dijkstra_path_length(fa_relabel, src.upper(), trgt.upper(), weight='weight')
        cra_data.loc[i, 'cue3_solution_dijkstra_spl'] = dijkstra_spl
    except:
        pass
    try:
        standard_spl = nx.shortest_path_length(fa_relabel, src.upper(), trgt.upper())
        cra_data.loc[i, 'cue3_solution_standard_spl'] = standard_spl
    except:
        pass

In [101]:
nodal_data['cue_page_rank_sum'] = nodal_data['cue1_page_rank'] + nodal_data['cue2_page_rank'] + nodal_data['cue3_page_rank']
nodal_data['cue_page_rank_mean'] = (nodal_data['cue1_page_rank'] + nodal_data['cue2_page_rank'] + nodal_data['cue3_page_rank']) / 3
nodal_data['cue_betweenness_sum'] = nodal_data['cue1_betweenness'] + nodal_data['cue2_betweenness'] + nodal_data['cue3_betweenness']
nodal_data['cue_betweenness_mean'] = (nodal_data['cue1_betweenness'] + nodal_data['cue2_betweenness'] + nodal_data['cue3_betweenness']) / 3
nodal_data['cue_closeness_sum'] = nodal_data['cue1_closeness'] + nodal_data['cue2_closeness'] + nodal_data['cue3_closeness']
nodal_data['cue_closeness_mean'] = (nodal_data['cue1_closeness'] + nodal_data['cue2_closeness'] + nodal_data['cue3_closeness']) / 3

In [105]:
nodal_data['mean_cue_solution_dijkstra_spl'] = (nodal_data['cue1_solution_dijkstra_spl'] + nodal_data['cue2_solution_dijkstra_spl'] + nodal_data['cue3_solution_dijkstra_spl']) / 3
nodal_data['mean_cue_solution_standard_spl'] = (nodal_data['cue1_solution_standard_spl'] + nodal_data['cue2_solution_standard_spl'] + nodal_data['cue3_solution_standard_spl']) / 3

In [108]:
test_cols = data_cols + path_length_cols + ['cue_page_rank_sum', 'cue_page_rank_mean', 'cue_closeness_sum', 'cue_closeness_mean', 'cue_betweenness_sum', 'cue_betweenness_mean', 'mean_cue_solution_dijkstra_spl','mean_cue_solution_standard_spl']

In [None]:
f, ax = plt.subplots(figsize=(24, 24))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.corrplot(nodal_data[test_cols], annot=True, sig_stars=True, sig_corr=False,
             diag_names=False, cmap=cmap, ax=ax, cbar=False)
f.tight_layout()
f.savefig('/Users/dlurie/Dropbox/Coursework/computational_models/final_project/corr_mat_full.png')

In [50]:
fa_relabel['EXPLAIN']['EXCUSE']

{'cost': 47.6666688911, 'weight': 0.02097902}

In [53]:
nx.dijkstra_path_length(fa_relabel, 'CREAM', 'LOSER', weight='weight')

0.071406152