The environment listed by the cell below is required for this notebook to run.

In [None]:
# Print the interpreter version, then list the packages installed in the
# `NLP37` conda environment (shell escape; requires conda on the PATH).
print(__import__('sys').version)
!conda list -n NLP37

https://www.lfd.uci.edu/~gohlke/pythonlibs/#python-igraph

In [1]:
%pylab qt

from tqdm import tqdm
from toolz import curry, compose
from itertools import repeat
from types import FunctionType
from itertools import combinations
from dateutil.parser import parse
from datetime import datetime as dt
from datetime import timedelta
from dateutil.tz import tz
from matplotlib.ticker import StrMethodFormatter
from collections import Counter

import os
import re
import pytz
import community
import networkx as nx
import igraph as ig
import pandas as pd
import math

try:
    import cPickle as pickle
except:
    import pickle

Populating the interactive namespace from numpy and matplotlib


In [2]:
def save_pickle(filename, data):
    """Pickle ``data`` and write it to ``filename``.

    A progress message is printed first; the path is normalised with
    ``os.path.normpath`` before the file is opened in binary write mode.
    """
    print('Pickling data...')
    target = os.path.normpath(filename)
    with open(target, 'wb') as sink:
        pickle.dump(data, sink)

def load_pickle(filename):
    """Read and return the pickled object stored at ``filename``.

    A progress message is printed first; the path is normalised with
    ``os.path.normpath``. Unpickling can execute arbitrary code, so only
    load files you trust.
    """
    print('Loading pickled data...')
    source = os.path.normpath(filename)
    with open(source, 'rb') as stream:
        payload = pickle.load(stream)
    return payload

In [4]:
def unpack_input(data):
    """Group documents by their ``'datetime'`` value.

    Returns a dict mapping each distinct ``doc['datetime']`` to
    ``{'docs': [...]}``, with documents kept in input order.
    """
    grouped = {}
    for document in data:
        bucket = grouped.setdefault(document['datetime'], {'docs': []})
        bucket['docs'].append(document)
    return grouped

In [5]:
def calculate_node_degree(data):
    """Accumulate per-date weighted degree for every node.

    For each date, every edge of every document adds its ``'weight'`` to
    the entry of each of its endpoints in ``data[date]['nodes']`` (which
    must already contain those endpoints). ``data`` is modified in place
    and returned.
    """
    for day in data.values():
        for doc in day['docs']:
            for edge in doc['edges']:
                for endpoint in edge['nodes']:
                    day['nodes'][endpoint] += edge['weight']
    return data
            

def sentence_significance(omissions, data):
    """Build per-document graphs weighted by sentence-level significance.

    For every document, a node is created for each distinct entity word
    whose tag is not in ``omissions``. Within each sentence, every pair of
    distinct entities is linked; the edge weight grows by the sum of the
    two entities' significance in that sentence (as returned by
    ``compute_significance`` over ``entity_counter``). Any existing
    ``doc['nodes']`` / ``doc['edges']`` are replaced. ``data`` is modified
    in place and returned.
    """
    for day in data.values():
        for doc in day['docs']:

            # Flatten all sentences to find every entity in the document.
            mentions = []
            for sentence in doc['sentences']:
                mentions.extend(sentence)

            words = {m['word'] for m in mentions if m['tag'] not in omissions}
            nodes = {word: {'weight': 0, 'neighbours': {}} for word in words}
            edges = []

            for sentence in doc['sentences']:
                sig, _ = compute_significance(entity_counter(sentence, omissions))

                for left, right in combinations(list(sig.keys()), 2):
                    if left == right:
                        continue

                    score = sig[left] + sig[right]
                    known = nodes[left]['neighbours']
                    if right in known:
                        known[right]['weight'] += score
                    else:
                        # One edge dict is shared by both endpoints and the edge list.
                        link = {'weight': score, 'nodes': [left, right]}
                        known[right] = link
                        nodes[right]['neighbours'][left] = link
                        edges.append(link)

            doc['nodes'] = nodes
            doc['edges'] = edges
    return data

def document_significance(omissions, data):
    """Add document-level significance edges to each document's graph.

    Significance is computed once over all entities of the document
    (``compute_significance`` over ``entity_counter``). Every pair of
    distinct entity words whose tags are not in ``omissions`` is linked,
    and the edge weight grows by the sum of the two significances.
    Existing ``doc['nodes']`` / ``doc['edges']`` are extended rather than
    replaced. ``data`` is modified in place and returned.
    """
    for day in data.values():
        for doc in day['docs']:

            mentions = []
            for sentence in doc['sentences']:
                mentions.extend(sentence)

            if 'nodes' not in doc:
                doc['nodes'] = {}
                doc['edges'] = []

            sig, _ = compute_significance(entity_counter(mentions, omissions))

            words = {m['word'] for m in mentions if m['tag'] not in omissions}

            for word in words:
                if word not in doc['nodes']:
                    doc['nodes'][word] = {'weight': 0, 'neighbours': {}}

            for left, right in combinations(words, 2):
                # Entities missing from the significance table contribute 0.
                score = (sig[left] if left in sig else 0) + (sig[right] if right in sig else 0)

                known = doc['nodes'][left]['neighbours']
                if right in known:
                    known[right]['weight'] += score
                else:
                    link = {'weight': score, 'nodes': [left, right]}
                    known[right] = link
                    doc['nodes'][right]['neighbours'][left] = link
                    doc['edges'].append(link)
    return data

def sentence_co_occurrence(omissions, data):
    """Build per-document graphs weighted by sentence co-occurrence counts.

    Each document's ``'nodes'`` and ``'edges'`` are reset. Within every
    sentence, each pair of entity mentions (tags not in ``omissions``)
    with different words adds 1 to the weight of the edge between them;
    repeated mentions therefore count multiple times. ``data`` is modified
    in place and returned.
    """
    for day in data.values():
        for doc in day['docs']:

            doc['nodes'] = {}
            doc['edges'] = []

            for sentence in doc['sentences']:
                words = [tok['word'] for tok in sentence if tok['tag'] not in omissions]

                for word in set(words):
                    if word not in doc['nodes']:
                        doc['nodes'][word] = {'weight': 0, 'neighbours': {}}

                for left, right in combinations(words, 2):
                    if left == right:
                        continue
                    known = doc['nodes'][left]['neighbours']
                    if right in known:
                        known[right]['weight'] += 1
                    else:
                        # One edge dict is shared by both endpoints and the edge list.
                        link = {'weight': 1, 'nodes': [left, right]}
                        known[right] = link
                        doc['nodes'][right]['neighbours'][left] = link
                        doc['edges'].append(link)
    return data

def document_co_occurrence(omissions, data):
    """Add document-level co-occurrence counts to each document's graph.

    All entity mentions of a document (tags not in ``omissions``) are
    pooled; each pair of mentions with different words adds 1 to the
    weight of the edge between them. Existing ``doc['nodes']`` /
    ``doc['edges']`` are extended rather than replaced. ``data`` is
    modified in place and returned.
    """
    for day in data.values():
        for doc in day['docs']:

            mentions = []

            if 'nodes' not in doc:
                doc['nodes'] = {}
                doc['edges'] = []

            for sentence in doc['sentences']:
                mentions.extend(sentence)

            words = [m['word'] for m in mentions if m['tag'] not in omissions]

            for word in set(words):
                if word not in doc['nodes']:
                    doc['nodes'][word] = {'weight': 0, 'neighbours': {}}

            for left, right in combinations(words, 2):
                if left == right:
                    continue
                known = doc['nodes'][left]['neighbours']
                if right in known:
                    known[right]['weight'] += 1
                else:
                    # One edge dict is shared by both endpoints and the edge list.
                    link = {'weight': 1, 'nodes': [left, right]}
                    known[right] = link
                    doc['nodes'][right]['neighbours'][left] = link
                    doc['edges'].append(link)
    return data

def compute_significance(counter):
    """Turn entity counts into relative frequencies.

    Returns ``(sig, n_ent)`` where ``n_ent`` is the sum of all counts and
    ``sig`` maps each entity to ``count / n_ent``. An empty counter yields
    ``({}, 0)``.
    """
    total = sum(counter.values())
    shares = {entity: count / total for entity, count in counter.items()}
    return shares, total

def entity_counter(ents, omissions):
    """Count occurrences of each entity word, skipping omitted tags.

    ``ents`` is an iterable of dicts with ``'word'`` and ``'tag'`` keys;
    entries whose tag is in ``omissions`` are ignored. Returns a dict
    mapping word to count, in first-seen order.
    """
    tally = {}
    for mention in ents:
        if mention['tag'] in omissions:
            continue
        word = mention['word']
        tally[word] = tally.get(word, 0) + 1
    return tally

def document_entities(data):
    """Store the flattened entity list of each document under ``doc['ents']``.

    The entities of all sentences are concatenated in order. ``data`` is
    modified in place and returned.
    """
    for day in data.values():
        for doc in day['docs']:
            doc['ents'] = flat = []
            for sentence in doc['sentences']:
                flat.extend(sentence)
    return data
                    

def aggregate_dictionary_nodes_and_edges(data):
    """Collect each date's nodes and edges from its documents.

    For every date, ``data[date]['nodes']`` becomes a dict mapping every
    node name seen in that date's documents to 0, and
    ``data[date]['edges']`` becomes the concatenation of the documents'
    edge lists. ``data`` is modified in place and returned.
    """
    for day in data.values():
        day_nodes = {}
        day_edges = []
        for doc in day['docs']:
            day_nodes.update(dict.fromkeys(doc['nodes'], 0))
            day_edges.extend(doc['edges'])
        day['nodes'] = day_nodes
        day['edges'] = day_edges

    return data

def list_data_structure(data):
    """Flatten the date-keyed structure into a single list of documents."""
    return [doc for day in data.values() for doc in day['docs']]

def strip_content(data):
    """Remove the ``'sentences'`` key from every document.

    Each date's ``'docs'`` list is replaced by a new list holding the same
    (now stripped) document objects. Raises ``KeyError`` if a document has
    no ``'sentences'`` key. ``data`` is modified in place and returned.
    """
    for day in data.values():
        stripped = []
        for doc in day['docs']:
            del doc['sentences']
            stripped.append(doc)

        day['docs'] = stripped
    return data

def filter_dates(data, start='31 Dec 2016', end='01 Jan 2018'):
    """Keep only the entries whose date key lies strictly between two dates.

    ``start`` and ``end`` are strings in ``'%d %b %Y'`` format; both
    bounds are exclusive. Returns a new dict that references the same
    values as ``data``.
    """
    date_format = '%d %b %Y'
    lower = dt.strptime(start, date_format)
    upper = dt.strptime(end, date_format)

    return {day: content for day, content in data.items() if lower < day < upper}
    

In [5]:
# Output root for this experiment (Windows-style relative path).
folder = r'demo\output\person'

# One file-name stem per co-occurrence rule. Each stem starts with a
# backslash because it is concatenated directly onto a folder path below.
# The order must match `funcs`.
fnames = [
         r'\01 sent sig',
         r'\02 doc sig',
         r'\03 doc sent sig',
         r'\04 sent count',
         r'\05 doc count',
         r'\06 doc sent count',
         ]

# Entity tags excluded when building the graphs.
omitted_tags = set(['location', 'organization'])

def _build_pipeline(*rules):
    """Compose the graph-building pipeline around the given co-occurrence rules.

    ``compose`` applies its arguments right to left, so the resulting
    function runs ``unpack_input`` -> ``filter_dates`` -> the rules (the
    last one listed runs first) -> ``aggregate_dictionary_nodes_and_edges``
    -> ``calculate_node_degree``.
    """
    return compose(calculate_node_degree,
                   aggregate_dictionary_nodes_and_edges,
                   *rules,
                   filter_dates,
                   unpack_input)


# One pipeline per entry of `fnames`, in the same order. Where two rules are
# combined, the sentence-level rule runs first (it resets each document's
# nodes/edges) and the document-level rule then adds to that graph.
funcs = [
    # 01 sent sig
    _build_pipeline(curry(sentence_significance)(omitted_tags)),
    # 02 doc sig
    _build_pipeline(curry(document_significance)(omitted_tags)),
    # 03 doc sent sig
    _build_pipeline(curry(document_significance)(omitted_tags),
                    curry(sentence_significance)(omitted_tags)),
    # 04 sent count
    _build_pipeline(curry(sentence_co_occurrence)(omitted_tags)),
    # 05 doc count
    _build_pipeline(curry(document_co_occurrence)(omitted_tags)),
    # 06 doc sent count
    _build_pipeline(curry(document_co_occurrence)(omitted_tags),
                    curry(sentence_co_occurrence)(omitted_tags)),
]

The below cell will generate graphs using Python data structures

In [6]:
# Run each pipeline in `funcs` on the raw data and save the result twice:
# as the date-keyed dict (' dic.pkl') and as a flat document list (' list.pkl').
for i, f in enumerate(fnames):
    # The raw pickle is reloaded on every iteration; the pipeline functions
    # write 'nodes'/'edges' onto the documents in place.
    data = load_pickle(r'data\wiki.pkl')
    data = funcs[i](data)
    save_pickle(folder + r'\graphs' + f + r' dic.pkl', data)
    lis = list_data_structure(data)
    # Drop references as soon as they are no longer needed.
    del data
    save_pickle(folder + r'\graphs' + f + r' list.pkl', lis)
    del lis

Loading pickled data...
Pickling data...
Pickling data...
Loading pickled data...
Pickling data...
Pickling data...
Loading pickled data...
Pickling data...
Pickling data...
Loading pickled data...
Pickling data...
Pickling data...
Loading pickled data...
Pickling data...
Pickling data...
Loading pickled data...
Pickling data...
Pickling data...


In [10]:
# NOTE(review): hard-coded absolute path to one user's machine; this cell will
# not run elsewhere as written.
data = load_pickle(r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\08 Network\demo\output\person\graphs\02 doc sig dic.pkl')

### Networkx graph stuff

In [6]:
def build_graph(data, use_node_weight=True):
    """Build an undirected NetworkX graph from aggregated nodes and edges.

    Args:
        data: dict with ``'nodes'`` (iterable of node names; when
            ``use_node_weight`` is true it must map each name to a dict
            with a ``'weight'`` entry) and ``'edges'`` (list of dicts with
            ``'nodes'`` -- a two-element sequence -- and ``'weight'``).
        use_node_weight: when true, store each node's weight as the
            ``weight`` attribute and as ``viz['size']``.

    Returns:
        ``nx.Graph`` in which the weights of repeated edges are summed.
    """
    print('Building graph...')
    graph = nx.Graph()
    for node in data['nodes']:
        if use_node_weight:
            weight = data['nodes'][node]['weight']
            # `graph.nodes[...]` replaces the `graph.node[...]` accessor,
            # which was removed in NetworkX 2.4.
            if node not in graph:
                graph.add_node(node, weight=weight)
                graph.nodes[node]['viz'] = {'size': weight}
            else:
                graph.nodes[node]['weight'] += weight
                graph.nodes[node]['viz']['size'] += weight
        else:
            if node not in graph:
                graph.add_node(node)

    for edge in tqdm(data['edges']):
        if graph.has_edge(*edge['nodes']):
            graph[edge['nodes'][0]][edge['nodes'][1]]['weight'] += edge['weight']
        else:
            graph.add_edge(*edge['nodes'], weight=edge['weight'])
    return graph

def aggregate_list_nodes_and_edges(lis):
    """Merge the nodes and edges of a list of documents.

    Args:
        lis: iterable of documents, each with ``'nodes'`` (dict mapping a
            node name to a dict that has a ``'weight'`` entry) and
            ``'edges'`` (list of edge dicts).

    Returns:
        ``{'nodes': {name: {'weight': total}}, 'edges': [...]}`` where
        node weights are summed over documents and edge lists are
        concatenated. The input documents are not modified.

    Note:
        Every node of every document is reported, including nodes that
        have no edges, so ``build_graph`` will add those as isolated nodes.
    """
    output = {'nodes': {}, 'edges': []}
    for doc in lis:
        for name, attrs in doc['nodes'].items():
            if name in output['nodes']:
                output['nodes'][name]['weight'] += attrs['weight']
            else:
                # Fresh dict so accumulation never writes back into the document.
                output['nodes'][name] = {'weight': attrs['weight']}
        output['edges'].extend(doc['edges'])
    return output

def save_as_gexf(filename, graph):
    """Write ``graph`` to ``filename`` (path normalised) in GEXF format."""
    target = os.path.normpath(filename)
    nx.write_gexf(graph, target)
    
def save_as_graphml(filename, graph):
    """Write ``graph`` to ``filename`` (path normalised) in GraphML format."""
    target = os.path.normpath(filename)
    nx.write_graphml(graph, target)

In [7]:
def datetime_filter(data, start, end):
    """Return the documents whose ``'datetime'`` lies in ``[start, end)``.

    Args:
        data: iterable of documents with a ``'datetime'`` entry.
        start: date string, parsed with ``dateutil.parser.parse`` in fuzzy
            mode; inclusive lower bound.
        end: date string parsed the same way; exclusive upper bound.

    Returns:
        List of the matching documents, in input order.

    The parsed bounds carry a timezone only if the string specifies one,
    so they must be comparable with the documents' datetimes.
    """
    # fuzzy_with_tokens=True returns (datetime, skipped_tokens); keep the datetime.
    lower = parse(start, fuzzy_with_tokens=True)[0]
    upper = parse(end, fuzzy_with_tokens=True)[0]

    return [doc for doc in data if lower <= doc['datetime'] < upper]

In [8]:
def plot_hists(input_, colour='r', n_bins=21, use_logx=True, **kwargs):
    '''
    Plot a histogram of a 1D array of values on log-log axes.

    Args:
        input_   - 1D array-like of values (e.g. node degrees).
        colour   - kept for backward compatibility; currently unused, the
                   bars are coloured from the 'twilight_shifted' colormap.
        n_bins   - number of bin edges to generate (reduced for short inputs).
        use_logx - if True the bin edges are log-spaced between the smallest
                   and largest positive value, otherwise linearly spaced.
        **kwargs - title: text shown as the legend title.
                   fname: if given, the figure is saved to this path.
                   show:  if true, show() is called.

    Returns:
        None. A new figure is created as a side effect.

    Raises:
        ValueError - if use_logx is True and input_ has no positive values.
    '''

    style.use('seaborn')

    values = np.asarray(input_)

    if n_bins > len(values):
        # Never go below two edges, i.e. a single bin.
        n_bins = max(len(values) - 2, 2)

    if use_logx:
        # log10 is undefined for values <= 0; such values fall outside the
        # bin range and are therefore not counted.
        positive = values[values > 0]
        if positive.size == 0:
            raise ValueError('plot_hists needs at least one positive value when use_logx=True')
        bins = logspace(log10(positive.min()), log10(positive.max()), n_bins)
    else:
        bins = linspace(values.min(), values.max(), n_bins)

    figure()

    ax = gca()

    _, bins, patches = hist(values,
                            bins,
                            ec='k',
                            rwidth=0.9,
                            )

    # Position of each bin on a log scale, used only to pick its colour.
    bin_centers = 0.5 * log10(bins[:-1] + bins[1:])

    col = bin_centers - min(bin_centers)
    span = max(col)
    if span > 0:
        # Normalise to [0, 1]; skipped for a single bin to avoid 0 / 0.
        col /= span

    cm = plt.cm.get_cmap('twilight_shifted')

    for c, p in zip(col, patches):
        plt.setp(p, 'facecolor', cm(c))

    ax.set_xscale("log")
    ax.set_yscale("log")

    xticks(fontsize=14)
    yticks(fontsize=14)
    # The legend carries no handles; it is used only to display the title.
    legend(title=kwargs.get('title', ''), title_fontsize=16)
    fname = kwargs.get('fname', None)
    if fname is not None:
        savefig(fname)
    if kwargs.get('show', False):
        show()
        
def assign_partition(graph, partition):
    """Store each node's cluster id as its ``'Cluster'`` attribute.

    Args:
        graph: NetworkX graph containing every node in ``partition``.
        partition: dict mapping node to cluster id.

    Returns:
        The same ``graph`` object, modified in place.
    """
    for node, cluster in partition.items():
        # `graph.nodes[...]` replaces `graph.node[...]`, removed in NetworkX 2.4.
        graph.nodes[node]['Cluster'] = cluster
    return graph

In [9]:
def counter_cosine_similarity(c1, c2):
    """Cosine similarity between two count mappings.

    Args:
        c1, c2: dict-like objects (e.g. ``collections.Counter``) mapping
            terms to numeric counts; missing terms count as 0.

    Returns:
        The cosine of the angle between the two count vectors, or ``0.0``
        when either vector has zero magnitude (empty or all-zero), where
        the cosine is undefined.
    """
    terms = set(c1).union(c2)
    dotprod = sum(c1.get(k, 0) * c2.get(k, 0) for k in terms)
    mag1 = math.sqrt(sum(c1.get(k, 0)**2 for k in terms))
    mag2 = math.sqrt(sum(c2.get(k, 0)**2 for k in terms))
    if mag1 == 0 or mag2 == 0:
        # Avoid ZeroDivisionError for empty / all-zero inputs.
        return 0.0
    return dotprod / (mag1 * mag2)

In [14]:
# Human-readable label for each co-occurrence rule, in the same order as `fnames`.
labels = ['sentence significance', 
          'document significance', 
          'sentence and document significance',
          'sentence count',
          'document count', 
          'sentence and document count'
          ]

#### Build graphs and save as .graphml files

In [12]:
# For each rule: load the flat document list, merge it into one graph and
# write the graph to <folder>\graphml as GraphML.
for fname in fnames:
    data = load_pickle(folder + r'\graphs' + fname + ' list.pkl')
    graph = build_graph(aggregate_list_nodes_and_edges(data), use_node_weight=False)
    # Drop references as soon as they are no longer needed.
    del data
    save_as_graphml(folder + r'\graphml' + fname + r'.graphml', graph)
    del graph

Loading pickled data...
Building graph...


100%|██████████████████████████████████████████████████████████████████████| 132975/132975 [00:00<00:00, 152474.84it/s]


Loading pickled data...
Building graph...


100%|██████████████████████████████████████████████████████████████████████| 824427/824427 [00:04<00:00, 181758.84it/s]


Loading pickled data...
Building graph...


100%|██████████████████████████████████████████████████████████████████████| 824427/824427 [00:04<00:00, 181289.30it/s]


Loading pickled data...
Building graph...


100%|██████████████████████████████████████████████████████████████████████| 132975/132975 [00:00<00:00, 165186.22it/s]


Loading pickled data...
Building graph...


100%|██████████████████████████████████████████████████████████████████████| 824427/824427 [00:04<00:00, 187374.33it/s]


Loading pickled data...
Building graph...


100%|██████████████████████████████████████████████████████████████████████| 824427/824427 [00:04<00:00, 171947.25it/s]


#### This cell will run the 'effect of co-occurrence rule on degree' experiment

In [13]:
# Plot a histogram of weighted degree (strength) for each rule's graph and save
# it under demo\figs. The file name includes the entity types that were NOT
# omitted (the set difference against `omitted_tags`).
for fname, label in zip(fnames, labels):
    graph = ig.read(folder + r'\graphml' + fname + r'.graphml')
    plot_hists(
    array(graph.strength(weights=graph.es['weight'])), 
    use_logx=True, 
    fname=r'demo\figs' \
    + fname \
    + ' ' + ' '.join({'person', 'location', 'organization'} - omitted_tags)
    + ' (weighted).png',
    title=label)

No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.


#### The cell below ranks the nodes by degree centrality (unweighted and weighted) for each co-occurrence rule, and saves the top 50 as a CSV file and the full ranking as a pickle in the `degree` output folder

In [18]:
# Output paths: top-50 table (CSV) and full ranking (pickle) per rule.
t50_savenames = [folder + r'\degree%s top 50.csv' % f for f in fnames]
pkl_savenames = [folder + r'\degree%s degree.pkl' % f for f in fnames]

for fname, t50, pkl in zip(fnames, t50_savenames, pkl_savenames):
    graph = ig.read(folder + r'\graphml' + fname + r'.graphml')
    # Unweighted degree, sorted descending (argsort of the negated values).
    unweighted_degree = array(graph.degree())
    idx = argsort(-unweighted_degree)
    unweighted_degree = unweighted_degree[idx]
    unweighted_ents = array([i['id'] for i in list(graph.vs)])[idx]
    
    # Weighted degree (strength), sorted the same way.
    weighted_degree = array(graph.strength(weights=graph.es['weight']))
    idx = argsort(-weighted_degree)
    weighted_degree = weighted_degree[idx]
    weighted_ents = array([i['id'] for i in list(graph.vs)])[idx]
    
    # Rows are [names, values, names, values]; transposed so the CSV has them as columns.
    df = pd.DataFrame([unweighted_ents[: 50], unweighted_degree[: 50], weighted_ents[: 50], weighted_degree[: 50]])
    df.T.to_csv(t50)
    save_pickle(pkl, pd.DataFrame([unweighted_ents, unweighted_degree, weighted_ents, weighted_degree]))
    # Drop references before the next graph is loaded.
    del graph
    del idx
    del weighted_degree
    del unweighted_degree
    del weighted_ents
    del unweighted_ents
    del df

Pickling data...
Pickling data...
Pickling data...
Pickling data...
Pickling data...
Pickling data...


#### The cell below ranks the nodes by PageRank centrality (unweighted and weighted) for each co-occurrence rule, and saves the top 50 as a CSV file and the full ranking as a pickle in the `page rank` output folder

In [13]:
# Output paths: top-50 table (CSV) and full ranking (pickle) per rule.
t50_savenames = [folder + r'\page rank%s top 50.csv' % f for f in fnames]
pkl_savenames = [folder + r'\page rank%s page rank.pkl' % f for f in fnames]

for fname, t50, pkl in zip(fnames, t50_savenames, pkl_savenames):
    graph = ig.read(folder + r'\graphml' + fname + r'.graphml')
    # Unweighted PageRank, sorted descending (argsort of the negated values).
    unweighted_pagerank = array(graph.pagerank())
    idx = argsort(-unweighted_pagerank)
    unweighted_pagerank = unweighted_pagerank[idx]
    unweighted_ents = array([i['id'] for i in list(graph.vs)])[idx]
    
    # PageRank using the edge weights, sorted the same way.
    weighted_pagerank = array(graph.pagerank(weights=graph.es['weight']))
    idx = argsort(-weighted_pagerank)
    weighted_pagerank = weighted_pagerank[idx]
    weighted_ents = array([i['id'] for i in list(graph.vs)])[idx]
    
    # Rows are [names, values, names, values]; transposed so the CSV has them as columns.
    df = pd.DataFrame([unweighted_ents[: 50], unweighted_pagerank[: 50], weighted_ents[: 50], weighted_pagerank[: 50]])
    df.T.to_csv(t50)
    save_pickle(pkl, pd.DataFrame([unweighted_ents, unweighted_pagerank, weighted_ents, weighted_pagerank]))
    # Drop references before the next graph is loaded.
    del graph
    del idx
    del weighted_pagerank
    del unweighted_pagerank
    del weighted_ents
    del unweighted_ents
    del df

Pickling data...
Pickling data...
Pickling data...
Pickling data...
Pickling data...
Pickling data...


In [14]:
# Output paths: top-50 table (CSV) and full ranking (pickle) per rule.
t50_savenames = [folder + r'\eigen%s top 50.csv' % f for f in fnames]
pkl_savenames = [folder + r'\eigen%s eigen.pkl' % f for f in fnames]

for fname, t50, pkl in zip(fnames, t50_savenames, pkl_savenames):
    
    graph = ig.read(folder + r'\graphml' + fname + r'.graphml')
    # Unweighted eigenvector centrality, sorted descending.
    unweighted_eigen = array(graph.evcent())
    idx = argsort(-unweighted_eigen)
    unweighted_eigen = unweighted_eigen[idx]
    unweighted_ents = array([i['id'] for i in list(graph.vs)])[idx]
    
    # Eigenvector centrality using the edge weights, sorted the same way.
    weighted_eigen = array(graph.evcent(weights=graph.es['weight']))
    idx = argsort(-weighted_eigen)
    weighted_eigen = weighted_eigen[idx]
    weighted_ents = array([i['id'] for i in list(graph.vs)])[idx]
    
    # Rows are [names, values, names, values]; transposed so the CSV has them as columns.
    df = pd.DataFrame([unweighted_ents[: 50], unweighted_eigen[: 50], weighted_ents[: 50], weighted_eigen[: 50]])
    df.T.to_csv(t50)
    save_pickle(pkl, pd.DataFrame([unweighted_ents, unweighted_eigen, weighted_ents, weighted_eigen]))
    # Drop references before the next graph is loaded.
    del graph
    del idx
    del weighted_eigen
    del unweighted_eigen
    del weighted_ents
    del unweighted_ents
    del df

Pickling data...
Pickling data...
Pickling data...
Pickling data...
Pickling data...
Pickling data...


In [15]:
# Redefines `fnames`, `folder` and `labels` for the scatter plots below,
# keeping only the four rules that include a sentence-level component.
# NOTE: this overwrites the six-entry lists defined earlier in the notebook.
fnames = [
          r'\01 sent sig',
          r'\03 doc sent sig',
          r'\04 sent count',
          r'\06 doc sent count',
         ]

folder = r'demo\output\person'

labels = ['sentence significance',
          'sentence and document significance',
          'sentence count',
          'sentence and document count'
          ]

# Starts empty; the only statements that would append to it below are commented out.
data = []

In [16]:

# 4x2 grid: one row per rule; left column unweighted, right column weighted.
fig, axs = subplots(4, 2, figsize=(9, 16), sharey=True)

for i, fname, lab in zip(range(4), fnames, labels):
    
    style.use('seaborn')
    graph = ig.read(folder + r'\graphml' + fname + r'.graphml')

    # strength() is called without weights here, evcent() likewise.
    degrees = array(graph.strength())
    eigen = array(graph.evcent())
    axs[i, 0].scatter(degrees, eigen, s=3)
    # The legend has no handles; it is used only to show the title text.
    axs[i, 0].legend(title=lab, title_fontsize=11)
    
    weighted_degrees = array(graph.strength(weights=graph.es['weight']))
    weighted_eigen = array(graph.evcent(weights=graph.es['weight']))
    #page_ranks = array(graph.pagerank())
    axs[i, 1].scatter(weighted_degrees, weighted_eigen, s=3)
    axs[i, 1].legend(title=lab + ' (weighted)', title_fontsize=12)
    
    #data.append([[degrees, eigen], [weighted_degrees, weighted_eigen]])

subplots_adjust(wspace=0.1, hspace=0.2)
# ylabel/xlabel act on the current axes only; the label coordinates below
# reposition them by hand relative to that axes.
ylabel('eigenvector centrality', fontsize=14)
xlabel('degree', fontsize=14)
gca().yaxis.set_label_coords(-1.23, 2.2)
gca().xaxis.set_label_coords(-0.03, -0.3)


No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.


In [None]:

# 4x2 grid: one row per rule; left column unweighted, right column weighted.
fig, axs = subplots(4, 2, figsize=(9, 16), sharey=True)

for i, fname, lab in zip(range(4), fnames, labels):

    style.use('seaborn')
    graph = ig.read(folder + r'\graphml' + fname + r'.graphml')

    # Plot the freshly computed arrays directly. The previous version indexed
    # into the global `data` list, which is never filled, and so raised
    # IndexError on the first iteration.
    degrees = array(graph.strength())
    eigen = array(graph.evcent())
    axs[i, 0].scatter(degrees, eigen, s=3)
    # The legend has no handles; it is used only to show the title text.
    axs[i, 0].legend(title=lab, title_fontsize=12)

    weighted_degrees = array(graph.strength(weights=graph.es['weight']))
    weighted_eigen = array(graph.evcent(weights=graph.es['weight']))
    axs[i, 1].scatter(weighted_degrees, weighted_eigen, s=3)
    axs[i, 1].legend(title=lab + ' (weighted)', title_fontsize=11)

subplots_adjust(wspace=0.1, hspace=0.2)
# ylabel/xlabel act on the current axes only; the label coordinates below
# reposition them by hand relative to that axes.
ylabel('eigenvector centrality', fontsize=14)
xlabel('degree', fontsize=14)
gca().yaxis.set_label_coords(-1.23, 2.2)
gca().xaxis.set_label_coords(-0.03, -0.3)


In [None]:
# File-name stems for the 'per org' document lists, one per co-occurrence rule.
# NOTE: this overwrites the `fnames` / `labels` defined earlier in the notebook.
fnames = [
         r'\01 sent sig list per org',
         r'\02 doc sig list per org',
         r'\03 doc sent sig list per org',
         r'\04 sent count list per org',
         r'\05 doc count list per org',
         r'\06 doc sent count list per org',
         ]


# Human-readable label for each rule, in the same order as `fnames`.
labels = ['sentence significance', 
          'document significance', 
          'sentence and document significance',
          'sentence count',
          'document count', 
          'sentence and document count'
          ]

In [None]:
# For each rule: load the document list from the `data` folder, merge it into
# one graph and write it to the `graphml` folder (paths relative to the
# working directory; each stem in `fnames` starts with a backslash).
for fname in fnames:
    data = load_pickle(r'data' + fname + '.pkl')
    graph = build_graph(aggregate_list_nodes_and_edges(data), use_node_weight=False)
    del data
    save_as_graphml(r'graphml' + fname + r'.graphml', graph)
    del graph

In [None]:
# Plot the unweighted degree histogram of each graph after deleting all edges
# whose weight is 0.
# NOTE(review): the output path is an absolute path on one user's machine.
for fname, label in zip(fnames, labels):
    graph = ig.read(r'graphml' + fname + r'.graphml')
    graph.es.select(weight=0).delete()
    plot_hists(
    array(graph.degree()), 
    use_logx=True, 
    fname=r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\__Thesis__\Figures\Degree co ocurrence rule' \
    + fname \
    + '.png',
    title=label)

In [None]:
# Load the sentence-significance graph.
# NOTE(review): hard-coded absolute path on one user's machine.
graph = ig.read(r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\11 Timeseries\output\person\graphml\01 sent sig.graphml')

In [16]:
# Set of entity names; presumably entities to exclude from later analysis — TODO confirm,
# it is not referenced in the visible part of the notebook.
stop_ents = {'Donald Trump', 'Spot Development', 'United States', 'Ramadan'}

In [None]:
# Unweighted eigenvector centrality of every vertex, and the vertex ids in the same order.
eigen = array(graph.evcent())
ents = array([i['id'] for i in list(graph.vs)])

In [None]:
# Map each vertex id to its eigenvector centrality.
key = {k: v for k, v in zip(ents, eigen)}

In [None]:
# Load the date-keyed sentence-significance data.
# NOTE(review): hard-coded absolute path on one user's machine.
data = load_pickle(
r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\11 Timeseries\output\person\graphs\01 sent sig dic.pkl')

In [None]:
# Drop the references to the loaded data and graph.
del data
del graph

#### The below code is for thresholding / removing trends

In [17]:
def load_graph_data(o):
    """Load the pickled graph data into the pipeline state.

    Reads the file at ``o['graph_data_path']`` (path normalised), stores
    the unpickled object under ``o['input_data']`` and returns ``o``.
    """
    source = os.path.normpath(o['graph_data_path'])
    with open(source, 'rb') as stream:
        o['input_data'] = pickle.load(stream)
    return o

In [18]:
def harvest_unique_entities(o):
    """Collect every distinct node name across all dates.

    Iterates ``o['input_data'][date]['nodes']`` for every date and stores
    the distinct names as a list under ``o['entity_set']``. A tqdm bar
    tracks progress over the dates. Returns ``o``.
    """
    print('Harvesting nodes...')
    o['entity_set'] = set()
    progress = tqdm(range(len(o['input_data'])))
    for day in o['input_data'].values():
        o['entity_set'].update(day['nodes'])
        progress.update(1)
    o['entity_set'] = list(o['entity_set'])
    return o

In [19]:
def create_entity_key(o):
    """Store under ``o['ent_key']`` a map from each entity to its position
    in ``o['entity_set']``, and return ``o``."""
    lookup = {}
    for position, entity in enumerate(o['entity_set']):
        lookup[entity] = position
    o['ent_key'] = lookup
    return o

In [20]:
def create_timeseries(o):
    """Build the entity-by-date array of node values.

    Reads ``o['input_data']`` (date -> ``{'nodes': {entity: value}}``),
    ``o['entity_set']`` and ``o['ent_key']``. Writes:

    * ``o['dates']`` -- the dates in sorted order;
    * ``o['arr']``   -- array of shape ``(n_entities, n_dates)`` where
      ``arr[ent_key[e], j]`` is the value of entity ``e`` on ``dates[j]``
      (0 where the entity is absent).

    ``o['input_data']`` is deleted afterwards to free memory. Returns ``o``.

    The number of columns follows the number of dates rather than a fixed
    365, so shorter or longer date ranges work and ``dates`` always lines
    up with the array columns.
    """
    print('Building 2d array...')
    o['dates'] = sorted(list(o['input_data'].keys()))
    n_days = len(o['dates'])
    o['arr'] = zeros((len(o['entity_set']), n_days))
    for i in tqdm(range(n_days)):
        day_nodes = o['input_data'][o['dates'][i]]['nodes']
        for ent in day_nodes:
            o['arr'][o['ent_key'][ent], i] = day_nodes[ent]
    del o['input_data']
    return o

In [21]:
def change(o):
    """Replace ``o['arr']`` by its absolute change over ``o['change_width']`` columns.

    The result has ``change_width`` fewer columns: column ``j`` holds
    ``|arr[:, j + change_width] - arr[:, j]|``. Returns ``o``.
    """
    print('Calculating absolute change over time...')
    lag = o['change_width']
    series = o['arr']
    o['arr'] = abs(series[:, lag:] - series[:, :-lag])
    return o

In [22]:
def rolling_window(o):
    """Create a sliding-window view over the last axis of ``o['arr']``.

    Stores under ``o['rolling_window']`` a strided view whose extra last
    axis has length ``o['window_size']``; the previous last axis shrinks
    to ``n - window_size + 1``. Returns ``o``.
    """
    print('Applying rolling functions...')
    series = o['arr']
    width = o['window_size']
    window_shape = series.shape[:-1] + (series.shape[-1] - width + 1, width)
    # Repeating the last stride makes consecutive windows overlap by one step.
    window_strides = series.strides + (series.strides[-1],)
    o['rolling_window'] = np.lib.stride_tricks.as_strided(
        series, shape=window_shape, strides=window_strides)
    return o

In [23]:
def map_functions(o):
    """Compute the mean and standard deviation of every rolling window.

    Reduces ``o['rolling_window']`` over its last axis and stores the
    results under ``o['xbar']`` and ``o['sigma']``. Returns ``o``.
    """
    windows = o['rolling_window']
    o['xbar'] = mean(windows, axis=-1)
    o['sigma'] = std(windows, axis=-1)
    return o

In [24]:
def apply_threshold(o):
    """Keep only deviations from the rolling mean that exceed the threshold.

    ``o['output']`` becomes ``|arr - xbar|`` with every entry that is
    ``<= sigma * threshold`` set to 0. Returns ``o``.
    """
    print('Applying threshold...')
    o['output'] = deviation = abs(o['arr'] - o['xbar'])
    cutoff = o['sigma'] * o['threshold']
    deviation[deviation <= cutoff] = 0.0
    return o

In [25]:
def offset_dates(o):
    """Drop the first ``window_size - 1`` entries of ``o['dates']`` and return ``o``."""
    first_kept = o['window_size'] - 1
    o['dates'] = o['dates'][first_kept:]
    return o

def offset_timeseries(o):
    """Drop the first ``window_size - 1`` columns of ``o['arr']`` and return ``o``."""
    first_kept = o['window_size'] - 1
    o['arr'] = o['arr'][:, first_kept:]
    return o

In [26]:
def invert_ent_key(o):
    """Add the reverse mapping (index -> entity) to ``o['ent_key']``.

    The existing entity -> index entries are kept, so afterwards the dict
    can be looked up in both directions. Returns ``o``.
    """
    mapping = o['ent_key']
    # Iterate over a snapshot of the keys because the dict grows in the loop.
    for entity in tuple(mapping):
        mapping[mapping[entity]] = entity
    return o

In [27]:
def harvest_peaking_entities(o):
    """Record, for every date, the entities with the largest thresholded value.

    NOTE: this function is defined again in the next cell; the later
    definition is the one in effect after both cells have run.

    Reads ``o['output']`` (entities x dates), ``o['dates']`` and
    ``o['ent_key']`` (must map row index -> entity). ``o['peak_depth']``
    (default 1) is the number of top entities collected per date; only
    entries with a value above 0 are recorded. The result is stored under
    ``o['peaking_entities']`` as ``{date: [entity, ...]}``. Returns ``o``.
    """
    temp = o['output'].copy()
    o['peak_depth'] = o.get('peak_depth', 1)
    o['peaking_entities'] = {date: [] for date in o['dates']}
    print('Harvesting peaking entities...')
    for depth in range(1, o['peak_depth'] + 1):
        print('Peak depth: %d' % depth)
        peaking_idx = argmax(temp, axis=0)
        for j in tqdm(range(len(o['dates']))):
            if temp[peaking_idx[j], j] > 0.0:
                o['peaking_entities'][o['dates'][j]].append(o['ent_key'][peaking_idx[j]])
        # Clear only the (entity, date) cell that was the maximum of each
        # column, so the next pass finds the next-largest value per date.
        # Indexing with `peaking_idx` alone would zero whole entity rows.
        temp[peaking_idx, arange(temp.shape[1])] = 0.0
    return o

In [28]:
def harvest_peaking_entities(o):
    """Record, for every date, the entities with the largest thresholded value.

    Reads ``o['output']`` (entities x dates), ``o['dates']`` and
    ``o['ent_key']`` (must map row index -> entity). ``o['peak_depth']``
    (default 1) is the number of top entities collected per date; only
    entries with a value above 0 are recorded. The result is stored under
    ``o['peaking_entities']`` as ``{date: [entity, ...]}``. Returns ``o``.
    """
    temp = o['output'].copy()
    o['peak_depth'] = o.get('peak_depth', 1)
    o['peaking_entities'] = {date: [] for date in o['dates']}
    print('Harvesting peaking entities...')
    for depth in range(1, o['peak_depth'] + 1):
        print('Peak depth: %d' % depth)
        peaking_idx = argmax(temp, axis=0)
        for j in tqdm(range(len(o['dates']))):
            if temp[peaking_idx[j], j] > 0.0:
                o['peaking_entities'][o['dates'][j]].append(o['ent_key'][peaking_idx[j]])
        # Clear only the (entity, date) cell that was the maximum of each
        # column, so the next pass finds the next-largest value per date.
        # Indexing with `peaking_idx` alone would zero whole entity rows.
        temp[peaking_idx, arange(temp.shape[1])] = 0.0
    return o

def prune_documents(o):
    """Keep only documents that mention a peaking entity on their own date.

    Loads the pickled document list from ``o['document_data_path']``, keeps each
    document whose ``'datetime'`` is a key of ``o['peaking_entities']`` and whose
    ``'nodes'`` contains at least one entity listed for that date, and pickles
    the surviving documents to ``o['where_to_save']``.  Returns ``o`` unchanged.
    """
    with open(os.path.normpath(o['document_data_path']), 'rb') as open_file:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        data = pickle.load(open_file)
    print('Pruning documents...')
    peaking = o['peaking_entities']
    pruned_data = []
    # Iterating through tqdm (instead of updating a bar by hand) lets the bar
    # finish and close itself before the 'Saving...' message is printed.
    for doc in tqdm(data):
        for ent in peaking.get(doc['datetime'], ()):
            if ent in doc['nodes']:
                pruned_data.append(doc)
                break
    print('Saving...')
    with open(os.path.normpath(o['where_to_save']), 'wb') as open_file:
        pickle.dump(pruned_data, open_file)
    print('Done.')
    return o

#### Apply threshold to graph 

In [29]:
# Passed to the pipeline as o['graph_data_path'] (presumably read by load_graph_data).
graph_data_path = r'output\all\graphs\03 doc sent sig dic.pkl'
# Pickled document list that prune_documents reads via o['document_data_path'].
document_data_path = r'output\all\graphs\03 doc sent sig list.pkl'
# Destination of the pruned document list written by prune_documents.
# NOTE(review): absolute, machine-specific Windows path; adjust before re-running elsewhere.
where_to_save = r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\08 Network\data\wiki doc sent sig 1.9.pkl'

In [31]:
# Threshold-only pipeline: use it to inspect the effect of different thresholds.
# compose() applies its arguments right to left, so the stages are listed
# last-to-first (load_graph_data runs first, apply_threshold runs last).
mini_pipeline = compose(
    apply_threshold,
    offset_dates,
    offset_timeseries,
    map_functions,
    rolling_window,
    create_timeseries,
    create_entity_key,
    harvest_unique_entities,
    load_graph_data,
)

# Full pipeline: the thresholding stages above, followed by inverting the entity
# key, harvesting the peaking entities and pruning (and saving) the documents.
full_pipeline = compose(
    prune_documents,
    harvest_peaking_entities,
    invert_ent_key,
    mini_pipeline,
)


In [32]:
o = full_pipeline(dict(
    graph_data_path=graph_data_path,
    document_data_path=document_data_path,
    where_to_save=where_to_save,
    window_size=7,  # Length of the rolling window
    threshold=1.9,  # In standard deviations; beyond ~2 sigma lies < 5% of a normal distribution
    peak_depth=1,   # Number of top entities harvested per date
))

Harvesting nodes...


100%|███████████████████████████████████████████████████████████████████████████████| 365/365 [00:00<00:00, 943.03it/s]


Building 2d array...


100%|███████████████████████████████████████████████████████████████████████████████| 365/365 [00:01<00:00, 337.20it/s]


Applying rolling functions...
Applying threshold...
Harvesting peaking entities...
Peak depth: 1


100%|████████████████████████████████████████████████████████████████████████████| 359/359 [00:00<00:00, 179598.66it/s]
  0%|                                                                                        | 0/59575 [00:00<?, ?it/s]

Pruning documents...


 56%|████████████████████████████████████████▍                               | 33497/59575 [00:00<00:00, 332398.65it/s]

Saving...


100%|████████████████████████████████████████████████████████████████████████| 59575/59575 [00:22<00:00, 332398.65it/s]

Done.


In [33]:
# del o

In [35]:
def plot_entity_timeseries(o, entity, *args, **kwargs):
    """Plot one or more max-normalised series of ``entity`` against ``o['dates']``.

    Each name in ``args`` is a key of ``o`` holding a 2-D array; the row picked
    by ``o['ent_key'][entity]`` is divided by its own maximum and plotted.

    Recognised keyword arguments: ``xformat`` (date format of the x ticks,
    default ``'%b'``), ``style`` (matplotlib style, default ``'seaborn'``),
    ``xlabel`` and ``ylabel``.
    """
    xformat = kwargs.get('xformat', '%b')
    style.use(kwargs.get('style', 'seaborn'))
    figure(figsize=(10, 6))

    for arg in args:
        series = o[arg][o['ent_key'][entity]]
        plot(o['dates'], series / max(series))

    ax = gca()
    setp(ax.xaxis.get_majorticklabels(), rotation=45)
    ax.xaxis.set_major_formatter(DateFormatter(xformat))
    for which in ('major', 'minor'):
        ax.tick_params(axis='both', which=which, labelsize=15)
    ax.set_xlabel(kwargs.get('xlabel', 'Date (2017)'), fontsize=19)
    ax.set_ylabel(kwargs.get('ylabel', 'Weighted Degree'), fontsize=19)
    show()

In [36]:
def create_date_range(start, end):
    """Return the consecutive ``datetime.date`` objects in ``[start, end)``.

    ``start`` and ``end`` are strings in ``'%d %b %Y'`` format (for example
    ``'01 Jan 2017'``).  The end date is excluded; an empty list is returned
    when ``start`` is not earlier than ``end``.
    """
    fmt = '%d %b %Y'
    first = dt.strptime(start, fmt).date()
    last = dt.strptime(end, fmt).date()
    n_days = (last - first).days
    return [first + timedelta(days=offset) for offset in range(n_days)]

In [37]:
def create_gtd_timeseries(data, key, val, start='01 Jan 2017', end='01 Jan 2018'):
    """Count, per day, the events in ``data`` whose ``key`` field equals ``val``.

    ``data`` maps a date to an iterable of event records (indexable by ``key``).
    Returns a 1-D array with one slot per day of ``create_date_range(start, end)``
    (end excluded).  Dates in ``data`` that fall outside that range are ignored
    instead of raising ``KeyError``.
    """
    dt_range = create_date_range(start, end)
    dt_key = {date: i for i, date in enumerate(dt_range)}
    output = zeros(len(dt_key))
    for date in data:
        slot = dt_key.get(date)
        if slot is None:
            # Event date outside [start, end): nothing to count here.
            continue
        for e in data[date]:
            if e[key] == val:
                output[slot] += 1
    return output

In [38]:

def gtd_comparison_entity_timeseries(o, tag, gtd_ent, ent, *args, **kwargs):
    """Plot an entity's thresholded series against a daily GTD event count.

    The left axis shows ``o['output'][o['ent_key'][ent]]`` over ``o['dates']``;
    the right (twin) axis shows, per day, the number of GTD events whose ``tag``
    field equals ``gtd_ent`` (see ``create_gtd_timeseries``).

    Positional ``args`` are accepted but not used.  Recognised keyword
    arguments: ``xformat``, ``style``, ``xlabel``, ``y1label``, ``y2label``,
    ``start``/``end`` (date range of the GTD series, ``'%d %b %Y'`` strings) and
    ``gtd_path`` (pickle holding the GTD data; defaults to the author's copy).
    """
    xformat = kwargs.get('xformat', '%b')
    style.use(kwargs.get('style', 'default'))
    figure(figsize=(10, 6))
    plot(o['dates'], o['output'][o['ent_key'][ent]], c='k', label='weighted degree')
    ax1 = gca()
    setp(ax1.xaxis.get_majorticklabels(), rotation=45)

    ax1.tick_params(axis='both', which='major', labelsize=15)
    ax1.tick_params(axis='both', which='minor', labelsize=15)
    ax1.set_xlabel(kwargs.get('xlabel', 'Date (2017)'), fontsize=19)
    ax1.set_ylabel(kwargs.get('y1label', 'Weighted degree'), fontsize=19)
    ax1.yaxis.label.set_color('k')

    # The GTD location stays overridable so the function is not tied to one machine.
    gtd_path = kwargs.get(
        'gtd_path',
        r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\__Data__\GTD\CritIII all regions GTD 1.pkl')
    data = load_pickle(gtd_path)

    start = kwargs.get('start', '01 Jan 2017')
    end = kwargs.get('end', '01 Jan 2018')

    gtd_timeseries = create_gtd_timeseries(data, tag, gtd_ent, start, end)
    dt_range = create_date_range(start, end)

    ax2 = gca().twinx()
    ax2.plot(dt_range, gtd_timeseries, c='r', alpha=0.5)
    ax2.set_ylabel(kwargs.get('y2label', 'No. of terror events'), fontsize=19)
    ax2.yaxis.set_label_coords(1.05, 0.5)
    ax2.tick_params(axis='y', which='major', labelsize=15)
    ax2.locator_params(nbins=2)
    ax2.yaxis.label.set_color('r')
    ax2.xaxis.set_major_formatter(DateFormatter(xformat))
    gcf().subplots_adjust(bottom=0.15)
    show()

#ax.tick_params(axis='both', which='major', labelsize=15)
#ax.tick_params(axis='both', which='minor', labelsize=15)
# gca().set_xlabel('Date (2017)', fontsize=12)
# gca().yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
#gca().locator_params(nbins=4)
# show()

In [39]:
# Compare the entity 'London' with GTD events whose 'city' field is 'London'.
gtd_comparison_entity_timeseries(o, 'city', 'London', 'London')


To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()


Loading pickled data...




In [40]:
# Compare the entity 'Kabul' with GTD events whose 'city' field is 'Kabul'.
gtd_comparison_entity_timeseries(o, 'city', 'Kabul', 'Kabul')

Loading pickled data...




In [41]:
# Output graphml file; `f` is reused further down as the input of get_tops.
# NOTE(review): absolute, machine-specific Windows path.
f = r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\__Thesis__\Figures\Networks\london bridge.graphml'

# Build a graph from the pruned documents between the two given dates and
# export it as graphml.
save_as_graphml(
    f,
    build_graph(
    aggregate_list_nodes_and_edges(
        datetime_filter(
            load_pickle(where_to_save),
            '03 Jun 2017', '04 Jun 2017')), use_node_weight=False))

Loading pickled data...
Building graph...



  0%|                                                                                         | 0/1666 [00:00<?, ?it/s]
100%|███████████████████████████████████████████████████████████████████████████| 1666/1666 [00:00<00:00, 89706.79it/s]

In [42]:
def create_key(partition):
    """Invert an ``{entity: cluster}`` mapping into ``{cluster: [entities]}``.

    Entities keep the order in which they appear in ``partition``.
    """
    key = {}
    for ent, p in partition.items():
        key.setdefault(p, []).append(ent)
    return key

def remove_stop_ents(igraph, stop_ents):
    """Delete every vertex whose ``'id'`` attribute is in ``stop_ents``.

    The graph is modified in place and also returned.  ``stop_ents`` may be
    ``None`` (the default used by the callers), in which case nothing is
    removed.
    """
    if stop_ents is None:
        return igraph

    # Collect all indices first and delete them in one call, so the deletion
    # cannot be affected by vertex indices shifting between removals.
    doomed = [v.index for v in igraph.vs if v['id'] in stop_ents]
    if doomed:
        igraph.delete_vertices(doomed)

    return igraph
    
    

In [43]:
def eigen_partition(f,
                    use_weights=False,
                    stop_ents=None):
    """Read a graph file and partition it with the leading-eigenvector method.

    Parameters
    ----------
    f : path of the graph file passed to ``ig.read``.
    use_weights : when true, the ``'weight'`` edge attribute is handed to the
        community detection; otherwise edges are unweighted.
    stop_ents : optional collection of vertex ids removed before partitioning.

    Returns
    -------
    ``(igraph, partition)`` where ``partition`` maps each vertex ``'id'`` to
    the index of its cluster.
    """
    igraph = ig.read(f)

    # Nothing to strip when no stop entities were supplied (the default).
    if stop_ents:
        igraph = remove_stop_ents(igraph, stop_ents)

    weights = igraph.es['weight'] if use_weights else None

    temp = igraph.community_leading_eigenvector(weights=weights)

    # Fetch the id list once; igraph.vs['id'] builds a new list on every access.
    ids = igraph.vs['id']

    partition = {}
    for cluster, lis in enumerate(temp):
        for idx in lis:
            partition[ids[idx]] = cluster

    return igraph, partition
    

def get_tops(f,
             n,
             n_best,
             stop_ents=None,
             use_part_weights=False,
             use_cent_weights=False,
             ):
    """List the most central entities of the largest communities in a graph file.

    Parameters
    ----------
    f : graph file handed to ``eigen_partition``.
    n : number of entities picked per community (fewer if the community is smaller).
    n_best : number of communities returned, largest first.
    stop_ents : optional vertex ids removed before partitioning.
    use_part_weights : use edge weights for the community detection.
    use_cent_weights : use edge weights for the eigenvector centrality.

    Returns
    -------
    A list of ``(share, entities)`` tuples, where ``share`` is the community's
    percentage of all partitioned vertices and ``entities`` are its members in
    decreasing order of eigenvector centrality.
    """
    igraph, partition = eigen_partition(f, use_weights=use_part_weights, stop_ents=stop_ents)

    counts = Counter(partition.values())

    # list(...) keeps this correct even if `sum` is numpy's sum (from %pylab).
    sum_ = sum(list(counts.values()))

    counts = {k: v / sum_ * 100 for k, v in counts.items()}.items()

    key = create_key(partition)

    weights = igraph.es['weight'] if use_cent_weights else None

    # Centrality is computed exactly once, on the full graph.
    lookup = {k['id']: e for k, e in zip(igraph.vs, igraph.evcent(weights=weights))}

    maxi = {}

    # Selection by repeated extraction of the current maximum of each community.
    # Picked entities are only removed from `key`; the scores in `lookup` are
    # fixed, so deleting the vertex from the graph would not change the result.
    for _ in range(n):
        for p in key:
            if key[p]:
                m = key[p][0]
                for ent in key[p][1:]:
                    if ent in lookup:
                        if lookup[ent] > lookup[m]:
                            m = ent
                maxi.setdefault(p, []).append(m)
                key[p].remove(m)

    best = list(sorted(counts, key=lambda x: x[1], reverse=True))

    # .get guards the n == 0 case, where no entity is picked for any community.
    return [(i[1], maxi.get(i[0], [])) for i in best[:n_best]]


In [44]:
# NOTE(review): `stop_ents` is defined in the cell labelled In [3] at the very
# end of the notebook; that cell must be run before this one.
tops = get_tops(f, 10, 10, use_part_weights=True, use_cent_weights=True, stop_ents=stop_ents)

In [45]:
# One line per community: its share (%) followed by its top entities.
for i in tops:
    print(i[0], *i[1], sep=', ')

26.21359223300971, London Ambulance Service, Bridge, Mark Rowley, Bridge Tube, Borough Market, Metropolitan Police, British Transport Police, Westminster, Metropolitan Police Service, Sadiq Khan
18.446601941747574, Emmanuel Macron, Us Central Intelligence Agency, Iran, Wall Street Journal, Ayatollah Ruhollah Khomeini, Ayatollah Khomeini, Michael D'andrea, Saudi Arabia, Fethullah Gulen, CIA
17.475728155339805, ISIL, London, Al Qaida, Ariana Grande, Osama Bin Laden, Omar Mateen, Jose Aznar, George W Bush, Middle East, Paris
16.50485436893204, Iraq, Mosul, Syria, Raqqa, Syrian Observatory For Human Rights, Us Backed, SDF, United Nations, Cihan Sheikh Ehmed, Aamaq
12.62135922330097, Narendra Modi, North Atlantic Treaty Organization, Kabul, Jens Stoltenberg, Muhammad Ashraf Ghani, Jalaluddin Haqqani, Rahmatullah Nabil, Anas Haqqani, Ataollah Khogiani, Nangarhar
5.825242718446602, Theresa May, West, Khalid Masood, Midlands, Conservative Party, Labour Party
2.912621359223301, Manchester, Engl

In [46]:
def netx_get_tops(f,
             n,
             n_best,
             start, end,
             stop_ents=None,
             use_cent_weights=False,
             ):
    """List the strongest entities of the largest Louvain communities.

    Parameters
    ----------
    f : pickle of documents from which the graph is built.
    n : number of entities picked per community (fewer if the community is smaller).
    n_best : number of communities returned, largest first.
    start, end : date strings handed to ``datetime_filter``.
    stop_ents : optional node ids removed from the graph before partitioning.
    use_cent_weights : use edge weights when computing vertex strength.

    Returns
    -------
    A list of ``(share, entities)`` tuples, where ``share`` is the community's
    percentage of all partitioned nodes and ``entities`` are its members in
    decreasing order of strength.

    Side effect: the graph is written to ``temp\\temp.graphml`` so that igraph
    can read it back.
    """
    # Load the documents from `f`; the argument used to be ignored in favour of
    # the notebook-level `where_to_save`.
    netx = build_graph(
            aggregate_list_nodes_and_edges(
                datetime_filter(
                    load_pickle(f),
                    start, end)), use_node_weight=False)

    # remove_nodes_from cannot iterate over None, which is the default.
    if stop_ents:
        netx.remove_nodes_from(stop_ents)

    save_as_graphml(r'temp\temp.graphml', netx)

    partition = community.best_partition(netx)

    del netx

    igraph = ig.read(r'temp\temp.graphml')

    counts = Counter(partition.values())

    # list(...) keeps this correct even if `sum` is numpy's sum (from %pylab).
    sum_ = sum(list(counts.values()))

    counts = {k: v / sum_ * 100 for k, v in counts.items()}.items()

    key = create_key(partition)

    weights = igraph.es['weight'] if use_cent_weights else None

    lookup = {k['id']: e for k, e in zip(igraph.vs, igraph.strength(weights=weights))}

    maxi = {}

    # Selection by repeated extraction of the current maximum of each community.
    # Picked entities are only removed from `key`; the scores in `lookup` are
    # fixed, so deleting the vertex from the graph would not change the result.
    for _ in range(n):
        for p in key:
            if key[p]:
                m = key[p][0]
                for ent in key[p][1:]:
                    if ent in lookup:
                        if lookup[ent] > lookup[m]:
                            m = ent
                maxi.setdefault(p, []).append(m)
                key[p].remove(m)

    best = list(sorted(counts, key=lambda x: x[1], reverse=True))

    # .get guards the n == 0 case, where no entity is picked for any community.
    return [(i[1], maxi.get(i[0], [])) for i in best[:n_best]]


In [47]:
# Date strings passed on to netx_get_tops.
start, end = '03 Jun 2017', '04 Jun 2017'

# NOTE(review): `stop_ents` is defined in the cell labelled In [3] at the very
# end of the notebook; that cell must be run before this one.
tops = netx_get_tops(where_to_save, 10, 10, start, end, stop_ents=stop_ents, use_cent_weights=True)
# One line per community: its share (%) followed by its top entities.
for i in tops:
    print(i[0], *(j for j in i[1]), sep=', ')
    

Loading pickled data...
Building graph...



  0%|                                                                                         | 0/1666 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████| 1666/1666 [00:00<00:00, 111880.34it/s]

29.126213592233007, London Ambulance Service, Theresa May, Borough Market, Bridge, Westminster, Mark Rowley, Metropolitan Police, United Kingdom, Bridge Tube, British Transport Police
23.300970873786408, ISIL, West, Al Qaida, Manchester, Jose Aznar, England, Omar Mateen, Osama Bin Laden, Paris, London
20.388349514563107, Iran, Us Central Intelligence Agency, Kabul, Wall Street Journal, Ayatollah Ruhollah Khomeini, Saudi Arabia, Fethullah Gulen, Turkey, Muhammad Ashraf Ghani, CIA
15.53398058252427, Raqqa, Syria, Syrian Observatory For Human Rights, Iraq, United Nations, SDF, Us Backed, Mosul, Cihan Sheikh Ehmed, Euphrates
11.650485436893204, Narendra Modi, North Atlantic Treaty Organization, Jalaluddin Haqqani, Emmanuel Macron, Rahmatullah Nabil, Anas Haqqani, Jens Stoltenberg, Ataollah Khogiani, Nangarhar, Jalalabad


In [37]:
def eigen_cluster(fname, start='31 dec 2016', end='01 Jan 2018', use_weights=False):
    """Build a graph from pickled documents and label nodes with their cluster.

    Parameters
    ----------
    fname : pickle of documents from which the graph is built.
    start, end : date strings handed to ``datetime_filter``.
    use_weights : use the ``'weight'`` edge attribute for community detection.

    Returns
    -------
    The NetworkX graph read back from ``temp\\temp.graphml``, with a
    ``'partition'`` attribute (leading-eigenvector cluster index) on every
    partitioned node.

    Side effect: writes ``temp\\temp.graphml``.
    """
    netx = build_graph(
        aggregate_list_nodes_and_edges(
            datetime_filter(
                load_pickle(fname),
                start, end)), use_node_weight=False)

    save_as_graphml(r'temp\temp.graphml', netx)

    del netx

    igraph = ig.read(r'temp\temp.graphml')

    weights = igraph.es['weight'] if use_weights else None

    partition = igraph.community_leading_eigenvector(weights=weights)

    # Fetch the id list once; igraph.vs['id'] builds a new list on every access.
    ids = igraph.vs['id']

    key = {}
    for cluster, lis in enumerate(partition):
        for ent in lis:
            key[ids[ent]] = cluster

    del igraph

    netx = nx.read_graphml(r'temp\temp.graphml')

    # `Graph.node` was removed in NetworkX 2.4; fall back to `Graph.nodes` there
    # while still supporting releases that only offer the `node` dict.
    node_attrs = netx.node if hasattr(netx, 'node') else netx.nodes

    for ent, cluster in key.items():
        node_attrs[ent]['partition'] = cluster

    return netx
    

In [89]:
# Drop the pipeline state from the kernel namespace.
# NOTE(review): cells above that plot from `o` cannot be re-run after this
# without re-running the pipeline.
del o

In [53]:
def gtd_doc_count_comparison(data,
                             use_rolling_mean=False,
                             window_size=5,
                             start='01 Jan 2017',
                             end='01 Jan 2018',
                             **kwargs):
    """Plot daily document counts against daily GTD event counts.

    Parameters
    ----------
    data : iterable of documents, each with a ``'datetime'`` entry exposing
        ``.date()``; documents dated outside ``[start, end)`` are ignored.
    use_rolling_mean : smooth both series with a rolling mean.
    window_size : width of the rolling window (only used when smoothing).
    start, end : ``'%d %b %Y'`` date strings delimiting the plotted range
        (end excluded).
    **kwargs : ``xformat``, ``style``, ``xlabel``, ``y1label``, ``y2label`` and
        ``gtd_path`` (pickle holding the GTD data; defaults to the author's copy).

    The left axis shows the number of documents per day, the right (twin) axis
    the number of GTD events per day.
    """

    def rolling_mean(arr, window_size=window_size):
        # Strided view of shape (..., n - window_size + 1, window_size), averaged per window.
        shape = arr.shape[:-1] + (arr.shape[-1] - window_size + 1, window_size)
        strides = arr.strides + (arr.strides[-1],)
        return mean(np.lib.stride_tricks.as_strided(arr, shape=shape, strides=strides), axis=1)

    xformat = kwargs.get('xformat', '%b')
    style.use(kwargs.get('style', 'default'))

    figure(figsize=(10, 6))

    dt_range = create_date_range(start, end)
    dt_key = {date: i for i, date in enumerate(dt_range)}

    data_timeseries = zeros(len(dt_range))

    for doc in data:
        day = doc['datetime'].date()
        if day in dt_key:
            data_timeseries[dt_key[day]] += 1

    if use_rolling_mean:
        data_timeseries = rolling_mean(data_timeseries)
        # The smoothed series is shorter; align it with the end of each window.
        offset = window_size - 1
    else:
        offset = 0

    plot(dt_range[offset:], data_timeseries, c='k', label='No. of documents')

    ax1 = gca()
    setp(ax1.xaxis.get_majorticklabels(), rotation=45)

    ax1.tick_params(axis='both', which='major', labelsize=15)
    ax1.tick_params(axis='both', which='minor', labelsize=15)
    ax1.set_xlabel(kwargs.get('xlabel', 'Date (2017)'), fontsize=19)
    ax1.set_ylabel(kwargs.get('y1label', 'No. of documents'), fontsize=19)
    ax1.yaxis.label.set_color('k')

    # The GTD location stays overridable so the function is not tied to one machine.
    gtd_path = kwargs.get(
        'gtd_path',
        r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\__Data__\GTD\CritIII all regions GTD 1.pkl')
    gtd_data = load_pickle(gtd_path)

    gtd_timeseries = zeros(len(dt_range))

    for date in dt_range:
        # A day without any recorded event counts as zero instead of raising KeyError.
        gtd_timeseries[dt_key[date]] += len(gtd_data.get(date, ()))

    if use_rolling_mean:
        gtd_timeseries = rolling_mean(gtd_timeseries)

    ax2 = gca().twinx()
    ax2.plot(dt_range[offset:], gtd_timeseries, c='r')
    ax2.set_ylabel(kwargs.get('y2label', 'No. of terror events'), fontsize=19)
    ax2.yaxis.set_label_coords(1.05, 0.5)
    ax2.tick_params(axis='y', which='major', labelsize=15)
    ax2.locator_params(nbins=2)
    ax2.yaxis.label.set_color('r')
    ax2.xaxis.set_major_formatter(DateFormatter(xformat))
    ax2.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
    gcf().subplots_adjust(bottom=0.15)
    #legend()
    show()

In [54]:
# Document list used for the document-count plots below.
data = load_pickle(r'data\wiki.pkl')

Loading pickled data...


In [55]:
# Raw daily counts for the documents loaded into `data`.
gtd_doc_count_comparison(data)

Loading pickled data...




In [56]:
# Same comparison, smoothed with a 14-day rolling mean.
gtd_doc_count_comparison(data, use_rolling_mean=True, window_size=14)

Loading pickled data...




In [49]:
# Pruned documents (written by prune_documents), smoothed with a 15-day rolling mean.
gtd_doc_count_comparison(load_pickle(where_to_save), use_rolling_mean=True, window_size=15)

Loading pickled data...
Loading pickled data...




In [50]:
# Pruned documents (written by prune_documents), raw daily counts.
gtd_doc_count_comparison(load_pickle(where_to_save))

Loading pickled data...
Loading pickled data...




In [51]:
# Rank vertices by eigenvector centrality (unweighted and weighted) and export
# the top 50 of each ranking to CSV.
# NOTE(review): the recorded output of this cell is a FileNotFoundError for the
# absolute, machine-specific graphml path below.
graph = ig.read(r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\11 Timeseries\output\person\graphml\01 sent sig.graphml')
unweighted_eigen = array(graph.evcent())
# Negating the scores makes argsort return indices in descending score order.
idx = argsort(-unweighted_eigen)
unweighted_eigen = unweighted_eigen[idx]
unweighted_ents = array([i['id'] for i in list(graph.vs)])[idx]

weighted_eigen = array(graph.evcent(weights=graph.es['weight']))
idx = argsort(-weighted_eigen)
weighted_eigen = weighted_eigen[idx]
weighted_ents = array([i['id'] for i in list(graph.vs)])[idx]

# df1: unweighted ranking only; df3: unweighted and weighted rankings side by side.
df1 = pd.DataFrame([unweighted_ents[: 50], unweighted_eigen[: 50]])
df3 = pd.DataFrame([unweighted_ents[: 50], unweighted_eigen[: 50], weighted_ents[: 50], weighted_eigen[: 50]])
df1.T.to_csv(r'temp\t501.csv')
df3.T.to_csv(r'temp\t503.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Simon\\OneDrive - University of Exeter\\__Project__\\11 Timeseries\\output\\person\\graphml\\01 sent sig.graphml'

In [6]:
# NOTE(review): rebinds `data`, which another cell also assigns from
# 'data\wiki.pkl'; the value in the kernel depends on execution order.
# Absolute, machine-specific Windows path.
data = load_pickle(r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\__Data__\07 Network\Resolved 4.pkl')

Loading pickled data...


In [3]:
# Entities passed as `stop_ents` to get_tops / netx_get_tops.
# NOTE(review): this cell sits at the end of the notebook but is needed by the
# get_tops and netx_get_tops calls above; run it first (or move it up).
stop_ents = {'Donald Trump', 'Ramadan', 'United States'}