In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import import_ipynb
from louvain_ascendency import*
from mixer_ascendency import*



importing Jupyter notebook from louvain_ascendency.ipynb
importing Jupyter notebook from mixer_ascendency.ipynb


In [24]:
class graph_analyser():
    """Community-level analysis of one token-transfer graph stored in Neo4j.

    The class issues Cypher / GDS procedure calls through ``conn.query`` and
    converts the returned records into pandas DataFrames or numpy adjacency
    matrices. Every query targets ``Address`` nodes connected by
    ``TRANSFERED_<graph_name>`` relationships.

    NOTE(review): addresses, ids and the graph name are interpolated directly
    into the Cypher text. If ``conn.query`` accepts query parameters, prefer
    them for externally supplied values -- confirm against the connection
    wrapper.
    """

    def __init__(self, conn, graph_name):
        """Create (or re-create) the named GDS projection and load the full graph.

        Parameters
        ----------
        conn : object
            Connection wrapper exposing ``query(cypher_string)`` that returns an
            iterable of records convertible with ``dict(record)``.
        graph_name : str
            Token name; used as the GDS graph name and as the suffix of the
            ``TRANSFERED_<graph_name>`` relationship type.
        """
        self.conn = conn
        self.addresses_in_large_communities = {} # community id => addresses in community
        self.large_communities = [] # stores ids of large communities
        self.subgraphs = {} # adjacency matrices of subgraphs of all communities
        self.metrics_by_community = {} # dict of dicts of graph state metrics by community
        self.metric_names = ["Average Mutual Information", "Development capacity", "Ascendency", "Redundancy"]
        self.address_to_node_by_subgraph = {} # dicts of address to node in each subgraph
        self.centrality_scores_by_community = {} # community id => degree centrality DataFrame
        self.graph_name = graph_name
        # Only drop a projection that is actually there; dropping a missing
        # graph is an error on the database side.
        if self.check_if_graph_exists_neo4j():
            self.remove_full_graph_from_neo4j()
        self.create_token_graph_neo4j()
        self.get_full_graph_adj_matrix()

    def create_token_graph_neo4j(self):
        """Project ``Address`` nodes and this token's transfers into a named GDS graph."""
        name = self.graph_name
        print(name)

        query_string = '''CALL gds.graph.create(
                            '%s',
                            'Address',
                            {
                                TRANSFER_%s: {
                                    type: 'TRANSFERED_%s',
                                    orientation: 'NATURAL',
                                    properties: 'value'

                                }
                            }
                        )''' %(name, name, name)
        self.conn.query(query_string)

    def get_full_graph_adj_matrix(self):
        """Fetch every transfer edge and store its adjacency matrix.

        The result is kept in ``self.full_graph_adj_matrix``; the address to
        index mapping is stored under the key ``"full"``.
        """
        query_string = '''MATCH (n1:Address)-[r:TRANSFERED_%s]->(n2:Address)
                          RETURN n1.id AS From, n2.id AS To, r.value AS value''' %(self.graph_name)

        full_graph = pd.DataFrame([dict(_) for _ in self.conn.query(query_string)])
        self.full_graph_adj_matrix = self.pandas_edge_list_to_adj_matrix(full_graph, "full")

    def check_if_graph_exists_neo4j(self):
        """Return True when a GDS graph named ``self.graph_name`` exists."""
        query_string = "CALL gds.graph.exists('%s')"%(self.graph_name)
        exists = self.conn.query(query_string)
        response_df = pd.DataFrame([dict(_) for _ in exists])

        return bool(response_df['exists'][0])

    def run_louvain(self, threshold):
        """Run Louvain on the projection and write ``communityId`` to the nodes.

        ``threshold`` is passed as the algorithm's ``tolerance``. It is
        formatted as a float so that fractional values (e.g. ``0.0001``) are
        not truncated to ``0``.
        """
        query_string = '''CALL gds.louvain.write(
                        '%s',
                        {
                            tolerance: %s,
                            writeProperty: 'communityId',
                            relationshipWeightProperty: 'value'
                        }
                    )'''%(self.graph_name, repr(float(threshold)))
        self.conn.query(query_string)

    def create_subgraph_from_community_id(self, community_id):
        """Create a Cypher projection named ``community-<id>`` for one community."""
        query_string = '''CALL gds.graph.create.cypher(
                        'community-%d',
                        'MATCH (n) WHERE n.communityId = %d RETURN id(n) AS id',
                        'MATCH (n)-[r:TRANSFERED_%s]->(m) WHERE n.communityId = %d AND m.communityId = %d RETURN id(n) AS source, id(m) AS target, r.value AS value'
                    ) ''' %(community_id, community_id, self.graph_name, community_id, community_id)
        self.conn.query(query_string)

    def calculate_subgraph_centrality(self, community_id):
        """Return a DataFrame (``address``, ``degreeCentrality``) for ``community-<id>``."""
        query_string = '''CALL gds.degree.stream('community-%d')
                            YIELD nodeId, score
                            RETURN gds.util.asNode(nodeId).id AS address, score AS degreeCentrality
                            ORDER BY degreeCentrality DESC''' %(community_id)
        centrality_scores = pd.DataFrame([dict(_) for _ in self.conn.query(query_string)])

        return centrality_scores

    def get_node_community_id(self, address):
        """Return the ``community`` column (a pandas Series) for nodes whose id is ``address``."""
        query_string = '''MATCH (n) WHERE n.id = "%s" RETURN n.communityId as community''' %(address)
        community_id = pd.DataFrame([dict(_) for _ in self.conn.query(query_string)])
        return community_id["community"]

    def remove_all_community_ids(self):
        """Remove the ``communityId`` property from every ``Address`` node."""
        query_string = '''MATCH (n:Address)
                            REMOVE n.communityId
                            RETURN n '''
        self.conn.query(query_string)

    def calculate_centrality_score_for_every_community(self):
        """Compute degree centrality inside each large community.

        A temporary ``community-<id>`` projection is created per community and
        dropped again even when the centrality query fails, so no projection
        is left behind.
        """
        for community_id in self.large_communities:
            self.create_subgraph_from_community_id(community_id)
            try:
                self.centrality_scores_by_community[community_id] = self.calculate_subgraph_centrality(community_id)
            finally:
                self.remove_graph_from_neo4j(community_id)

    def calculate_degree_centrality_for_all_nodes(self):
        """Store degree centrality of every projected node in ``self.centrality_score_by_node``."""
        query_string = '''CALL gds.degree.stream('%s')
                            YIELD nodeId, score
                            RETURN gds.util.asNode(nodeId).id AS address, score AS degreeCentrality
                            ORDER BY degreeCentrality DESC''' %(self.graph_name)

        self.centrality_score_by_node = pd.DataFrame([dict(_) for _ in self.conn.query(query_string)])

    def calculate_pageRank_centrality_for_all_nodes(self):
        """Store PageRank scores in ``self.pageRank_centrality_score_by_node``.

        The score column is named ``degreeCentrality`` although it holds the
        PageRank score; the name is kept so existing consumers keep working.
        """
        query_string = '''CALL gds.pageRank.stream('%s')
                            YIELD nodeId, score
                            RETURN gds.util.asNode(nodeId).id AS address, score AS degreeCentrality
                            ORDER BY degreeCentrality DESC''' %(self.graph_name)

        self.pageRank_centrality_score_by_node = pd.DataFrame([dict(_) for _ in self.conn.query(query_string)])

    def remove_graph_from_neo4j(self, community_id):
        """Drop the ``community-<id>`` projection."""
        query_string = '''CALL gds.graph.drop('community-%d')''' %(community_id)
        self.conn.query(query_string)

    def remove_full_graph_from_neo4j(self):
        """Drop the projection named ``self.graph_name``."""
        query_string = '''CALL gds.graph.drop('%s')''' %(self.graph_name)
        self.conn.query(query_string)

    def find_large_communities(self, threshold):
        """Collect ids of communities with more than ``threshold`` members.

        The stored list is rebuilt on every call (largest community first), so
        calling this repeatedly does not accumulate duplicates or stale ids.

        Returns
        -------
        list
            ``self.large_communities``.
        """
        query_string = '''MATCH (n:Address)
                        WHERE EXISTS(n.communityId)
                        RETURN n.communityId AS communityId, COUNT(*) AS size
                        ORDER BY size DESC'''

        size_of_communities = pd.DataFrame([dict(_) for _ in self.conn.query(query_string)])

        if size_of_communities.empty:
            # No records means no columns either; nothing can be large.
            self.large_communities = []
        else:
            is_large = size_of_communities['size'] > threshold
            self.large_communities = size_of_communities.loc[is_large, 'communityId'].tolist()
        return self.large_communities

    def get_all_community_addresses(self, community_id):
        """Return the ``id`` of every node whose ``communityId`` equals ``community_id``."""
        query_string = '''MATCH (n)
                          WHERE n.communityId = %s
                          RETURN n.id as id''' %(community_id,)

        addresses_in_community = pd.DataFrame([dict(_) for _ in self.conn.query(query_string)])
        return addresses_in_community['id'].tolist()

    def get_all_addresses_from_large_communities(self):
        """Fill ``self.addresses_in_large_communities`` for every large community."""
        for community in self.large_communities:
            self.addresses_in_large_communities[community] = self.get_all_community_addresses(community)

    def get_node_degree(self, node_address):
        """Return ``apoc.node.degree`` of the ``Address`` node with id ``node_address``."""
        query_string = '''MATCH (n:Address {id: "%s"})
                          RETURN apoc.node.degree(n) AS degree''' %(node_address,)

        degree = pd.DataFrame([dict(_) for _ in self.conn.query(query_string)])

        return degree['degree'][0]

    def get_address_with_highest_degree(self, community_id):
        """Return ``(address, degree)`` of the highest-degree node in a large community.

        Addresses are read from ``self.addresses_in_large_communities``, so
        ``get_all_addresses_from_large_communities`` must have been run first.
        If no address has a degree above zero, ``("0x", 0)`` is returned.
        """
        best_address = "0x"
        highest_degree = 0
        for address in self.addresses_in_large_communities[community_id]:
            node_degree = self.get_node_degree(address)
            if node_degree > highest_degree:
                highest_degree = node_degree
                best_address = address

        return best_address, highest_degree

    def get_subgraph_by_community_id(self, community_id):
        """Return the edge list (``From``, ``To``, ``value``) internal to one community."""
        query_string = '''MATCH (a1:Address)-[r:TRANSFERED_%s]->(a2:Address)
                          WHERE a1.communityId = %s AND a2.communityId = %s
                          RETURN a1.id as From, a2.id as To, r.value as value''' %(
            self.graph_name, community_id, community_id)

        subgraph = pd.DataFrame([dict(_) for _ in self.conn.query(query_string)])

        return subgraph

    def calculate_metrics(self, subgraph_adj_matrix):
        """Return ``(ami, development capacity, ascendency, redundancy)`` for a matrix.

        The four ``measure_*`` helpers are expected to come from the notebook
        modules star-imported at the top of the file.
        """
        ami = measure_average_mutual_information(subgraph_adj_matrix)
        dc = measure_development_capacity(subgraph_adj_matrix)
        a = measure_ascendency(subgraph_adj_matrix)
        r = measure_redundancy(subgraph_adj_matrix)

        return ami, dc, a, r

    def find_all_subgraphs(self):
        """Build and store the adjacency matrix of every large community."""
        for community_id in self.large_communities:
            graph_df = self.get_subgraph_by_community_id(community_id)
            self.subgraphs[community_id] = self.pandas_edge_list_to_adj_matrix(graph_df, community_id)

    def calculate_metrics_for_all_subgraphs(self):
        """Compute the four metrics for every stored subgraph, keyed by metric name."""
        for community_id, adj_matrix in self.subgraphs.items():
            metrics = self.calculate_metrics(adj_matrix)
            self.metrics_by_community[community_id] = dict(zip(self.metric_names, metrics))

    def pandas_edge_list_to_adj_matrix(self, graph_df, community_id):
        """Convert an edge-list DataFrame into a dense weighted adjacency matrix.

        Parameters
        ----------
        graph_df : pandas.DataFrame
            Columns ``From``, ``To`` and ``value``. May be completely empty.
        community_id : hashable
            Key under which the address -> matrix index mapping is stored in
            ``self.address_to_node_by_subgraph``.

        Returns
        -------
        numpy.ndarray
            Square float matrix; entry ``[i, j]`` holds the ``value`` of the
            edge from address ``i`` to address ``j``. Indices are assigned in
            order of first appearance (``From`` before ``To`` within a row).
            When the same pair occurs more than once the last row wins.
        """
        address_to_node = {}
        has_edges = not graph_df.empty  # an empty frame may lack the columns entirely
        if has_edges:
            for source, target in zip(graph_df["From"], graph_df["To"]):
                for address in (source, target):
                    if address not in address_to_node:
                        address_to_node[address] = len(address_to_node)

        size = len(address_to_node)
        adj_matrix = np.zeros((size, size))
        self.address_to_node_by_subgraph[community_id] = address_to_node

        if has_edges:
            for source, target, value in zip(graph_df["From"], graph_df["To"], graph_df["value"]):
                adj_matrix[address_to_node[source], address_to_node[target]] = value

        return adj_matrix

    def _reset_community_state(self):
        """Forget per-community results of a previous run (the "full" mapping is kept)."""
        self.large_communities = []
        self.addresses_in_large_communities = {}
        self.subgraphs = {}
        self.metrics_by_community = {}
        self.centrality_scores_by_community = {}
        self.address_to_node_by_subgraph = {
            key: mapping
            for key, mapping in self.address_to_node_by_subgraph.items()
            if key == "full"
        }

    def perform_analysis(self, louvain_threshold, min_community_size):
        """Run the whole pipeline: Louvain, community extraction, metrics, centrality.

        Per-community state from an earlier call is discarded first, because
        community ids are rewritten by every Louvain run.
        """
        self._reset_community_state()
        self.remove_all_community_ids()
        self.run_louvain(louvain_threshold)
        self.find_large_communities(min_community_size)
        self.get_all_addresses_from_large_communities()
        self.find_all_subgraphs()
        self.calculate_centrality_score_for_every_community()
        self.calculate_metrics_for_all_subgraphs()
        self.calculate_degree_centrality_for_all_nodes()

    def print_metrics_by_community(self):
        """Print mean, standard deviation, max and min of each metric across communities."""
        metrics_by_community = self.metrics_by_community

        if not metrics_by_community:
            # np.max / np.min raise on empty input, so report and stop instead.
            print("No community metrics available")
            return

        for metric_name in self.metric_names:
            values = np.array(
                [metrics_by_community[key][metric_name] for key in metrics_by_community],
                dtype=float,
            )
            print("Mean of ", metric_name, " is ", np.average(values))
            print("Standard Deviation of ", metric_name, " is ", np.std(values))
            print("Max of ", metric_name, " is ", np.max(values))
            print("Min of ", metric_name, " is ", np.min(values))
            print(" ")

    def find_shortest_path(self, node1_id, node2_id):
        """Return the node ids along a shortest directed path (at most 15 hops)."""
        query_string = """MATCH (a1:Address { id: "%s" }),(a2:Address { id: "%s" }), path = shortestPath((a1)-[*..15]->(a2))
                            RETURN path""" %(node1_id, node2_id)

        path = self.conn.query(query_string)
        node_ids = []

        for record in path:
            for node in record["path"].nodes:
                node_ids.append(node.get('id'))

        return node_ids

    def find_all_shortest_paths(self, node1_id, node2_id):
        """Return node ids of all shortest directed paths (at most 15 hops).

        The ids of every path are appended to one flat list, in the order the
        paths are returned; path boundaries are not preserved.
        """
        query_string = """MATCH (a1:Address { id: "%s" }),(a2:Address { id: "%s" }), path = allShortestPaths((a1)-[*..15]->(a2))
                            RETURN path""" %(node1_id, node2_id)

        paths = self.conn.query(query_string)
        node_ids = []

        for record in paths:
            for node in record["path"].nodes:
                node_ids.append(node.get('id'))

        return node_ids
