In [3]:
import pickle
import re
from collections import deque

# Root directory holding the pre-computed dependency graphs.
DATA_PATH = '/home/gangda/workspace/auto-search-agent/graph_encoder/DATA'

# Repository instance whose dependency graph is explored in this notebook.
repo = 'django__django-10914'

# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load graph pickles that come from a trusted source.
# The `with` block guarantees the file handle is closed after loading.
with open(DATA_PATH + f'/dependency_graph_v2/{repo}.pkl', 'rb') as graph_file:
    G = pickle.load(graph_file)

In [29]:
# Peek at a 20-node window of the graph's node ids (iterating the node view yields the ids).
list(G.nodes)[100:120]

['release/fabfile.py:release',
 'release/fabfile.py:source_tarball',
 'release/fabfile.py:build_docs',
 'release/fabfile.py:copy_release_files',
 'release/fabfile.py:show_files',
 'release/fabfile.py:compare_tar_against_git',
 'release/fabfile.py:md5',
 'release/fabfile.py:size',
 'release/fabfile.py:table',
 'release/fabfile.py:table.tag',
 'release/fabfile.py:table.a_href',
 'release/fabfile.py:get_tarball_name',
 'release/fabfile.py:tarball_formatter',
 'release/fabfile.py:get_previous_version_tag',
 'release/fabfile.py:get_authors',
 'release/fabfile.py:get_authors.lastnamekey',
 'release/fabfile.py:print_authors',
 'release/fabfile.py:check_tag_exists',
 'release/fabfile.py:update_websites',
 'release/fabfile.py:get_location']

In [34]:
class RepoSearcher:
    """Query helper around a repository dependency graph.

    Node ids have the form ``'<file path>'`` or ``'<file path>:<module name>'``.
    Every node carries a ``'type'`` attribute and every edge carries a
    ``'type'`` attribute (e.g. ``'contains'``, ``'invokes'``).

    NOTE(review): the graph is assumed to be a networkx multi-digraph, i.e.
    ``get_edge_data(u, v)`` returns a mapping ``edge key -> attribute dict``.
    Confirm against the code that builds the pickle.
    """

    def __init__(self, graph):
        """Store the graph that all queries run against."""
        self.G = graph

    def subgraph(self, nids):
        """Return the edges induced by ``nids`` plus the data of those nodes.

        Args:
            nids: iterable of node ids.

        Returns:
            ``(edges, node_data)`` where ``edges`` is a list of
            ``(src, dst, edge_type)`` tuples and ``node_data`` is the
            ``get_data`` record of each requested node, in input order.
        """
        # Materialise once: the ids are traversed twice below, which would
        # silently yield nothing the second time for a one-shot iterator.
        nids = list(nids)
        subg = self.G.subgraph(nids)
        edges = list(subg.edges(data='type'))
        node_data = self.get_data(nids)
        return edges, node_data

    def one_hop_neighbors(self, nid, return_data=False):
        """Return the direct neighbours (predecessors and successors) of ``nid``.

        Args:
            nid: id of the centre node.
            return_data: when False, return a set of neighbour ids. When True,
                return a list with one record per connecting edge; each record
                is the neighbour's ``get_data`` dict extended with a
                ``'relation'`` key. Incoming edges are labelled
                ``'<edge type>-by'``, outgoing edges ``'<edge type>'``.
        """
        if not return_data:
            return set(self.G.predecessors(nid)) | set(self.G.successors(nid))

        neigh_data = []
        for pn in self.G.predecessors(nid):
            base = self.get_data([pn])[0]
            for attr in self.G.get_edge_data(pn, nid).values():
                # Build a fresh dict per edge so parallel edges keep their
                # own relation instead of sharing one mutated record.
                neigh_data.append({**base, 'relation': attr['type'] + '-by'})
        for sn in self.G.successors(nid):
            base = self.get_data([sn])[0]
            for attr in self.G.get_edge_data(nid, sn).values():
                neigh_data.append({**base, 'relation': attr['type']})

        return neigh_data

    def two_hop_neighbors(self, nid, return_data=False):
        """Return the union of the one-hop neighbourhoods of ``nid``'s neighbours.

        The result is not restricted to nodes at distance exactly two: ``nid``
        itself (and any one-hop neighbour adjacent to another one) is included
        whenever ``nid`` has at least one neighbour.

        Returns:
            A set of node ids, or their ``get_data`` records when
            ``return_data`` is True.
        """
        two_hop = set()
        for neighbor in self.one_hop_neighbors(nid):
            two_hop.update(self.one_hop_neighbors(neighbor))

        return self.get_data(two_hop) if return_data else two_hop

    def dfs(self, root_nid, depth):
        """Depth-limited depth-first traversal over undirected adjacency.

        A node is only expanded if the level at which it is first reached is
        below ``depth``.

        Returns:
            Node ids in visiting order, starting with ``root_nid``.
        """
        visited = []
        seen = set()  # O(1) membership test; `visited` keeps the order
        stack = [(root_nid, 0)]
        while stack:
            nid, level = stack.pop()
            if nid in seen:
                continue
            seen.add(nid)
            visited.append(nid)
            if level < depth:
                stack.extend(
                    (n, level + 1)
                    for n in self.one_hop_neighbors(nid)
                    if n not in seen
                )
        return visited

    def bfs(self, root_nid, depth):
        """Breadth-first traversal over undirected adjacency, up to ``depth`` hops.

        Returns:
            Node ids in visiting order, starting with ``root_nid``.
        """
        visited = []
        seen = set()  # O(1) membership test; `visited` keeps the order
        queue = deque([(root_nid, 0)])
        while queue:
            nid, level = queue.popleft()
            if nid in seen:
                continue
            seen.add(nid)
            visited.append(nid)
            if level < depth:
                queue.extend(
                    (n, level + 1)
                    for n in self.one_hop_neighbors(nid)
                    if n not in seen
                )
        return visited

    def get_all_nodes_by_file(self, file_pattern, ntype=None):
        """Return data for all nodes contained in files matching ``file_pattern``.

        Args:
            file_pattern: regular expression applied with ``re.match`` (so it
                is anchored at the start of the node id) to every node whose
                type is ``'file'``.
            ntype: ``'function'``, ``'class'`` or None (no type filter).
        """
        matcher = re.compile(file_pattern)
        all_inner_nodes = []
        for node, _ntype in self.G.nodes(data='type'):
            if _ntype == 'file' and matcher.match(node):
                all_inner_nodes.extend(self.get_all_inner_nodes(node, ntype))
        return self.get_data(all_inner_nodes)

    def get_all_inner_nodes(self, src_node, ntype=None):
        """Collect every node reachable from ``src_node`` via ``'contains'`` edges.

        Args:
            src_node: id of the container node to start from.
            ntype: ``'function'``, ``'class'`` or None. When given, only nodes
                of that type are returned.

        Returns:
            List of node ids in pre-order.

        Raises:
            AssertionError: if ``ntype`` is not one of the accepted values.
        """
        assert ntype in ['function', 'class', None]
        inner_nodes = []
        for _, dst_node, attr in self.G.edges(src_node, data=True):
            if attr['type'] != 'contains':
                continue
            if ntype is None or self.G.nodes[dst_node]['type'] == ntype:
                inner_nodes.append(dst_node)
            # Always descend, even when this node is filtered out; otherwise
            # e.g. methods nested in a class are lost for ntype='function'.
            inner_nodes.extend(self.get_all_inner_nodes(dst_node, ntype))
        return inner_nodes

    def get_data(self, nids):
        """Return a summary record for each node id in ``nids``.

        Each record holds ``file_path`` and ``module_name`` (the first and
        second ``':'``-separated parts of the id; ``module_name`` is ``''``
        when absent), the node's ``type``, and ``start_line`` / ``end_line``
        (0 when the node has no such attribute).
        """
        rtn = []
        for nid in nids:
            node = self.G.nodes[nid]
            path_list = nid.split(':')
            rtn.append(
                {
                    'file_path': path_list[0],
                    'module_name': path_list[1] if len(path_list) > 1 else '',
                    'type': node['type'],
                    # 'code': node['code'],
                    'start_line': node.get('start_line', 0),
                    'end_line': node.get('end_line', 0),
                }
            )
        return rtn

## search_dependency_graph_one_hop

In [36]:
searcher = RepoSearcher(G)

# Node whose direct neighbourhood (incoming and outgoing edges) is inspected.
nid = 'django/core/files/uploadedfile.py:TemporaryUploadedFile'
data = searcher.one_hop_neighbors(nid=nid, return_data=True)

# Number of neighbour records (one per connecting edge).
len(data)

11

## extract_subgraph_with_nodes

In [58]:
searcher = RepoSearcher(G)

# One file node plus three of its functions; the induced subgraph shows
# how they relate to one another.
nids = [
    'release/fabfile.py',
    'release/fabfile.py:show_files',
    'release/fabfile.py:compare_tar_against_git',
    'release/fabfile.py:tarball_formatter',
]
data = searcher.subgraph(nids=nids)

# (edges, node_data) tuple
data

([('release/fabfile.py:show_files',
   'release/fabfile.py:tarball_formatter',
   'invokes'),
  ('release/fabfile.py:compare_tar_against_git',
   'release/fabfile.py:show_files',
   'invokes'),
  ('release/fabfile.py', 'release/fabfile.py:show_files', 'contains'),
  ('release/fabfile.py',
   'release/fabfile.py:compare_tar_against_git',
   'contains'),
  ('release/fabfile.py', 'release/fabfile.py:tarball_formatter', 'contains')],
 [{'file_path': 'release/fabfile.py',
   'module_name': '',
   'type': 'file',
   'start_line': 0,
   'end_line': 0},
  {'file_path': 'release/fabfile.py',
   'module_name': 'show_files',
   'type': 'function',
   'start_line': 329,
   'end_line': 354},
  {'file_path': 'release/fabfile.py',
   'module_name': 'compare_tar_against_git',
   'type': 'function',
   'start_line': 446,
   'end_line': 486},
  {'file_path': 'release/fabfile.py',
   'module_name': 'tarball_formatter',
   'type': 'function',
   'start_line': 640,
   'end_line': 641}])

## get_all_nodes 

In [108]:
searcher = RepoSearcher(G)

# File node ids that start with 'release/', have no further '/' and end in 'file.py'.
pattern = r'release\/[^\/]*file\.py$'
data = searcher.get_all_nodes_by_file(file_pattern=pattern, ntype='class')

# Records of every class node found in the matching files.
data

[{'file_path': 'release/fabfile.py',
  'module_name': 'URLs',
  'type': 'class',
  'start_line': 1178,
  'end_line': 1209},
 {'file_path': 'release/fabfile.py',
  'module_name': 'AuthenticationFailed',
  'type': 'class',
  'start_line': 1212,
  'end_line': 1213}]

In [107]:
# NOTE(review): this cell was executed (In [107]) before the cell above it (In [108]),
# so the recorded output refers to an earlier value of `data`; re-run top to bottom to refresh.
len(data)

50