# Playing with tree2vec

In [2]:
import clang.cindex

In [3]:
import os
import pandas as pd

In [4]:
# This cell might not be needed for you.
clang.cindex.Config.set_library_file('/lib/x86_64-linux-gnu/libclang-8.so.1')

Load in the juliet data set, and pick the first data point as an example

In [5]:
juliet = pd.read_csv("../data/juliet.csv.zip")

In [7]:
example = juliet.iloc[0]

Instantiate the clang parser and give it our example. We use `unsaved_files` to tell it to parse a file that doesn't actually exist on disk.

In [8]:
index = clang.cindex.Index.create()
translation_unit = index.parse(path=example.filename, unsaved_files=[(example.filename, example.code)])

`root` is the root note of the AST. Try to explore and figure out what this all means! It's pretty dense ha

In [9]:
root = translation_unit.cursor

I found this nice tutorial that helps to explain how the Python clang bindings canbe used to explore AST's: https://github.com/FraMuCoder/PyClASVi/blob/master/doc/python_clang_usage.md

In [10]:
def print_ast(cursor, deep=0):
    print(' '.join((deep*'    ', str(cursor.kind), str(cursor.spelling))))
    for child in cursor.get_children():
        print_ast(child, deep+1)

In [11]:
identifier = 1

def number_each_node(node):
    global identifier
    
    node.identifier = identifier
    identifier += 1
    
    node.children = list(node.get_children())
    for child in node.children:
        number_each_node(child)
        
number_each_node(root)

In [12]:
import snap

In [13]:
tree = snap.TNGraph.New()

def tree2edgelist(node):
    tree.AddNode(node.identifier)
    
    for child in node.children:
        tree2edgelist(child)
        tree.AddEdge(node.identifier, child.identifier)
        
tree2edgelist(root)

In [14]:
snap.SaveEdgeList(tree, '../data/mygraph.txt')

In [15]:
with open('../data/mygraph.txt') as f:
    print(f.read())

# Directed graph: ../data/mygraph.txt 
# Nodes: 2538 Edges: 2537
# FromNodeId	ToNodeId
1	2
1	3
1	4
1	5
1	6
1	7
1	18
1	30
1	32
1	33
1	35
1	36
1	38
1	48
1	50
1	52
1	53
1	56
1	59
1	62
1	65
1	71
1	78
1	82
1	87
1	93
1	100
1	104
1	106
1	112
1	114
1	117
1	120
1	123
1	126
1	129
1	132
1	135
1	137
1	140
1	143
1	146
1	152
1	154
1	156
1	158
1	162
1	166
1	171
1	173
1	175
1	177
1	179
1	181
1	183
1	185
1	187
1	191
1	195
1	199
1	204
1	209
1	214
1	219
1	221
1	223
1	228
1	233
1	237
1	239
1	244
1	249
1	252
1	258
1	262
1	264
1	268
1	273
1	276
1	281
1	286
1	289
1	294
1	300
1	304
1	310
1	314
1	318
1	320
1	325
1	330
1	333
1	334
1	338
1	344
1	346
1	347
1	348
1	349
1	350
1	351
1	352
1	353
1	354
1	355
1	356
1	357
1	358
1	360
1	362
1	364
1	366
1	368
1	370
1	372
1	374
1	375
1	376
1	377
1	378
1	379
1	380
1	381
1	382
1	383
1	384
1	385
1	386
1	387
1	388
1	391
1	395
1	396
1	397
1	398
1	399
1	400
1	401
1	402
1	403
1	404
1	405
1	406
1	407
1	408
1	409
1	410
1	411
1	412
1	413
1	414
1	415
1	416
1	417
1	419
1	420
1	421
1	4

In [16]:
def print_node_identifiers(node):
    print(str(node.kind) + ' ' + str(node.identifier))
    for child in node.children:
        print_ast(child) 

### Converting AST to the tree2vec format

Example:
```
[
   {
      "type":"FUN",
      "children":[
         {
            "type":"MODIFIER_LIST",
            "children":[
               {
                  "type":"override",
               }
            ]
         },
         {
            "type":"IDENTIFIER",
         },
         {
            "type":"VALUE_PARAMETER_LIST",
            "children":[
               {
                  "type":"LPAR",
               },
               {
                  "type":"VALUE_PARAMETER",
                  "children":[
                     {
                         "type":"IDENTIFIER",
                     }
                  ]
               }
            ]
         }
      ]
   }
]
```

In [19]:
root.kind

CursorKind.TRANSLATION_UNIT

In [20]:
root.children

[<clang.cindex.Cursor at 0x7f81db5a6a60>,
 <clang.cindex.Cursor at 0x7f81db5a6950>,
 <clang.cindex.Cursor at 0x7f81db5a6840>,
 <clang.cindex.Cursor at 0x7f81db5a6730>,
 <clang.cindex.Cursor at 0x7f81db5a6620>,
 <clang.cindex.Cursor at 0x7f81db5a6598>,
 <clang.cindex.Cursor at 0x7f81db5a6488>,
 <clang.cindex.Cursor at 0x7f81db5a6400>,
 <clang.cindex.Cursor at 0x7f81db5a6378>,
 <clang.cindex.Cursor at 0x7f81db5a60d0>,
 <clang.cindex.Cursor at 0x7f81db5a62f0>,
 <clang.cindex.Cursor at 0x7f82045dc598>,
 <clang.cindex.Cursor at 0x7f8204516378>,
 <clang.cindex.Cursor at 0x7f8204516488>,
 <clang.cindex.Cursor at 0x7f82045168c8>,
 <clang.cindex.Cursor at 0x7f82086bd8c8>,
 <clang.cindex.Cursor at 0x7f82086bdbf8>,
 <clang.cindex.Cursor at 0x7f82086bdb70>,
 <clang.cindex.Cursor at 0x7f81db5c3048>,
 <clang.cindex.Cursor at 0x7f81db5c30d0>,
 <clang.cindex.Cursor at 0x7f81db5c3158>,
 <clang.cindex.Cursor at 0x7f81db5c31e0>,
 <clang.cindex.Cursor at 0x7f81db5c3268>,
 <clang.cindex.Cursor at 0x7f81db5

In [23]:
def convert_to_json(node):
    node_json = {}
    node_json['type'] = str(node.kind)
    
    if len(node.children) > 0:
        node_json['children'] = [convert_to_json(child) for child in node.children]
        
    return node_json


In [24]:
convert_to_json(root)

{'type': 'CursorKind.TRANSLATION_UNIT',
 'children': [{'type': 'CursorKind.TYPEDEF_DECL'},
  {'type': 'CursorKind.TYPEDEF_DECL'},
  {'type': 'CursorKind.TYPEDEF_DECL'},
  {'type': 'CursorKind.TYPEDEF_DECL'},
  {'type': 'CursorKind.TYPEDEF_DECL'},
  {'type': 'CursorKind.STRUCT_DECL',
   'children': [{'type': 'CursorKind.FIELD_DECL'},
    {'type': 'CursorKind.UNION_DECL',
     'children': [{'type': 'CursorKind.FIELD_DECL'},
      {'type': 'CursorKind.FIELD_DECL',
       'children': [{'type': 'CursorKind.INTEGER_LITERAL'}]}]},
    {'type': 'CursorKind.FIELD_DECL',
     'children': [{'type': 'CursorKind.UNION_DECL',
       'children': [{'type': 'CursorKind.FIELD_DECL'},
        {'type': 'CursorKind.FIELD_DECL',
         'children': [{'type': 'CursorKind.INTEGER_LITERAL'}]}]}]}]},
  {'type': 'CursorKind.TYPEDEF_DECL',
   'children': [{'type': 'CursorKind.STRUCT_DECL',
     'children': [{'type': 'CursorKind.FIELD_DECL'},
      {'type': 'CursorKind.UNION_DECL',
       'children': [{'type': 'C

In [25]:
import json

In [26]:
with open('../data/tree2vec_example.json', 'w') as file:
    json.dump(convert_to_json(root), file)