In [1]:
from lang_processors.cpp_processor import CppProcessor

# Project C++ language processor; later cells call its tokenize_code and
# detokenize_code methods.
processor = CppProcessor(
    root_folder='lang_processors/tree-sitter',
)

In [2]:
# Read the whole C++ sample file into memory as text.
with open('switch.cpp', mode='r', encoding='utf-8') as source_file:
    codes = source_file.read()

In [3]:
from clean_comment import exclude_comment
# Pass the raw source through the project's exclude_comment helper
# (presumably strips comments, going by its name — confirm in clean_comment).
out = exclude_comment(codes)
# print(out)  # uncomment to inspect the helper's output

In [4]:
from tree_sitter import Language, Parser

# Create the parser first, then load the compiled C++ grammar and attach it.
parser = Parser()
CPP = Language('lang_processors/tree-sitter/cpp.so', 'cpp')
parser.set_language(CPP)

In [5]:
# Drop carriage returns, show the cleaned source, then parse its UTF-8
# encoding into a syntax tree.
out = out.replace("\r", "")
print(out)
tree = parser.parse(out.encode("utf8"))

#include <iostream>
using namespace std;
int num = 0;
int main() {
    switch (num) {
    case 0:
        cout << "0" << endl;
    case 1:
        cout << "1" << endl;
    case 2:
        cout << "2" << endl;
    default:
        cout << "default" << endl;
    }
}


In [6]:
# Node types that are emitted as one token even though they have children.
ast_nodes_type_string = ["comment", "string_literal", "character_literal"]
def dfs(code, node, tokens, tokens_type, verbose=True):
    """Collect the leaf tokens of a syntax tree in depth-first order.

    Args:
        code: Source the tree was parsed from, as ``str`` or ``bytes``.
        node: Current tree node; must expose ``children``, ``type``,
            ``start_byte`` and ``end_byte``.
        tokens: List extended in place with the text of each token.
        tokens_type: List extended in place with the matching node types.
        verbose: When true (the default), print every visited node.

    Returns:
        None. Results are delivered through ``tokens`` and ``tokens_type``.
    """
    # Node offsets are byte offsets, so slice the UTF-8 bytes rather than the
    # str: character indices drift as soon as the source contains non-ASCII
    # text. Recursive calls receive bytes, so the encoding happens only once.
    if isinstance(code, str):
        code = code.encode("utf8")
    if verbose:
        print(node)
    if len(node.children) == 0 or node.type in ast_nodes_type_string:
        snippet = code[node.start_byte: node.end_byte]
        if isinstance(snippet, bytes):
            snippet = snippet.decode("utf8")
        # Zero-width nodes produce no token.
        if len(snippet) > 0:
            tokens.append(snippet)
            tokens_type.append(node.type)
        return
    for child in node.children:
        dfs(code, child, tokens, tokens_type, verbose)

In [7]:
# Flatten the syntax tree into two parallel lists: token text and node type.
tokens = []
tokens_types = []
# dfs fills both lists in place and has no return value, so wrapping the call
# in print() only emitted a stray "None".
dfs(out, tree.root_node, tokens, tokens_types)

<Node type=translation_unit, start_point=(0, 0), end_point=(14, 1)>
<Node type=preproc_include, start_point=(0, 0), end_point=(1, 0)>
<Node type="#include", start_point=(0, 0), end_point=(0, 8)>
<Node type=system_lib_string, start_point=(0, 9), end_point=(0, 19)>
<Node type=using_declaration, start_point=(1, 0), end_point=(1, 20)>
<Node type="using", start_point=(1, 0), end_point=(1, 5)>
<Node type="namespace", start_point=(1, 6), end_point=(1, 15)>
<Node type=identifier, start_point=(1, 16), end_point=(1, 19)>
<Node type=";", start_point=(1, 19), end_point=(1, 20)>
<Node type=declaration, start_point=(2, 0), end_point=(2, 12)>
<Node type=primitive_type, start_point=(2, 0), end_point=(2, 3)>
<Node type=init_declarator, start_point=(2, 4), end_point=(2, 11)>
<Node type=identifier, start_point=(2, 4), end_point=(2, 7)>
<Node type="=", start_point=(2, 8), end_point=(2, 9)>
<Node type=number_literal, start_point=(2, 10), end_point=(2, 11)>
<Node type=";", start_point=(2, 11), end_point=(2,

In [8]:

# Regroup the flat token stream into one token list per output line, then
# print every non-empty line.
#
# State of the if / for / while headers (if_flag, for_flag, while_flag):
#   0 = not in a header
#   1 = keyword seen, header parentheses not yet closed
#   2 = header parentheses closed; if the next token is neither '{' nor ';'
#       the header is flushed as a line of its own (brace-less body)
# else_flag / do_flag are 1 right after `else` / `do`.
# parentheses_flag is the nesting depth of parentheses inside a header.
# NOTE: a ternary ':' outside a control header still ends the current line.
lines = []
line = []
include_flag = 0
for_flag = 0
if_flag = 0
while_flag = 0
parentheses_flag = 0
else_flag = 0
do_flag = 0
case_default_flag = 0
for token, token_type in zip(tokens, tokens_types):
    # A closed if/for/while header followed by a brace-less body: emit the
    # header before handling the first token of the body.
    if (token_type != '{' and token_type != ';') and (if_flag == 2 or for_flag == 2 or while_flag == 2):
        lines.append(line)
        line = []
        if_flag = 0
        for_flag = 0
        while_flag = 0

    # `else` / `do` followed by a brace-less body ("else if" stays on one line).
    if (token_type != 'if' and token_type != '{') and (else_flag == 1 or do_flag == 1):
        lines.append(line)
        line = []
        else_flag = 0
        do_flag = 0

    if token_type == '{':
        line.append(token)
        lines.append(line)
        line = []
        for_flag = 0
        if_flag = 0
        else_flag = 0
        do_flag = 0
        while_flag = 0
    elif token_type == '}':
        lines.append(line)
        # Wrap the brace in a list so every entry of `lines` is a token list.
        lines.append([token])
        line = []
    elif token_type == 'if':
        line.append(token)
        if_flag = 1
        else_flag = 0
    elif token_type == 'else':
        line.append(token)
        else_flag = 1
    elif token_type == 'do':
        line.append(token)
        do_flag = 1
    elif token_type == 'while':
        line.append(token)
        while_flag = 1
    elif token_type == 'case' or token_type == 'default':
        line.append(token)
        case_default_flag = 1
    elif token_type == ':':
        line.append(token)
        # Inside a control header (e.g. a range-based for) ':' must not split
        # the header; elsewhere it ends the line (case / default labels).
        if parentheses_flag == 0:
            lines.append(line)
            line = []
        case_default_flag = 0
    elif token_type == '(' and (if_flag == 1 or for_flag == 1 or while_flag == 1):
        line.append(token)
        parentheses_flag += 1
    elif token_type == ')' and (if_flag == 1 or for_flag == 1 or while_flag == 1):
        line.append(token)
        parentheses_flag -= 1
        if parentheses_flag == 0:
            # Header closed: advance whichever header is open. These must be
            # assignments; `for_flag == 2` was a comparison with no effect.
            if if_flag == 1:
                if_flag = 2
            if for_flag == 1:
                for_flag = 2
            if while_flag == 1:
                while_flag = 2
    elif token_type == '#include':
        line.append(token)
        include_flag = 1
    elif token_type == 'string_literal' or token_type == 'system_lib_string':
        line.append(token)
        # The path operand ends an #include directive.
        if include_flag == 1:
            lines.append(line)
            line = []
            include_flag = 0
    elif token_type == 'for':
        line.append(token)
        for_flag = 1
    elif token_type == ';':
        line.append(token)
        # ';' ends the line except while the for-header is still open, where
        # it only separates the init / condition / step clauses.
        if for_flag != 1:
            lines.append(line)
            line = []
    else:
        line.append(token)

# Keep tokens that follow the last terminator instead of dropping them.
if line:
    lines.append(line)
    line = []

for line_tokens in lines:
    code = ' '.join(line_tokens)
    if code != '':
        print(code)

#include <iostream>
using namespace std ;
int num = 0 ;
int main ( ) {
switch ( num ) {
case 0 :
cout << "0" << endl ;
case 1 :
cout << "1" << endl ;
case 2 :
cout << "2" << endl ;
default :
cout << "default" << endl ;
}
}


In [9]:
# Run every regrouped line through the project's C++ tokenizer and show the
# result. `result` is rebound on each pass, so after the loop it holds the
# tokens of the last line only.
for line in lines:
    joined_line = ' '.join(line)
    result = processor.tokenize_code(joined_line)
    print(result)

['#include', '<iostream>']
['using', 'namespace', 'std', ';']
['int', 'num', '=', '0', ';']
['int', 'main', '(', ')', '{']
['switch', '(', 'num', ')', '{']
['case', '0', ':']
['cout', '<<', '"0"', '<<', 'endl', ';']
['case', '1', ':']
['cout', '<<', '"1"', '<<', 'endl', ';']
['case', '2', ':']
['cout', '<<', '"2"', '<<', 'endl', ';']
['default', ':']
['cout', '<<', '" default "', '<<', 'endl', ';']
[]
['}']
[]
['}']


In [10]:
# NOTE(review): `result` was reassigned on every iteration of the loop in the
# previous cell, so only the last line's tokens (here just '}') are passed to
# detokenize_code. If the whole program is meant to be rebuilt, the tokens of
# every line need to be collected first — TODO confirm intent.
result2 = processor.detokenize_code(result)
print(result2)

}



In [None]:
# This cell has no execution count, i.e. it was not run in the saved notebook.
# NOTE(review): `formatter` is also the name of a deprecated standard-library
# module (removed in Python 3.10); presumably a project-local formatter.py is
# intended — confirm that it is the module found first on sys.path.
from formatter import c_formatter
c_formatter()