# Code miner

In [None]:
import pandas as pd
import os
import IPython
import time

In [None]:
# Root directories of the repositories that are scanned for *.java files
repos_dir = [r'./reps/java-design-patterns']
# repos_dir = [r'./reps/test']

# Alternative (non-test) output file names:
# dataset_name = 'parser_dataset.csv'
# files_progress_name = 'file_progress.txt'

# CSV file with the collected samples, and text file with the list of
# already processed source files
dataset_name = 'parser_dataset_test.csv'
files_progress_name = 'file_progress_test.txt'



# Horizontal rule printed between sections of the text GUI
sep = '-'*40

# Symbol that separates completed file paths inside the progress file
dir_sep = '$'

# Pause (in seconds) before each prompt; decrease it if you want
# the text to disappear faster
sleep_time = 2

# Keywords which the parser tries to find in the code.
# The boolean in each tuple tells whether the keyword must be matched
# as a separate word (True) or may appear anywhere, even inside
# another word (False).
search_keywords = [('for', True), ('while', True), ('iterator', False)]

# Category names must not start with the same letter, because
# show_to_user accepts the first letter as a shortcut for the full name!
categories_available = ['loop_control', 'iterator', 'maybe_loop_control', 'break_loop_control']

# Column layout of the dataset; also used as the keys of new_data
columns_list = ['Name', 'Code', 'Category']

In [None]:
# Try to open the existing dataset; start with an empty one if there is none.
# new_data accumulates the samples labelled during this session.
new_data = {col: [] for col in columns_list}
try:
    dataset = pd.read_csv(dataset_name)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing or empty file means "no dataset yet". Any other failure
    # (e.g. a malformed CSV) is deliberately not swallowed: falling back to an
    # empty frame here would let the save cell overwrite the existing file.
    dataset = pd.DataFrame(columns = columns_list)

dataset.info()

In [None]:
# Check for columns compatibility.
# Plain lists are compared so that a different number of columns fails the
# assertion with a readable message instead of raising inside the
# element-wise Index comparison.
assert list(dataset.columns) == columns_list, (
    f'Dataset columns {list(dataset.columns)} do not match {columns_list}'
)

In [None]:
def text_from_file(path):
    """Return the whole content of the file at `path`, decoded as UTF-8.

    The file is read in binary mode, so line endings are kept as they are.
    """
    with open(path, 'rb') as source:
        return source.read().decode('UTF-8')

In [None]:
def print_gui(code_fragment, keyword_found):
    """Print the labelling instructions followed by the code fragment.

    code_fragment -- source text shown to the user
    keyword_found -- keyword that triggered the prompt
    """
    screen = [
        sep,
        f'Parser found keyword {keyword_found}!\n',
        'List all needed variables in the form:',
        '"varName typeName"',
        f'There are only the following variable categories: {categories_available}.',
        'You can write either full category name, or only first letter.\n',
        'To end -- press Enter with empty line. To restart -- type "$r" + Enter',
        'Invalid input would result in force restart.',
        sep,
        'Code fragment:',
        '',
        code_fragment,
        sep,
    ]
    # One print call per entry keeps the output identical line by line
    for entry in screen:
        print(entry)
    
    
def show_to_user(code_fragment, keyword_found, new_data):
    """Ask the user to label variables of `code_fragment` and store the answers.

    The prompt is repeated until the user finishes a complete, valid list.
    Each input line must have the form "varName category", where the category
    is a full name from `categories_available`, its first letter, or its
    first two letters. An empty line finishes the list; a line containing
    "$r", or any invalid line, discards the current list and restarts.

    code_fragment -- source text shown to the user; every variable name
                     must occur in it as a substring
    keyword_found -- keyword that triggered the prompt (display only)
    new_data      -- dict with 'Name', 'Code' and 'Category' lists; one entry
                     per labelled variable is appended to each list
    """
    while True:
        time.sleep(sleep_time)

#         IPython.display.clear_output()
        print_gui(code_fragment, keyword_found)

        # Fresh, independent lists for every attempt
        names = []
        categories = []
        restart = False

        line = input()
        while line != '' and '$r' not in line:
            # split() without arguments ignores repeated/leading/trailing
            # spaces, so an empty string can never be taken as a name
            words = line.split()
            if len(words) != 2:
                print('Invalid words amount! Restarting...')
                restart = True
                break

            name, requested = words
            if name not in code_fragment:
                print('Invalid variable name! Restarting...')
                restart = True
                break

            # First category matching the full name or one of its shortcuts
            category = next(
                (cat for cat in categories_available
                 if requested in (cat, cat[0], cat[0:2])),
                None,
            )
            if category is None:
                print('Invalid category! Restarting...')
                restart = True
                break

            names.append(name)
            categories.append(category)
            line = input()

        if restart or '$r' in line:
            print("Restarting...")
            continue
        break

    new_data['Name'].extend(names)
    # The same fragment is stored once per labelled variable
    new_data['Code'].extend([code_fragment] * len(names))
    new_data['Category'].extend(categories)

    print(sep)
    print('New samples successfully added to database!')
   

In [None]:
def try_find_variables_depricated(code, new_data):
    """Older keyword search that looks only at the first occurrence of each key.

    For every entry of `search_keywords` the first occurrence in the
    lower-cased code is examined; if it qualifies (for whole-word keys it must
    not be surrounded by alphanumeric characters) the fragment is shown to
    the user and True is returned. Returns False when no keyword qualifies.
    """
    lowered = code.lower()
    for key, isWord in search_keywords:
        pos = lowered.find(key)
        if pos == -1:
            continue

        if isWord:
            before = pos - 1
            after = pos + len(key)
            # Reject the hit when it is glued to an alphanumeric neighbour
            if before != -1 and lowered[before].isalnum():
                continue
            if after < len(code) and lowered[after].isalnum():
                continue

        show_to_user(code, key, new_data)
        return True

    return False

In [None]:
def check(code, idx, key, isWord):
    """Tell whether `key` occurs in `code` exactly at position `idx`.

    code   -- text to inspect
    idx    -- position at which the key must start
    key    -- keyword to look for
    isWord -- if True, the key must also stand alone as a word: the
              characters right before and right after it (if any) must not
              be identifier characters (letters, digits, '_' or '$'), so
              that names such as `my_for` or `for_each` are not mistaken
              for the keyword `for`

    Returns True on a match, False otherwise.
    """
    end = idx + len(key)

    if code[idx:end] != key:
        return False
    if not isWord:
        return True

    def is_identifier_char(ch):
        return ch.isalnum() or ch in '_$'

    starts_word = idx == 0 or not is_identifier_char(code[idx - 1])
    ends_word = end >= len(code) or not is_identifier_char(code[end])
    return starts_word and ends_word


def _closing_quote_idx(text, start):
    """Return the index of the quote closing the string opened at `start`.

    Characters following a backslash are skipped, so an escaped quote does
    not end the string. Returns -1 when the string is never closed.
    """
    pos = start + 1
    while pos < len(text):
        if text[pos] == '\\':
            pos += 2
        elif text[pos] == '"':
            return pos
        else:
            pos += 1
    return -1


def try_find_variables(code, new_data):
    """Search `code` for any of `search_keywords` outside comments and strings.

    The lower-cased code is scanned once per keyword. Block comments
    (/* ... */), line comments (// ...) and double-quoted string literals are
    skipped. On the first match the original fragment is passed to
    `show_to_user` and True is returned; False is returned when nothing is
    found. An unterminated comment or string ends the scan for the current
    keyword instead of restarting it from the beginning.

    code     -- source fragment to inspect
    new_data -- dict of lists forwarded to `show_to_user`
    """
    lcode = code.lower()
    end = len(lcode)

    for (key, isWord) in search_keywords:
        cur_idx = 0

        while cur_idx < end:
            if lcode.startswith('/*', cur_idx):
                close = lcode.find('*/', cur_idx + 2)
                if close == -1:
                    break
                cur_idx = close + 2
                continue
            if lcode.startswith('//', cur_idx):
                newline = lcode.find('\n', cur_idx)
                if newline == -1:
                    break
                cur_idx = newline + 1
                continue
            if lcode[cur_idx] == '"':
                close = _closing_quote_idx(lcode, cur_idx)
                if close == -1:
                    break
                cur_idx = close + 1
                continue

            if check(lcode, cur_idx, key, isWord):
                show_to_user(code, key, new_data)
                return True

            cur_idx += 1

    return False

In [None]:
# (path, source text) pairs of every *.java file found, and their count
code_samples = []
files_amount = 0

# Walk through every repository from the list and remember
# the path and the content of each Java source file
for rep in repos_dir:
    walk = os.walk(rep)

    for root, dirs, files in walk:
        for file in files:
            if not file.endswith(".java"):
                continue
            path = root + '/' + file
            code = text_from_file(path)
            code_samples.append((path, code))
            files_amount += 1
files_amount

In [None]:
# Load the set of files that have already been processed
files_completed = set()

try:
    with open(files_progress_name, 'r') as f:
        # Empty entries are dropped: an empty progress file would otherwise
        # produce {''} and be counted as one completed file
        files_completed = {entry for entry in f.read().split(dir_sep) if entry}
except FileNotFoundError:
    # Nothing has been saved yet -- start with an empty set
    pass
except (OSError, UnicodeDecodeError) as err:
    # Keep going with empty progress, but do not hide the problem
    print(f'Could not read {files_progress_name}: {err}. Starting with empty progress.')

In [None]:
# Number of files already recorded as completed (echoed as the cell output)
current_progress = len(files_completed)
current_progress

In [None]:
def class_check(code, idx):
    """Tell whether the keyword `class` starts at `idx` as a separate token.

    The keyword counts only when it is delimited by whitespace (or by the
    beginning / end of the text) on both sides.
    """
    keyword = "class"
    stop = idx + len(keyword)

    if code[idx:stop] != keyword:
        return False

    # Something other than whitespace directly in front of the keyword
    if idx != 0 and not code[idx - 1].isspace():
        return False

    return stop >= len(code) or code[stop].isspace()

In [None]:
def _string_literal_end(text, start):
    """Return the index of the quote closing the string opened at `start`.

    Characters following a backslash are skipped, so an escaped quote does
    not end the string. Returns -1 when the string is never closed.
    """
    pos = start + 1
    while pos < len(text):
        if text[pos] == '\\':
            pos += 2
        elif text[pos] == '"':
            return pos
        else:
            pos += 1
    return -1


# Scan every collected file character by character, cut out each block that
# opens one brace level below a class body (presumably a method body) and
# hand it to try_find_variables for interactive labelling.
for (file, code) in code_samples:
    print(f"Iterating file{file}")
    print(sep)
    if file in files_completed:
        continue
    files_completed.add(file)

    idx = 0

    bracket_idx = 0         # current nesting depth of curly braces
    class_bracket_idx = 0   # number of class bodies currently open
    method_begin_idx = 0    # position of the '{' that opened the current block

    variableAdded = False
    if_method_proceeded = False   # True while inside a block being cut out
    # The three flags below are not read anywhere in the visible notebook;
    # they are kept so that the notebook's global names stay the same.
    if_brackets_open = False
    if_comment_open = False
    if_multiline_comment_open = False

    while idx < len(code):
        # String literals and comments are skipped as a whole. After every
        # skip the loop restarts, so the new position is examined from the
        # top (e.g. a comment directly followed by another comment). An
        # unterminated literal/comment ends the scan of this file.
        if code[idx] == '"':
            tmp = idx
            closing = _string_literal_end(code, idx)
            if closing == -1:
                break
            idx = closing + 1
            print(f"String{code[tmp:idx]} is skipped")
            continue
        if code.startswith('//', idx):
            tmp = idx
            closing = code.find('\n', idx)
            if closing == -1:
                break
            idx = closing + 1
            print(f"String {code[tmp:idx]} is skipped")
            continue
        if code.startswith('/*', idx):
            tmp = idx
            closing = code.find('*/', idx + 2)
            if closing == -1:
                break
            idx = closing + 2
            print(f"String {code[tmp:idx]} is skipped")
            continue

        if code[idx] == '{':
            bracket_idx += 1
            print(f"Open bracket detected! bracket_idx={bracket_idx}")
        if code[idx] == '}':
            bracket_idx -= 1
            print(f"Closed bracket detected! bracket_idx={bracket_idx}")

        if not if_method_proceeded:
            if class_check(code, idx):
                print(f"Keyword {code[idx:idx+len('class')]} was found!")
                brace = code.find('{', idx)
                if brace == -1:
                    # 'class' without a body: nothing more to parse here
                    break
                # Jump to the opening brace of the class body and count it
                # here, because the brace checks above were already done
                # for this iteration
                idx = brace
                bracket_idx += 1
                class_bracket_idx += 1
                print(f"Now, bracket_idx={bracket_idx} and class_bracket_idx={class_bracket_idx}")

            elif class_bracket_idx > bracket_idx:
                print(f"Class ended! Now, class_bracket_idx={class_bracket_idx}")
                class_bracket_idx -= 1

        if code[idx] == '{' and bracket_idx == class_bracket_idx+1 and not if_method_proceeded:
            print("Method started!")
            method_begin_idx = idx
            if_method_proceeded = True
        if code[idx] == '}' and bracket_idx == class_bracket_idx and if_method_proceeded:
            # 0 doubles as the "no block open" marker
            if method_begin_idx == 0:
                raise RuntimeError("Slicing error!")

            print("Method ended!")
            print(code[method_begin_idx : idx+1])
            print(sep)
            time.sleep(sleep_time)
            if try_find_variables(code[method_begin_idx : idx+1], new_data):
                variableAdded = True
            method_begin_idx = 0

            if_method_proceeded = not if_method_proceeded
        idx += 1

    if bracket_idx != 0:
        # Unbalanced braces: do not mark the file as completed
        files_completed.remove(file)
        raise NameError('Bracket error!')

    if variableAdded:
        print(f'You have finished {file}.')

IPython.display.clear_output()
print('All repos were parced!')

In [None]:
# Append the samples labelled in this session to the dataset and
# report its size before and after
print(f'Amount of examples was {len(dataset)}')
fresh_rows = pd.DataFrame(new_data)
dataset = pd.concat([dataset, fresh_rows], axis=0, ignore_index=True)
print(f'Amount of examples now {len(dataset)}')

In [None]:
# Save the progress to the same file it is loaded from, so that the
# completed files are actually restored in the next session
with open(files_progress_name, 'w') as f:
    f.write(dir_sep.join(files_completed))
dataset.to_csv(dataset_name, index = False)

# Resetting new_data, so that one can continue working after saving
# without restarting the kernel
new_data = {col: [] for col in columns_list}

In [None]:
# Use for checking purposes: re-read the saved dataset from disk
try:
    dataset = pd.read_csv(dataset_name)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing or empty file is treated as "no dataset"; other read
    # errors (e.g. a malformed CSV) are left visible instead of being
    # replaced by an empty frame
    dataset = pd.DataFrame(columns = columns_list)

dataset.info()

In [None]:
# Count the samples whose variable is named 'it' or 'iter'
named_it = dataset['Name'] == 'it'
named_iter = dataset['Name'] == 'iter'
print(len(dataset[named_it | named_iter]))

idx_to_delete = []

# Show every 'it' sample one by one; answering 'y' remembers its row index
for idx, row in dataset[named_it].iterrows():
    print(row['Code'], row['Category'])
    answer = input()
    if answer == 'y':
        idx_to_delete.append(idx)
    IPython.display.clear_output()

idx_to_delete

In [None]:
# How many samples there are for each variable name
print(dataset['Name'].value_counts())

In [None]:
# How many samples there are in each category
print(dataset['Category'].value_counts())