# Scopes extraction

Because the initial parser is imperfect, it fails to extract the exact scope of a variable: the extracted snippet always covers a larger scope than necessary, which could make the model's learning process harder.

This notebook takes the initial dataset and extracts the exact variable scope(s) from each dataset entry. It may also find several scopes in one code snippet when different variables share the same name. Examples are shown later in the notebook.

## Initialization stage

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import IPython

# Path of the input CSV that is loaded into `df` in the next cell.
dataset_name = 'parser_dataset.csv'
# Path of the output CSV that receives the post-processed rows at the end of the notebook.
postprocessed_dataset_name = 'postprocessed_dataset.csv'

In [2]:
# Columns every dataset entry must have, in this exact order.
columns_list = ['Name', 'Code', 'Category']

try:
    df = pd.read_csv(dataset_name)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing or completely empty file falls back to an empty dataset;
    # any other failure (malformed CSV, permissions, ...) is a real problem
    # and must not be silently replaced by an empty frame.
    df = pd.DataFrame(columns = columns_list)

# Compare as plain lists: an element-wise Index comparison raises an unrelated
# "Lengths must match" error when the number of columns differs.
assert list(df.columns) == columns_list, \
    f'Unexpected dataset columns {list(df.columns)}, expected {columns_list}'

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2903 entries, 0 to 2902
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Name      2903 non-null   object
 1   Code      2903 non-null   object
 2   Category  2903 non-null   object
dtypes: object(3)
memory usage: 68.2+ KB


## Scopes extraction function

In [3]:
def extract_scopes(row):
    '''
        Extracts exact scope(s) of variable in provided dataframe entry

        The snippet is scanned once from left to right while tracking the
        nesting of curly braces. Comments, string literals and character
        literals are skipped, so braces or names inside them are ignored.

        A scope is reported in the following situations:
          * the name occurs inside a `{ ... }` block: the innermost block
            around the first occurrence is reported (if that block is the
            body of a `for` statement, the text starts at the `for` keyword);
          * the name occurs in the header of a `for` statement: the whole
            statement is reported, either up to the closing `}` of its body
            or up to the first `;` after the header for a brace-less body.
        After a scope is closed the search starts again, so one snippet may
        yield several scopes. A name that occurs outside of every brace, or
        whose block is never closed (truncated snippet), yields no scope.

        Parameters:
            row - indexable entry; row[0] is the variable name, row[1] is the
                  code snippet, row[2] is the category

        Returns dict with the dataframe columns and new dataframe entry(ies)
    '''
    name = row[0]
    code = row[1]
    code_len = len(code)

    def is_ident_char(ch):
        # '_' and '$' may be part of an identifier in the (Java) snippets,
        # so they must not be treated as word boundaries.
        return ch.isalnum() or ch in '_$'

    def is_word_at(key, pos):
        '''True when `key` occurs at `pos` as a whole identifier/keyword.'''
        end = pos + len(key)
        if code[pos:end] != key:
            return False

        return (pos == 0 or not is_ident_char(code[pos - 1])) and \
               (end >= code_len or not is_ident_char(code[end]))

    def skip_literal(pos):
        '''
            If a comment, string literal or char literal starts at `pos`,
            returns the index of its last character (-1 when it is not
            terminated). Otherwise returns `pos` unchanged.
        '''
        two_chars = code[pos:pos + 2]
        if two_chars == '/*':
            end = code.find('*/', pos + 2)
            return end + 1 if end != -1 else -1
        if two_chars == '//':
            # Ends on the newline; -1 means the comment runs to the end of input.
            return code.find('\n', pos + 2)

        quote = code[pos]
        if quote == '"' or quote == "'":
            cur = pos + 1
            while cur < code_len:
                if code[cur] == '\\':
                    # Escaped character (e.g. \" or \') never closes the literal.
                    cur += 2
                    continue
                if code[cur] == quote:
                    return cur
                cur += 1
            return -1

        return pos

    def scan_for_header(pos):
        '''
            Scans a `for` header starting right after its opening '('.

            Returns (index just past the matching ')', whether the name was
            seen in the header) or None when the header is never closed.
        '''
        depth = 1
        name_seen = False
        while pos < code_len:
            end = skip_literal(pos)
            if end == -1:
                return None
            if end != pos:
                pos = end + 1
                continue

            ch = code[pos]
            if ch == '(':
                depth += 1
            elif ch == ')':
                depth -= 1
                if depth == 0:
                    return pos + 1, name_seen
            elif not name_seen and is_word_at(name, pos):
                name_seen = True
            pos += 1

        return None

    scopes = list()
    open_pos = []        # start position of every currently open block
    scope_depth = None   # brace depth of the block whose end closes the current scope
    pending_for = None   # position of a `for` keyword whose body '{' was not reached yet

    idx = 0
    while idx < code_len:
        end = skip_literal(idx)
        if end == -1:
            # Unterminated comment/literal: nothing meaningful can follow.
            break
        if end != idx:
            idx = end + 1
            continue

        ch = code[idx]
        if ch == '{':
            # The body of a `for` is recorded as starting at the `for` keyword.
            open_pos.append(idx if pending_for is None else pending_for)
            pending_for = None

        elif ch == '}':
            # An unbalanced '}' (nothing open) is ignored.
            if open_pos:
                if scope_depth == len(open_pos):
                    scopes.append(code[open_pos[-1] : idx + 1])
                    scope_depth = None
                open_pos.pop()

        elif ch == ';':
            # A brace-less `for` statement ends here; a later '{' is unrelated to it.
            pending_for = None

        elif scope_depth is None and is_word_at('for', idx):
            paren = code.find('(', idx)
            if paren != -1:
                header = scan_for_header(paren + 1)
                if header is None:
                    # Truncated header: the rest of the snippet is unusable.
                    break

                for_start = idx
                idx, name_in_header = header

                if not name_in_header:
                    pending_for = for_start
                    # Do not advance: the character right after ')' may be '{'.
                    continue

                # The variable belongs to the `for` statement itself:
                # look for the start of its body or its terminating ';'.
                while idx < code_len and code[idx] not in '{;':
                    end = skip_literal(idx)
                    idx = code_len if end == -1 else end + 1
                if idx >= code_len:
                    break

                if code[idx] == ';':
                    scopes.append(code[for_start : idx + 1])
                    idx += 1
                else:
                    # Let the main loop open the body block, starting at `for`.
                    pending_for = for_start
                    scope_depth = len(open_pos) + 1
                continue

        elif scope_depth is None and is_word_at(name, idx):
            scope_depth = len(open_pos)

        idx += 1

    elems = len(scopes)
    return {'Name': [name]*elems,
            'Code': scopes,
            'Category': [row[2]]*elems}
    

### Examples of the function at work

In [4]:
def print_example(idx):
    '''
        Prints the code snippet of dataset entry `idx` followed by every
        scope that `extract_scopes` returns for that entry
    '''
    entry = df_to_np[idx]

    # Header part: the untouched snippet and a wide separator.
    print('Initial code snippet', '', entry[1], '-'*50, 'Function return', '', sep='\n')

    # One extracted scope per section, each closed by a short separator.
    for scope in extract_scopes(entry)['Code']:
        print(scope, '-'*30, sep='\n')

In [5]:
# Plain array of rows (in `columns_list` column order) that the examples index into.
df_to_np = df.loc[:, columns_list].to_numpy()

For a typical piece of code

In [6]:
print_example(1)

Initial code snippet

{
        staff = staff.stream().map(w -> {
            for (Staff updatedW : updatedStaff) {
                if (updatedW.getPersonalInformation().getId() == w.getPersonalInformation().getId()) {
                    return updatedW;
                }
            }
            return w;
        }).collect(Collectors.toCollection(ArrayList::new));
    }
--------------------------------------------------
Function return

for (Staff updatedW : updatedStaff) {
                if (updatedW.getPersonalInformation().getId() == w.getPersonalInformation().getId()) {
                    return updatedW;
                }
            }
------------------------------


Several variable scopes in one code snippet

In [7]:
print_example(0)

Initial code snippet

{
        ArrayList<Room> hotelRooms = new ArrayList<>();
        for (int i = 0; i < 15; i++) {
            hotelRooms.add(new HostelRoom(new StandardRoom(i + 1, 2, RoomType.ECONOMY)));
        }
        for (int i = 0; i < 15; i++) {
            hotelRooms.add(new ExpandableRoom(new StandardRoom(i + 16, 3, RoomType.LUX)));
        }
        Hotel hotel = new Hotel(hotelRooms);
        RoomManager roomManager = new RoomManager();
        StaffManager staffManager = new StaffManager();
        Human client1 = new Human("Kopeikina", "Anna");
        Human client2 = new Human("Tyulebaeva", "Karina");
        Human client3 = new Human("Domrachev", "Ivan");

        Human slave1 = new Human("Alentev", "Igor");

        Human slave2 = new Human("Asatullaev", "Maruf");

        staffManager.hireStaff(hotel, new PlumberCreator().createStaff(slave1, StaffType.IRREGULAR));
        staffManager.hireStaff(hotel, new SecurityGuardCreator().createStaff(slave2, StaffType.IRREGU

For a `while` loop

In [8]:
print_example(16)

Initial code snippet

{

        flowable = Flowable.range(1, 1000 * 1000).takeUntil(Flowable.fromCallable(new Callable<Object>() {
            @Override
            public Object call() {
                int c = count;
                while (items < c) { }
                return 1;
            }
        }).subscribeOn(Schedulers.single()));

        observable = Observable.range(1, 1000 * 1000).takeUntil(Observable.fromCallable(new Callable<Object>() {
            @Override
            public Object call() {
                int c = count;
                while (items < c) { }
                return 1;
            }
        }).subscribeOn(Schedulers.single()));
    }
--------------------------------------------------
Function return

{
                int c = count;
                while (items < c) { }
                return 1;
            }
------------------------------
{
                int c = count;
                while (items < c) { }
                return 1;
            }
---

## Function execution

In [9]:
# Rebuild the row array from the dataframe and run the extraction on every row.
df_to_np = df.loc[:, columns_list].to_numpy()
function_results = list(map(extract_scopes, df_to_np))

In [10]:
# Concatenate the per-row result lists column by column, keeping row order.
parsed_df_dict = {
    column: [value for result in function_results for value in result[column]]
    for column in columns_list
}

parsed_df = pd.DataFrame(parsed_df_dict)
parsed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3371 entries, 0 to 3370
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Name      3371 non-null   object
 1   Code      3371 non-null   object
 2   Category  3371 non-null   object
dtypes: object(3)
memory usage: 79.1+ KB


## Saving to file

In [11]:
# Write the post-processed rows without the dataframe index column.
parsed_df.to_csv(path_or_buf=postprocessed_dataset_name, index=False)