# Splitting Data into Training/Test Sets and Data Tokenization

In [1]:
import os
import glob
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm
import shutil

In [2]:
# Notebook configuration: what to process and where results are written.
smell = "UnutilizedAbstraction"   # sub-folder of the dataset to process
dim = "2d"                        # dimension tag
output_dir = "train_dev_split"    # folder below "data" that receives the split copies
input_path = os.path.join("data", "smell_dataset")  # root folder of the raw dataset

In [3]:
# Collect the sample files of each class.
# glob yields paths in arbitrary (filesystem-dependent) order, so the lists are
# sorted: the seeded train/test split below then selects the same files on
# every machine and every run.
pos = sorted(glob.glob(os.path.join(input_path, smell, "Positive", "*.code")))
neg = sorted(glob.glob(os.path.join(input_path, smell, "Negative", "*.code")))

In [4]:
# Sanity check of the class balance, displayed as (positive count, negative count).
len(pos), len(neg)

(1, 6599)

In [5]:
# Pair every path with its class label: 1 for files from the Positive folder,
# 0 for files from the Negative folder.
# NOTE: both names are rebound in place, so re-running this cell without first
# re-running the glob cell wraps the tuples a second time.
pos = list(zip(pos, [1] * len(pos)))
neg = list(zip(neg, [0] * len(neg)))

In [6]:
# Single list of (path, label) pairs: positives first, then negatives.
samples = [*pos, *neg]

In [7]:
# One row per file: "sample" holds the path, "label" holds the 0/1 class.
df = pd.DataFrame(samples, columns=["sample", "label"])

In [8]:
# Inputs are the file paths, targets are the class labels.
X = df["sample"]
y = df["label"]

In [9]:
# Hold out 33% of the samples for testing. The fixed random_state makes the
# shuffle repeatable. No `stratify` argument is passed, so the class ratio is
# not enforced in either part.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [10]:
# Put paths and labels of the training part back side by side in one frame.
train = pd.concat([X_train, y_train], axis="columns")

In [11]:
# Destination folders for the training split, one per class:
# data/<output_dir>/<smell>/train/<Positive|Negative>
train_dst_folder_pos, train_dst_folder_neg = (
    os.path.join("data", output_dir, smell, "train", class_name)
    for class_name in ("Positive", "Negative")
)

In [13]:
def create_folders_if_not_exists(folder_path):
    """Create ``folder_path`` (including missing parents) unless it already exists.

    Args:
        folder_path: Path of the directory to create.

    Returns:
        None.

    Raises:
        FileExistsError: If ``folder_path`` exists but is not a directory.
    """
    # exist_ok=True replaces the former "check, then create" sequence. It avoids
    # the race between the check and the creation, and it no longer silently
    # accepts a regular file sitting at folder_path.
    os.makedirs(folder_path, exist_ok=True)

In [14]:
# Make sure both training destinations exist before any file is copied.
for dst_folder in (train_dst_folder_pos, train_dst_folder_neg):
    create_folders_if_not_exists(dst_folder)

In [15]:
# Copy every training sample into the folder that matches its label.
for _, sample, label in tqdm(train.itertuples()):
    # Label 0 goes to Negative; any other label goes to Positive.
    target_folder = train_dst_folder_neg if label == 0 else train_dst_folder_pos
    shutil.copy2(sample, target_folder)

4422it [00:01, 3154.32it/s]


In [16]:
# Destination folders for the held-out split, one per class:
# data/<output_dir>/<smell>/test/<Positive|Negative>
test_dst_folder_pos, test_dst_folder_neg = (
    os.path.join("data", output_dir, smell, "test", class_name)
    for class_name in ("Positive", "Negative")
)
# Both folders must exist before any file is copied into them.
for dst_folder in (test_dst_folder_pos, test_dst_folder_neg):
    create_folders_if_not_exists(dst_folder)

In [17]:
# Put paths and labels of the held-out part back side by side in one frame.
test = pd.concat([X_test, y_test], axis="columns")

In [18]:
# Copy every held-out sample into the folder that matches its label.
for _, sample, label in tqdm(test.itertuples()):
    # Label 0 goes to Negative; any other label goes to Positive.
    target_folder = test_dst_folder_neg if label == 0 else test_dst_folder_pos
    shutil.copy2(sample, target_folder)

2178it [00:00, 3444.86it/s]


In [19]:
from tokenizer import run_tokenizer

In [20]:
# Tokenizer configuration.
# Tokenized output for the selected smell is written below this folder.
out_base_path = os.path.join("data", "train_test_dev_split_tokenized", smell)
# The tokenizer reads the files produced by the split step, so the path is
# derived from `output_dir` rather than repeating the folder name. Changing
# `output_dir` can then no longer leave the two steps pointing at different
# folders.
tokenizer_input_base_path = os.path.join('data', output_dir)
# Location of the tokenizer executable handed to run_tokenizer.
tokenizer_exe_path = os.path.join('apps', 'tokenizer', 'tokenizer')
# Splits to tokenize; each name is a sub-folder created by the split step.
splits = ["train", "test"]

In [21]:
# Create the root folder for tokenized output before the per-split folders are added.
create_folders_if_not_exists(out_base_path)

In [22]:
# Tokenize the Positive and Negative folders of every split with the external tokenizer.
tokenizer_level = "file"  # passed through to run_tokenizer unchanged
dim_str = dim             # reuse the configured dimension instead of repeating the literal

# Status line printed before each class folder is processed. The Negative text
# says "training" for every split; it is kept verbatim so the log output does
# not change.
class_messages = (
    ("Positive", "\t processing positive cases..."),
    ("Negative", "\t processing negative training cases..."),
)

for split in splits:
    output_path = os.path.join(out_base_path, split)
    create_folders_if_not_exists(output_path)

    folder = output_path  # alias retained from the original cell; not read in this cell

    print("Processing {0} split for dimension {1}".format(split, dim_str))

    for class_name, message in class_messages:
        print(message)
        # Input:  <tokenizer_input_base_path>/<smell>/<split>/<class>
        # Output: <out_base_path>/<split>/<dim_str>/<class>
        cur_folder = os.path.join(tokenizer_input_base_path, smell, split, class_name)
        out_folder = os.path.join(output_path, dim_str, class_name)
        run_tokenizer(cur_folder, out_folder, tokenizer_exe_path, 'Java', tokenizer_level)
print("Tokenizing done.")

Processing train split for dimension 2d
	 processing positive cases...
		processing atmosphere_atmosphere_atmosphere_atmosphere_org.atmosphere.voidClass0VoidClass.code
	 processing negative training cases...
		processing apache_jmeter_apache_jmeter_org.apache.jmeter.util1563JSR223BeanInfoSupport.code
		processing robolectric_robolectric_robolectric_robolectric_org.robolectric.res2807AttributeResource.code
		processing robolectric_robolectric_robolectric_robolectric_org.robolectric.annotation.processing.shadows475ShadowRealObjectWithEmptyImplements.code
		processing robolectric_robolectric_robolectric_robolectric_org.robolectric.android2887ShadowCustomPaint.code
		processing eclipse-vertx_vert.x_eclipse-vertx_vert.x_io.vertx.core.net.endpoint89ServerInteraction.code
		processing robolectric_robolectric_robolectric_robolectric_org.robolectric.shadows1063ShadowTime.code
		processing robolectric_robolectric_robolectric_robolectric_org.robolectric.shadows2162SQLiteStatementTest.code
		proce