In [1]:
from sklearn.metrics import confusion_matrix
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.metrics import confusion_matrix, accuracy_score

def load_data(data_folder, refactoring_details_path):
    ref_details = pd.read_csv(refactoring_details_path)
    train_documents = []
    test_documents = []
    train_ids = []
    test_ids = []
    training_labels = {}
    testing_labels = {}

    # Initialize training labels for all refactoring types
    ref_types = [d.replace('_single', '') for d in os.listdir(data_folder) if '_single' in d]
    for ref_type in ref_types:
        training_labels[ref_type] = []
        testing_labels[ref_type] = []

    # Load training data from _single directories and assign labels
    for ref_type in os.listdir(data_folder):
        if '_single' in ref_type:
            ref_type_clean = ref_type.replace('_single', '')
            subfolder_path = os.path.join(data_folder, ref_type)
            for filename in os.listdir(subfolder_path):
                if filename.endswith('.md'):
                    file_path = os.path.join(subfolder_path, filename)
                    with open(file_path, 'r') as file:
                        content = file.read()
                        train_documents.append(content)
                        train_ids.append(filename.split('_')[0])
                        # Initialize labels for all refactoring types as 0
                        for key in training_labels:
                            training_labels[key].append(0)
                        # Set the current refactoring type label to 1
                        training_labels[ref_type_clean][-1] = 1

    # Load testing data from _mixed directories and assign labels based on CSV
    for ref_type in os.listdir(data_folder):
        if '_mixed' in ref_type:
            ref_type_clean = ref_type.replace('_mixed', '')
            subfolder_path = os.path.join(data_folder, ref_type)
            for filename in os.listdir(subfolder_path):
                if filename.endswith('.md'):
                    file_path = os.path.join(subfolder_path, filename)
                    with open(file_path, 'r') as file:
                        content = file.read()
                        test_documents.append(content)
                        file_id = filename.split('_')[0]
                        test_ids.append(file_id) 
                        # Initialize labels for all refactoring types as 0
                        for key in testing_labels:
                            testing_labels[key].append(0)
                        # Set labels based on occurrence counts from the CSV
                        for key in ref_types:
                            count = ref_details.loc[ref_details['id'] == int(file_id), key].values[0]
                            if count > 0:
                                testing_labels[key][-1] = 1

    return train_documents, train_ids, test_documents, test_ids, training_labels, testing_labels

def train_and_evaluate(train_docs, train_labels, test_docs, test_labels, path_save='analysis_results\\'):
    vectorizer=CountVectorizer()
    X_train = vectorizer.fit_transform(train_docs)
    X_test = vectorizer.transform(test_docs)
    classifiers = {}
    results = {}
    summary = {}
    
    for refactoring_type, labels in train_labels.items():
        clf = RandomForestClassifier()
        clf.fit(X_train, labels)
        classifiers[refactoring_type] = clf

        # Evaluate classifier
        predictions = clf.predict(X_test)
        true = np.array(test_labels[refactoring_type])
        cm = confusion_matrix(true, predictions)
        acc = accuracy_score(true, predictions)
        
        # Store results
        results[refactoring_type] = {
            'confusion_matrix': cm,
            'accuracy': acc
        }
        summary[refactoring_type] = [acc, *cm.ravel()]  # Flatten confusion matrix and prepend accuracy

    # Serialize results and summary
    with open(path_save + 'classification_summary.pkl', 'wb') as f:
        pickle.dump(summary, f)
    with open(path_save + 'classification_results.txt', 'w') as f:
        f.write(str(results))
    
    return classifiers, results

data_folder = "dataset_clean\\"
refactoring_details_path = 'analysis_results\\refactoring_details_post.csv'
train_docs, train_ids, test_docs, test_ids, train_labels, test_labels = load_data(data_folder, refactoring_details_path)
classifiers, results = train_and_evaluate(train_docs, train_labels, test_docs, test_labels)

print("Classification results saved successfully.")

print("Accuracy  and TN/TP/FN/FP for each refactoring type:")
for refactoring_type, result in results.items():
    print(f"{refactoring_type}: {round(result['accuracy'],3)} -- TN/TP/FN/FP : {result['confusion_matrix'][0][0]}/{result['confusion_matrix'][1][1]}/{result['confusion_matrix'][1][0]}/{result['confusion_matrix'][0][1]}")

Classification results saved successfully.
Accuracy  and TN/TP/FN/FP for each refactoring type:
Extract Interface: 0.881 -- TN/TP/FN/FP : 37/0/5/0
Extract Method: 0.405 -- TN/TP/FN/FP : 7/10/10/15
Extract Superclass: 0.857 -- TN/TP/FN/FP : 36/0/6/0
Inline Method: 0.881 -- TN/TP/FN/FP : 37/0/5/0
Move Attribute: 0.738 -- TN/TP/FN/FP : 31/0/11/0
Move Class: 0.548 -- TN/TP/FN/FP : 23/0/19/0
Move Method: 0.738 -- TN/TP/FN/FP : 31/0/11/0
Pull Up Method: 0.857 -- TN/TP/FN/FP : 36/0/6/0
Rename Package: 0.833 -- TN/TP/FN/FP : 35/0/7/0


In [15]:
folder_files = 'data_diffs_clean\\'
ref_details = pd.read_csv('analysis_results\\refactoring_details_post.csv')
all_present_actions = ['delete-tree', 'insert-node', 'move-tree', 'delete-node', 'update-node', 'insert-tree']
ref_details.head()

Unnamed: 0,id,Inline Method,Extract Method,Move Class,Extract Interface,Rename Package,Pull Up Method,Pull Up Attribute,Move Attribute,Move Method,Extract Superclass,Push Down Attribute,Push Down Method
0,1105075,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1105947,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1106350,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1117151,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1116977,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
import os
import pandas as pd
from collections import Counter

# Path to the folder containing edit script files
folder_path = folder_files

def count_file_content(lines, actions):
    blocks = []
    current_block = []
    for line in lines:
        if line.strip() == '===':
            if current_block != []:
                blocks.append(current_block)
                current_block = []
        current_block.append(line)

    for block in blocks:
        action = block[1].strip()
        if action in actions:
            actions[action] += 1
    
    return actions

# Function to parse edit script files and count edit actions
def parse_edit_script(file_path):
    with open(file_path, 'r') as file:
        content = file.read()
    actions = {action: 0 for action in all_present_actions}
    action_counts = count_file_content(content.split('\n'), actions)
    return action_counts

# Creating a dataframe with action counts
edit_script_data = []
for filename in os.listdir(folder_path):
    if filename.endswith(".md"):
        file_path = os.path.join(folder_path, filename)
        action_counts = parse_edit_script(file_path)
        edit_script_data.append({'id': int(filename.split('_')[0]), **action_counts})

edit_script_df = pd.DataFrame(edit_script_data).fillna(0)
edit_script_df.head()

Unnamed: 0,id,delete-tree,insert-node,move-tree,delete-node,update-node,insert-tree
0,1100435,12,82,37,62,16,23
1,1100663,0,9,4,0,0,1
2,1100842,0,16,10,7,3,11
3,1100856,36,124,59,68,33,55
4,1100868,23,94,40,76,40,39


In [17]:
# Load the existing dataframe with refactoring counts
refactoring_df = ref_details

# Merge the two dataframes on 'id'
merged_df = pd.merge(edit_script_df, refactoring_df, on='id')

# Create a binary label for 'Extract Method' presence
merged_df['Extract_Method_Label'] = merged_df['Extract Method'] > 0
merged_df.head()

Unnamed: 0,id,delete-tree,insert-node,move-tree,delete-node,update-node,insert-tree,Inline Method,Extract Method,Move Class,Extract Interface,Rename Package,Pull Up Method,Pull Up Attribute,Move Attribute,Move Method,Extract Superclass,Push Down Attribute,Push Down Method,Extract_Method_Label
0,1100435,12,82,37,62,16,23,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
1,1100663,0,9,4,0,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
2,1100842,0,16,10,7,3,11,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
3,1100856,36,124,59,68,33,55,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
4,1100868,23,94,40,76,40,39,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True


In [64]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Define feature columns and target column
features = ['delete-tree', 'insert-node', 'move-tree', 'delete-node', 'update-node', 'insert-tree']
target = 'Extract_Method_Label'

# Split the data
X = merged_df[features]
y = merged_df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

# Train a RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=17)
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8108108108108109


In [63]:
top_10_acc = [0 for i in range(10)]
top_10_random = [0 for i in range(10)]
for random_state in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    if acc > min(top_10_acc):
        min_index = top_10_acc.index(min(top_10_acc))
        top_10_acc[min_index] = acc
        top_10_random[min_index] = random_state
print(top_10_acc)
print(top_10_random)

[0.8108108108108109, 0.8108108108108109, 0.8108108108108109, 0.8108108108108109, 0.8108108108108109, 0.8108108108108109, 0.8108108108108109, 0.8108108108108109, 0.8378378378378378, 0.8108108108108109]
[17, 107, 258, 295, 129, 311, 408, 242, 507, 98]
