### original w/ count vectorizer

In [1]:
from sklearn.metrics import confusion_matrix
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.metrics import confusion_matrix, accuracy_score

def load_data(data_folder, refactoring_details_path):
    ref_details = pd.read_csv(refactoring_details_path)
    train_documents = []
    test_documents = []
    train_ids = []
    test_ids = []
    training_labels = {}
    testing_labels = {}

    # Initialize training labels for all refactoring types
    ref_types = [d.replace('_single', '') for d in os.listdir(data_folder) if '_single' in d]
    for ref_type in ref_types:
        training_labels[ref_type] = []
        testing_labels[ref_type] = []

    # Load training data from _single directories and assign labels
    for ref_type in os.listdir(data_folder):
        if '_single' in ref_type:
            ref_type_clean = ref_type.replace('_single', '')
            subfolder_path = os.path.join(data_folder, ref_type)
            for filename in os.listdir(subfolder_path):
                if filename.endswith('.md'):
                    file_path = os.path.join(subfolder_path, filename)
                    with open(file_path, 'r') as file:
                        content = file.read()
                        train_documents.append(content)
                        train_ids.append(filename.split('_')[0])
                        # Initialize labels for all refactoring types as 0
                        for key in training_labels:
                            training_labels[key].append(0)
                        # Set the current refactoring type label to 1
                        training_labels[ref_type_clean][-1] = 1

    # Load testing data from _mixed directories and assign labels based on CSV
    for ref_type in os.listdir(data_folder):
        if '_mixed' in ref_type:
            ref_type_clean = ref_type.replace('_mixed', '')
            subfolder_path = os.path.join(data_folder, ref_type)
            for filename in os.listdir(subfolder_path):
                if filename.endswith('.md'):
                    file_path = os.path.join(subfolder_path, filename)
                    with open(file_path, 'r') as file:
                        content = file.read()
                        test_documents.append(content)
                        file_id = filename.split('_')[0]
                        test_ids.append(file_id) 
                        # Initialize labels for all refactoring types as 0
                        for key in testing_labels:
                            testing_labels[key].append(0)
                        # Set labels based on occurrence counts from the CSV
                        for key in ref_types:
                            count = ref_details.loc[ref_details['id'] == int(file_id), key].values[0]
                            if count > 0:
                                testing_labels[key][-1] = 1

    return train_documents, train_ids, test_documents, test_ids, training_labels, testing_labels

def train_and_evaluate(train_docs, train_labels, test_docs, test_labels, path_save='analysis_results\\'):
    vectorizer=CountVectorizer()
    X_train = vectorizer.fit_transform(train_docs)
    X_test = vectorizer.transform(test_docs)
    classifiers = {}
    results = {}
    summary = {}
    
    for refactoring_type, labels in train_labels.items():
        clf = RandomForestClassifier()
        clf.fit(X_train, labels)
        classifiers[refactoring_type] = clf

        # Evaluate classifier
        predictions = clf.predict(X_test)
        true = np.array(test_labels[refactoring_type])
        cm = confusion_matrix(true, predictions)
        acc = accuracy_score(true, predictions)
        
        # Store results
        results[refactoring_type] = {
            'confusion_matrix': cm,
            'accuracy': acc
        }
        summary[refactoring_type] = [acc, *cm.ravel()]  # Flatten confusion matrix and prepend accuracy

    # Serialize results and summary
    with open(path_save + 'classification_summary.pkl', 'wb') as f:
        pickle.dump(summary, f)
    with open(path_save + 'classification_results.txt', 'w') as f:
        f.write(str(results))
    
    return classifiers, results

data_folder = "dataset_clean\\"
refactoring_details_path = 'analysis_results\\refactoring_details_post.csv'
train_docs, train_ids, test_docs, test_ids, train_labels, test_labels = load_data(data_folder, refactoring_details_path)
classifiers, results = train_and_evaluate(train_docs, train_labels, test_docs, test_labels)

print("Classification results saved successfully.")

print("Accuracy  and TN/TP/FN/FP for each refactoring type:")
for refactoring_type, result in results.items():
    print(f"{refactoring_type}: {round(result['accuracy'],3)} -- TN/TP/FN/FP : {result['confusion_matrix'][0][0]}/{result['confusion_matrix'][1][1]}/{result['confusion_matrix'][1][0]}/{result['confusion_matrix'][0][1]}")

Classification results saved successfully.
Accuracy  and TN/TP/FN/FP for each refactoring type:
Extract Interface: 0.881 -- TN/TP/FN/FP : 37/0/5/0
Extract Method: 0.405 -- TN/TP/FN/FP : 7/10/10/15
Extract Superclass: 0.857 -- TN/TP/FN/FP : 36/0/6/0
Inline Method: 0.881 -- TN/TP/FN/FP : 37/0/5/0
Move Attribute: 0.738 -- TN/TP/FN/FP : 31/0/11/0
Move Class: 0.548 -- TN/TP/FN/FP : 23/0/19/0
Move Method: 0.738 -- TN/TP/FN/FP : 31/0/11/0
Pull Up Method: 0.857 -- TN/TP/FN/FP : 36/0/6/0
Rename Package: 0.833 -- TN/TP/FN/FP : 35/0/7/0


### w/ action count (0.2)

In [1]:
import os
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

folder_path = 'data_diffs_clean\\'
refactoring_df = pd.read_csv('analysis_results\\refactoring_details_post.csv')
all_present_actions = ['delete-tree', 'insert-node', 'move-tree', 'delete-node', 'update-node', 'insert-tree']
refactorings = refactoring_df.columns[1:]

# Function to count the occurrences of each action in a file
def count_file_content(lines, actions):
    blocks = []
    current_block = []
    for line in lines:
        if line.strip() == '===':
            if current_block != []:
                blocks.append(current_block)
                current_block = []
        current_block.append(line)

    for block in blocks:
        action = block[1].strip()
        if action in actions:
            actions[action] += 1
    
    return actions

# Function to parse edit script files and count edit actions
def parse_edit_script(file_path):
    with open(file_path, 'r') as file:
        content = file.read()
    actions = {action: 0 for action in all_present_actions}
    action_counts = count_file_content(content.split('\n'), actions)
    return action_counts

# Creating a dataframe with action counts
edit_script_data = []
for filename in os.listdir(folder_path):
    if filename.endswith(".md"):
        file_path = os.path.join(folder_path, filename)
        action_counts = parse_edit_script(file_path)
        edit_script_data.append({'id': int(filename.split('_')[0]), **action_counts})

edit_script_df = pd.DataFrame(edit_script_data).fillna(0)

# Merge the two dataframes on 'id'
merged_df = pd.merge(edit_script_df, refactoring_df, on='id')

# Create a binary label for refactoring presence
for refactoring in refactorings:
    merged_df[f'{refactoring}_Label'] = merged_df[refactoring] > 0

# Train and evaluate a RandomForestClassifier for each refactoring
clfs = {}
cms = {}
accs = {}
summary = {}

for refactoring in refactorings:
    features = all_present_actions
    target = f'{refactoring}_Label'

    X = merged_df[features]
    y = merged_df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

    clf = RandomForestClassifier(n_estimators=100, random_state=17)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)

    if cm.shape == (1, 1):
        cm = np.array([[cm[0][0], 0], [0, 0]])

    clfs[refactoring] = clf
    cms[refactoring] = cm
    accs[refactoring] = acc
    summary[refactoring] = [acc, *cm.ravel()]

# Save the results
with open('cluster_results\\classification_accuracy.txt', 'w') as f:
    f.write(str(accs))
with open('cluster_results\\classification_summary.pkl', 'wb') as f:
    pickle.dump(summary, f)

print("Accuracy  and TN/TP/FN/FP for each refactoring type:")
for ref in accs:
    print(f"{ref}: {round(accs[ref],3)} -- TN/TP/FN/FP : {cms[ref][0][0]}/{cms[ref][1][1]}/{cms[ref][1][0]}/{cms[ref][0][1]}")

Accuracy  and TN/TP/FN/FP for each refactoring type:
Inline Method: 0.865 -- TN/TP/FN/FP : 32/0/5/0
Extract Method: 0.811 -- TN/TP/FN/FP : 11/19/3/4
Move Class: 0.865 -- TN/TP/FN/FP : 32/0/3/2
Extract Interface: 0.946 -- TN/TP/FN/FP : 35/0/2/0
Rename Package: 0.973 -- TN/TP/FN/FP : 36/0/1/0
Pull Up Method: 0.973 -- TN/TP/FN/FP : 36/0/1/0
Pull Up Attribute: 0.973 -- TN/TP/FN/FP : 36/0/1/0
Move Attribute: 0.973 -- TN/TP/FN/FP : 36/0/1/0
Move Method: 0.973 -- TN/TP/FN/FP : 36/0/1/0
Extract Superclass: 0.946 -- TN/TP/FN/FP : 35/0/2/0
Push Down Attribute: 0.973 -- TN/TP/FN/FP : 36/0/1/0
Push Down Method: 1.0 -- TN/TP/FN/FP : 37/0/0/0




### w/ action count balance y (0.2)

In [2]:
import os
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

folder_path = 'data_diffs_clean\\'
refactoring_df = pd.read_csv('analysis_results\\refactoring_details_post.csv')
all_present_actions = ['delete-tree', 'insert-node', 'move-tree', 'delete-node', 'update-node', 'insert-tree']
refactorings = refactoring_df.columns[1:]

# Function to count the occurrences of each action in a file
def count_file_content(lines, actions):
    blocks = []
    current_block = []
    for line in lines:
        if line.strip() == '===':
            if current_block != []:
                blocks.append(current_block)
                current_block = []
        current_block.append(line)

    for block in blocks:
        action = block[1].strip()
        if action in actions:
            actions[action] += 1
    
    return actions

# Function to parse edit script files and count edit actions
def parse_edit_script(file_path):
    with open(file_path, 'r') as file:
        content = file.read()
    actions = {action: 0 for action in all_present_actions}
    action_counts = count_file_content(content.split('\n'), actions)
    return action_counts

# Creating a dataframe with action counts
edit_script_data = []
for filename in os.listdir(folder_path):
    if filename.endswith(".md"):
        file_path = os.path.join(folder_path, filename)
        action_counts = parse_edit_script(file_path)
        edit_script_data.append({'id': int(filename.split('_')[0]), **action_counts})

edit_script_df = pd.DataFrame(edit_script_data).fillna(0)

# Merge the two dataframes on 'id'
merged_df = pd.merge(edit_script_df, refactoring_df, on='id')

# Create a binary label for refactoring presence
for refactoring in refactorings:
    merged_df[f'{refactoring}_Label'] = merged_df[refactoring] > 0

# Train and evaluate a RandomForestClassifier for each refactoring
clfs = {}
cms = {}
accs = {}
summary = {}

for refactoring in refactorings:
    features = all_present_actions
    target = f'{refactoring}_Label'

    X = merged_df[features]
    y = merged_df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17, stratify=y)

    clf = RandomForestClassifier(n_estimators=100, random_state=17)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)

    if cm.shape == (1, 1):
        cm = np.array([[cm[0][0], 0], [0, 0]])

    clfs[refactoring] = clf
    cms[refactoring] = cm
    accs[refactoring] = acc
    summary[refactoring] = [acc, *cm.ravel()]

# Save the results
with open('cluster_results\\classification_accuracy.txt', 'w') as f:
    f.write(str(accs))
with open('cluster_results\\classification_summary.pkl', 'wb') as f:
    pickle.dump(summary, f)

print("Accuracy  and TN/TP/FN/FP for each refactoring type:")
for ref in accs:
    print(f"{ref}: {round(accs[ref],3)} -- TN/TP/FN/FP : {cms[ref][0][0]}/{cms[ref][1][1]}/{cms[ref][1][0]}/{cms[ref][0][1]}")

Accuracy  and TN/TP/FN/FP for each refactoring type:
Inline Method: 0.919 -- TN/TP/FN/FP : 34/0/3/0
Extract Method: 0.757 -- TN/TP/FN/FP : 7/21/1/8
Move Class: 0.784 -- TN/TP/FN/FP : 29/0/5/3
Extract Interface: 0.973 -- TN/TP/FN/FP : 36/0/1/0
Rename Package: 1.0 -- TN/TP/FN/FP : 36/1/0/0
Pull Up Method: 0.946 -- TN/TP/FN/FP : 35/0/2/0
Pull Up Attribute: 0.973 -- TN/TP/FN/FP : 36/0/1/0
Move Attribute: 0.919 -- TN/TP/FN/FP : 34/0/2/1
Move Method: 0.946 -- TN/TP/FN/FP : 35/0/2/0
Extract Superclass: 0.946 -- TN/TP/FN/FP : 35/0/2/0
Push Down Attribute: 0.973 -- TN/TP/FN/FP : 36/0/1/0
Push Down Method: 1.0 -- TN/TP/FN/FP : 37/0/0/0




### w/ AC and start y (0.3)

In [24]:
import os
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

folder_path = 'data_diffs_clean\\'
refactoring_df = pd.read_csv('analysis_results\\refactoring_details_post.csv')
all_present_actions = ['delete-tree', 'insert-node', 'move-tree', 'delete-node', 'update-node', 'insert-tree']
refactorings = refactoring_df.columns[1:]

# Function to count the occurrences of each action in a file
def count_file_content(lines, actions):
    blocks = []
    current_block = []
    for line in lines:
        if line.strip() == '===':
            if current_block != []:
                blocks.append(current_block)
                current_block = []
        current_block.append(line)

    for block in blocks:
        action = block[1].strip()
        if action in actions:
            actions[action] += 1
    
    return actions

# Function to parse edit script files and count edit actions
def parse_edit_script(file_path):
    with open(file_path, 'r') as file:
        content = file.read()
    actions = {action: 0 for action in all_present_actions}
    action_counts = count_file_content(content.split('\n'), actions)
    return action_counts

# Creating a dataframe with action counts
edit_script_data = []
for filename in os.listdir(folder_path):
    if filename.endswith(".md"):
        file_path = os.path.join(folder_path, filename)
        action_counts = parse_edit_script(file_path)
        edit_script_data.append({'id': int(filename.split('_')[0]), **action_counts})

edit_script_df = pd.DataFrame(edit_script_data).fillna(0)

# Merge the two dataframes on 'id'
merged_df = pd.merge(edit_script_df, refactoring_df, on='id')

# Create a binary label for refactoring presence
for refactoring in refactorings:
    merged_df[f'{refactoring}_Label'] = merged_df[refactoring] > 0

# Train and evaluate a RandomForestClassifier for each refactoring
clfs = {}
cms = {}
accs = {}
summary = {}

for refactoring in refactorings:
    print(refactoring)
    features = all_present_actions
    target = f'{refactoring}_Label'

    X = merged_df[features]
    y = merged_df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=17, stratify=y)

    clf = RandomForestClassifier(n_estimators=100, random_state=17)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)

    if cm.shape == (1, 1):
        cm = np.array([[cm[0][0], 0], [0, 0]])

    clfs[refactoring] = clf
    cms[refactoring] = cm
    accs[refactoring] = acc
    summary[refactoring] = [acc, *cm.ravel()]

# Save the results
with open('cluster_results\\classification_accuracy.txt', 'w') as f:
    f.write(str(accs))
with open('cluster_results\\classification_summary.pkl', 'wb') as f:
    pickle.dump(summary, f)

print("Accuracy  and TN/TP/FN/FP for each refactoring type:")
for ref in accs:
    print(f"{ref}: {round(accs[ref],3)} -- TN/TP/FN/FP : {cms[ref][0][0]}/{cms[ref][1][1]}/{cms[ref][1][0]}/{cms[ref][0][1]}")

Inline Method
Extract Method
Move Class
Extract Interface
Rename Package
Pull Up Method
Pull Up Attribute
Move Attribute
Move Method
Extract Superclass
Push Down Attribute
Push Down Method
Accuracy  and TN/TP/FN/FP for each refactoring type:
Inline Method: 0.909 -- TN/TP/FN/FP : 50/0/5/0
Extract Method: 0.709 -- TN/TP/FN/FP : 11/28/4/12
Move Class: 0.8 -- TN/TP/FN/FP : 44/0/8/3
Extract Interface: 0.982 -- TN/TP/FN/FP : 54/0/1/0
Rename Package: 0.982 -- TN/TP/FN/FP : 53/1/1/0
Pull Up Method: 0.945 -- TN/TP/FN/FP : 52/0/2/1
Pull Up Attribute: 0.945 -- TN/TP/FN/FP : 52/0/2/1
Move Attribute: 0.927 -- TN/TP/FN/FP : 51/0/4/0
Move Method: 0.909 -- TN/TP/FN/FP : 49/1/2/3
Extract Superclass: 0.964 -- TN/TP/FN/FP : 53/0/2/0
Push Down Attribute: 0.982 -- TN/TP/FN/FP : 54/0/1/0
Push Down Method: 0.982 -- TN/TP/FN/FP : 54/0/1/0


### w/ SMOTE

In [18]:
import os
from imblearn.over_sampling import SMOTE
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score

folder_path = 'data_diffs_clean\\'
refactoring_df = pd.read_csv('analysis_results\\refactoring_details_post.csv')
all_present_actions = ['delete-tree', 'insert-node', 'move-tree', 'delete-node', 'update-node', 'insert-tree']
refactorings = refactoring_df.columns[1:]

# Function to count the occurrences of each action in a file
def count_file_content(lines, actions):
    blocks = []
    current_block = []
    for line in lines:
        if line.strip() == '===':
            if current_block != []:
                blocks.append(current_block)
                current_block = []
        current_block.append(line)

    for block in blocks:
        action = block[1].strip()
        if action in actions:
            actions[action] += 1
    
    return actions

# Function to parse edit script files and count edit actions
def parse_edit_script(file_path):
    with open(file_path, 'r') as file:
        content = file.read()
    actions = {action: 0 for action in all_present_actions}
    action_counts = count_file_content(content.split('\n'), actions)
    return action_counts

# Creating a dataframe with action counts
edit_script_data = []
for filename in os.listdir(folder_path):
    if filename.endswith(".md"):
        file_path = os.path.join(folder_path, filename)
        action_counts = parse_edit_script(file_path)
        edit_script_data.append({'id': int(filename.split('_')[0]), **action_counts})

edit_script_df = pd.DataFrame(edit_script_data).fillna(0)

# Merge the two dataframes on 'id'
merged_df = pd.merge(edit_script_df, refactoring_df, on='id')

# Create a binary label for refactoring presence
for refactoring in refactorings:
    merged_df[f'{refactoring}_Label'] = merged_df[refactoring] > 0

# Train and evaluate a RandomForestClassifier for each refactoring
clfs = {}
cms = {}
accs = {}
summary = {}

for refactoring in refactorings:
    features = all_present_actions
    target = f'{refactoring}_Label'

    X = merged_df[features]
    y = merged_df[target]

    # skip refactoring types with less than 10 samples
    if y.sum() < 10:
        print(f"Skipping {refactoring} due to low sample count")
        continue

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17, stratify=y)

    # Apply SMOTE to the training data
    smote = SMOTE(random_state=17)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

    clf = RandomForestClassifier(n_estimators=100, random_state=17)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)

    if cm.shape == (1, 1):
        cm = np.array([[cm[0][0], 0], [0, 0]])

    clfs[refactoring] = clf
    cms[refactoring] = cm
    accs[refactoring] = acc
    summary[refactoring] = [acc, *cm.ravel()]

# Save the results
with open('cluster_results\\classification_accuracy.txt', 'w') as f:
    f.write(str(accs))
with open('cluster_results\\classification_summary.pkl', 'wb') as f:
    pickle.dump(summary, f)

print("\nAccuracy  and TN/TP/FN/FP for each refactoring type:")
for ref in accs:
    print(f"{ref}: {round(accs[ref],3)} -- TN/TP/FN/FP : {cms[ref][0][0]}/{cms[ref][1][1]}/{cms[ref][1][0]}/{cms[ref][0][1]}")

Skipping Extract Interface due to low sample count
Skipping Rename Package due to low sample count
Skipping Pull Up Method due to low sample count
Skipping Pull Up Attribute due to low sample count
Skipping Extract Superclass due to low sample count
Skipping Push Down Attribute due to low sample count
Skipping Push Down Method due to low sample count

Accuracy  and TN/TP/FN/FP for each refactoring type:
Inline Method: 0.913 -- TN/TP/FN/FP : 42/0/4/0
Extract Method: 0.717 -- TN/TP/FN/FP : 8/25/2/11
Move Class: 0.783 -- TN/TP/FN/FP : 36/0/7/3
Move Attribute: 0.935 -- TN/TP/FN/FP : 43/0/3/0
Move Method: 0.913 -- TN/TP/FN/FP : 41/1/2/2


### w/ new features

In [23]:
import os
from imblearn.over_sampling import SMOTE
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score

folder_path = 'data_diffs_clean\\'
refactoring_df = pd.read_csv('analysis_results\\refactoring_details_post.csv')
all_present_actions = ['delete-tree', 'insert-node', 'move-tree', 'delete-node', 'update-node', 'insert-tree']
refactorings = refactoring_df.columns[1:]

# Function to count the occurrences of each action in a file
def count_file_content(lines, actions):
    blocks = []
    current_block = []
    for line in lines:
        if line.strip() == '===':
            if current_block != []:
                blocks.append(current_block)
                current_block = []
        current_block.append(line)

    for block in blocks:
        action = block[1].strip()
        if action in actions:
            actions[action] += 1
    
    return actions

# Function to parse edit script files and count edit actions
def parse_edit_script(file_path):
    with open(file_path, 'r') as file:
        content = file.read()
    actions = {action: 0 for action in all_present_actions}
    action_counts = count_file_content(content.split('\n'), actions)
    return action_counts

# Creating a dataframe with action counts
edit_script_data = []
for filename in os.listdir(folder_path):
    if filename.endswith(".md"):
        file_path = os.path.join(folder_path, filename)
        action_counts = parse_edit_script(file_path)
        edit_script_data.append({'id': int(filename.split('_')[0]), **action_counts})

edit_script_df = pd.DataFrame(edit_script_data).fillna(0)

edit_script_df['ratio-node'] = 1/(edit_script_df['insert-node'] / edit_script_df['delete-node'])
edit_script_df['ratio-tree'] = 1/(edit_script_df['insert-tree'] / edit_script_df['delete-tree'])
edit_script_df = edit_script_df.replace([np.inf, -np.inf], np.nan).fillna(0)

# Merge the two dataframes on 'id'
merged_df = pd.merge(edit_script_df, refactoring_df, on='id')

# Create a binary label for refactoring presence
for refactoring in refactorings:
    merged_df[f'{refactoring}_Label'] = merged_df[refactoring] > 0

# Train and evaluate a RandomForestClassifier for each refactoring
clfs = {}
cms = {}
accs = {}
summary = {}

for refactoring in refactorings:
    features = all_present_actions + ['ratio-node', 'ratio-tree']
    target = f'{refactoring}_Label'

    X = merged_df[features]
    y = merged_df[target]

    # skip refactoring types with less than 10 samples
    if y.sum() < 10:
        print(f"Skipping {refactoring} due to low sample count")
        continue

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17, stratify=y, test_size=0.4)

    # Apply SMOTE to the training data
    smote = SMOTE(random_state=17)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

    clf = RandomForestClassifier(n_estimators=100, random_state=17)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)

    if cm.shape == (1, 1):
        cm = np.array([[cm[0][0], 0], [0, 0]])

    clfs[refactoring] = clf
    cms[refactoring] = cm
    accs[refactoring] = acc
    summary[refactoring] = [acc, *cm.ravel()]

# Save the results
with open('cluster_results\\classification_accuracy.txt', 'w') as f:
    f.write(str(accs))
with open('cluster_results\\classification_summary.pkl', 'wb') as f:
    pickle.dump(summary, f)

print("\nAccuracy  and TN/TP/FN/FP for each refactoring type:")
for ref in accs:
    print(f"{ref}: {round(accs[ref],3)} -- TN/TP/FN/FP : {cms[ref][0][0]}/{cms[ref][1][1]}/{cms[ref][1][0]}/{cms[ref][0][1]}")

Skipping Extract Interface due to low sample count
Skipping Rename Package due to low sample count
Skipping Pull Up Method due to low sample count
Skipping Pull Up Attribute due to low sample count
Skipping Extract Superclass due to low sample count
Skipping Push Down Attribute due to low sample count
Skipping Push Down Method due to low sample count

Accuracy  and TN/TP/FN/FP for each refactoring type:
Inline Method: 0.918 -- TN/TP/FN/FP : 67/0/6/0
Extract Method: 0.658 -- TN/TP/FN/FP : 12/36/7/18
Move Class: 0.753 -- TN/TP/FN/FP : 55/0/11/7
Move Attribute: 0.932 -- TN/TP/FN/FP : 68/0/5/0
Move Method: 0.904 -- TN/TP/FN/FP : 65/1/3/4


In [None]:
param_grids = {
    'RandomForest': {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]},
    'SVM': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'LogisticRegression': {'C': [0.1, 1, 10]}
}