In [3]:
# !pip install nltk
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer

# Directory where the edit scripts are stored
folder_path = 'dataset\\Extract Method_single\\'

# Step 1: Load and preprocess data
def load_scripts(folder_path):
    scripts = []
    file_names = []
    tokenizer = RegexpTokenizer(r'\w+')
    
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):  # Assuming the scripts are in text format
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r') as file:
                content = file.read()
                tokens = tokenizer.tokenize(content)
                scripts.append(' '.join(tokens))
                file_names.append(filename)
    
    return scripts, file_names

# Step 2 and 3: Extract features and split data
def extract_features(scripts):
    vectorizer = CountVectorizer(max_features=1000)  
    X = vectorizer.fit_transform(scripts)
    return X, vectorizer.get_feature_names()

# Step 4: Identify MEAs based on the training set
def identify_meas(features, feature_names):
    # Simple frequency-based approach to identify MEAs
    feature_sums = np.sum(features, axis=0)
    sorted_indices = np.argsort(feature_sums)[::-1]
    top_features = [feature_names[i] for i in sorted_indices[:10]]  # Top 10 MEAs
    return top_features

# Step 5: Test the MEAs on the test set
def test_meas(features, meas, feature_names):
    mea_indices = [feature_names.index(mea) for mea in meas if mea in feature_names]
    test_results = features[:, mea_indices].toarray()
    presence = np.sum(test_results, axis=1) > 0
    return np.mean(presence)

# Main function to run the script
def main():
    scripts, file_names = load_scripts(folder_path)
    features, feature_names = extract_features(scripts)
    X_train, X_test, _, _ = train_test_split(features, scripts, test_size=0.2, random_state=42)
    
    meas = identify_meas(X_train, feature_names)
    effectiveness = test_meas(X_test, meas, feature_names)
    
    print("Identified MEAs:", meas)
    print("Effectiveness on Test Set:", effectiveness)


main()

Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk
Successfully installed nltk-3.8.1


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [4]:
import os
import difflib

def parse_diff_files(path):
    edits = []
    with open(path, 'r') as file:
        lines = file.readlines()
    # This is a very basic parser, consider using GumTree or similar for real use
    for line in lines:
        if line.startswith('+'):
            edits.append(('insert', line[1:].strip()))
        elif line.startswith('-'):
            edits.append(('delete', line[1:].strip()))
    return edits

def extract_features(edit_scripts):
    # This function should implement the clustering logic to find MEAs
    # Placeholder for the actual clustering and feature extraction logic
    features = set()
    for script in edit_scripts:
        for edit in script:
            features.add(edit)
    return features

def check_features_in_scripts(features, scripts):
    results = []
    for script in scripts:
        script_features = set(script)
        results.append(features.intersection(script_features))
    return results

# Splitting data into training and testing
data_dir = 'dataset\\Extract Method_single\\'
all_files = [os.path.join(data_dir, file) for file in os.listdir(data_dir)]
train_files = all_files[:70]  # ~70% for training
test_files = all_files[70:]   # ~30% for testing

# Training phase
train_edit_scripts = [parse_diff_files(file) for file in train_files]
train_features = extract_features(train_edit_scripts)

# Testing phase
test_edit_scripts = [parse_diff_files(file) for file in test_files]
test_results = check_features_in_scripts(train_features, test_edit_scripts)

print("Test Results:", test_results)

Test Results: [{('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}, {('delete', '--')}]


In [5]:
import os
import random

# List all diff files
diff_files = [f for f in os.listdir('dataset\\Extract Method_single\\') if f.endswith('.md')]
random.shuffle(diff_files)

# Split into training and testing sets (80-20 split)
split_index = int(0.8 * len(diff_files))
training_files = diff_files[:split_index]
testing_files = diff_files[split_index:]

print("Training files:", len(training_files))
print("Testing files:", len(testing_files))

Training files: 78
Testing files: 20


In [7]:
from collections import Counter

def extract_edit_operations(file_path):
    operations = []
    with open(file_path, 'r') as file:
        data = file.readlines()
        for line in data:
            if any(op in line for op in ['insert', 'delete', 'update', 'move']):
                operations.append(line.strip())
    return operations

# Collect all operations from training files
all_operations = []
for file_name in training_files:
    file_path = os.path.join('dataset\\Extract Method_single\\', file_name)
    all_operations.extend(extract_edit_operations(file_path))

# Find common edit operations (minimal edit actions)
operation_counts = Counter(all_operations)
common_operations = [op for op, count in operation_counts.items() if count > len(training_files) * 0.5]  # Threshold at 50% occurrence

print("Common operations (MEAs):", common_operations)

Common operations (MEAs): ['insert-tree', 'insert-node', 'move-tree', 'update-node', 'delete-node', 'delete-tree']


In [8]:
def validate_operations(test_file, common_operations):
    test_operations = extract_edit_operations(test_file)
    return any(op in test_operations for op in common_operations)

# Test the MEAs in the testing set
results = {}
for file_name in testing_files:
    file_path = os.path.join('dataset\\Extract Method_single\\', file_name)
    results[file_name] = validate_operations(file_path, common_operations)

print("Validation results:", results)

Validation results: {'1123527_diff.md': True, '1116770_diff.md': True, '1122401_diff.md': True, '1118835_diff.md': True, '1117840_diff.md': True, '1121499_diff.md': True, '1121732_diff.md': True, '1120442_diff.md': True, '1122130_diff.md': True, '1120023_diff.md': True, '1123501_diff.md': True, '1111984_diff.md': True, '1116609_diff.md': True, '1110245_diff.md': True, '1119661_diff.md': True, '1117885_diff.md': True, '1102793_diff.md': True, '1120077_diff.md': True, '1121850_diff.md': True, '1121836_diff.md': True}


In [14]:
import os
import json
import random
from collections import Counter
from sklearn.metrics import confusion_matrix
import numpy as np

# Function to extract operations from diff files
def extract_edit_operations(file_path):
    operations = []
    with open(file_path, 'r') as file:
        data = file.readlines()
        for line in data:
            if any(op in line for op in ['insert', 'delete', 'update', 'move']):
                operations.append(line.strip())
    return operations

# Function to validate operations in test files
def validate_operations(test_file, common_operations):
    test_operations = extract_edit_operations(test_file)
    return any(op in test_operations for op in common_operations)

# Set the base directory for the dataset
dataset_dir = "dataset_clean/"

# Gather available refactoring types
refactoring_types = [d for d in os.listdir(dataset_dir) if os.path.isdir(os.path.join(dataset_dir, d)) and '_single' in d]
print("Available refactoring types:")
for ref_type in refactoring_types:
    print(ref_type.replace('_single', ''))

# User selects refactoring type for analysis
# ref_type_to_analyze = input("\nEnter the refactoring type to analyze: ") + '_single'
ref_type_to_analyze = 'Extract Method_single'

# Prepare the file lists
file_paths = [os.path.join(dataset_dir, ref_type_to_analyze, f) for f in os.listdir(os.path.join(dataset_dir, ref_type_to_analyze)) if f.endswith('_diff.md')]
random.shuffle(file_paths)

# Split files into training and testing sets
train_size = len(file_paths) // 2
training_files = file_paths[:train_size]
testing_files = file_paths[train_size:]

# Collect all operations from training files
all_operations = []
for file_path in training_files:
    all_operations.extend(extract_edit_operations(file_path))

# Determine common operations at different thresholds
thresholds = [0.2, 0.5, 0.7]
results = []

for threshold in thresholds:
    operation_counts = Counter(all_operations)
    common_operations = [op for op, count in operation_counts.items() if count >= len(training_files) * threshold]

    print(common_operations)

    # Test operations in the testing set
    y_true = [1] * len(testing_files)  # All are positive examples
    y_pred = [validate_operations(f, common_operations) for f in testing_files]

    # Calculate confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    accuracy = np.trace(cm) / np.sum(cm)
    results.append((threshold, accuracy, cm))

for res in results:
    print(f"Threshold: {res[0]}, Accuracy: {res[1]}, Confusion Matrix:\n{res[2]}")

print("Process completed.")

Available refactoring types:
Extract Interface
Extract Method
Extract Superclass
Inline Method
Move Attribute
Move Class
Move Method
Pull Up Attribute
Pull Up Method
Push Down Attribute
Push Down Method
Rename Package
['move-tree', 'insert-node', 'insert-tree', 'update-node', 'delete-node', 'delete-tree']
['move-tree', 'insert-node', 'insert-tree', 'update-node', 'delete-node', 'delete-tree']
['move-tree', 'insert-node', 'insert-tree', 'update-node', 'delete-node', 'delete-tree']
Threshold: 0.2, Accuracy: 1.0, Confusion Matrix:
[[49]]
Threshold: 0.5, Accuracy: 1.0, Confusion Matrix:
[[49]]
Threshold: 0.7, Accuracy: 1.0, Confusion Matrix:
[[49]]
Process completed.




In [20]:
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

def load_data_and_labels(data_folder):
    # Load all files and their labels
    refactoring_types = [d for d in os.listdir(data_folder) if os.path.isdir(os.path.join(data_folder, d))]
    documents = []
    labels = []

    for ref_type in refactoring_types:
        subfolder_path = os.path.join(data_folder, ref_type)
        for filename in os.listdir(subfolder_path):
            if filename.endswith('.md'):
                file_path = os.path.join(subfolder_path, filename)
                with open(file_path, 'r') as file:
                    content = file.read()
                    documents.append(content)
                    labels.append(ref_type)
    return documents, labels, refactoring_types

def extract_features(documents):
    # Fit the CountVectorizer to all documents to create a global vocabulary
    vectorizer = CountVectorizer()
    features = vectorizer.fit_transform(documents).toarray()
    return features, vectorizer.get_feature_names_out()

data_folder = "dataset_clean\\"
documents, labels, refactoring_types = load_data_and_labels(data_folder)
features, feature_names = extract_features(documents)

# Convert labels to a binary format for each refactoring type
classifiers = {}
for ref_type in refactoring_types:
    binary_labels = [1 if label == ref_type else 0 for label in labels]
    X_train, X_test, y_train, y_test = train_test_split(features, binary_labels, test_size=0.3, random_state=42)
    
    # Train a RandomForest classifier
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    classifiers[ref_type] = clf
    
    # Evaluate the classifier
    predictions = clf.predict(X_test)
    print(f"Classification report for {ref_type}:")
    print(classification_report(y_test, predictions))

# Function to classify new scripts
def classify_new_script(script_path, ref_type, vectorizer):
    with open(script_path, 'r') as file:
        content = file.read()
    file_features = vectorizer.transform([content]).toarray()
    clf = classifiers[ref_type]
    prediction = clf.predict(file_features)
    return bool(prediction[0])

# Example usage
new_script_path = "dataset\\Extract Method_mixed\\1107345_diff.md"
for ref_type in refactoring_types:
    result = classify_new_script(new_script_path, ref_type)
    print(f"Is the new script {ref_type}? {result}")

Classification report for Extract Interface_mixed:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        60
           1       0.00      0.00      0.00         0

    accuracy                           0.98        60
   macro avg       0.50      0.49      0.50        60
weighted avg       1.00      0.98      0.99        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for Extract Interface_single:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        60

    accuracy                           1.00        60
   macro avg       1.00      1.00      1.00        60
weighted avg       1.00      1.00      1.00        60

Classification report for Extract Method_mixed:
              precision    recall  f1-score   support

           0       0.96      0.95      0.96        58
           1       0.00      0.00      0.00         2

    accuracy                           0.92        60
   macro avg       0.48      0.47      0.48        60
weighted avg       0.93      0.92      0.92        60

Classification report for Extract Method_single:
              precision    recall  f1-score   support

           0       0.80      0.38      0.51        32
           1       0.56      0.89      0.68        28

    accuracy                           0.62        60
   macro avg       0.68      0.63

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for Extract Superclass_single:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        59
           1       0.00      0.00      0.00         1

    accuracy                           0.98        60
   macro avg       0.49      0.50      0.50        60
weighted avg       0.97      0.98      0.98        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for Inline Method_mixed:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        59
           1       0.00      0.00      0.00         1

    accuracy                           0.98        60
   macro avg       0.49      0.50      0.50        60
weighted avg       0.97      0.98      0.98        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for Inline Method_single:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95        54
           1       0.00      0.00      0.00         6

    accuracy                           0.90        60
   macro avg       0.45      0.50      0.47        60
weighted avg       0.81      0.90      0.85        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for Move Attribute_mixed:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        59
           1       0.00      0.00      0.00         1

    accuracy                           0.97        60
   macro avg       0.49      0.49      0.49        60
weighted avg       0.97      0.97      0.97        60

Classification report for Move Attribute_single:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        58
           1       0.00      0.00      0.00         2

    accuracy                           0.97        60
   macro avg       0.48      0.50      0.49        60
weighted avg       0.93      0.97      0.95        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for Move Class_mixed:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        58
           1       0.00      0.00      0.00         2

    accuracy                           0.93        60
   macro avg       0.48      0.48      0.48        60
weighted avg       0.93      0.93      0.93        60

Classification report for Move Class_single:
              precision    recall  f1-score   support

           0       0.87      1.00      0.93        52
           1       0.00      0.00      0.00         8

    accuracy                           0.87        60
   macro avg       0.43      0.50      0.46        60
weighted avg       0.75      0.87      0.80        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for Move Method_mixed:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        58
           1       0.00      0.00      0.00         2

    accuracy                           0.93        60
   macro avg       0.48      0.48      0.48        60
weighted avg       0.93      0.93      0.93        60

Classification report for Move Method_single:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        59
           1       0.00      0.00      0.00         1

    accuracy                           0.98        60
   macro avg       0.49      0.50      0.50        60
weighted avg       0.97      0.98      0.98        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for Pull Up Method_mixed:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        60

    accuracy                           1.00        60
   macro avg       1.00      1.00      1.00        60
weighted avg       1.00      1.00      1.00        60

Classification report for Pull Up Method_single:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        58
           1       0.00      0.00      0.00         2

    accuracy                           0.97        60
   macro avg       0.48      0.50      0.49        60
weighted avg       0.93      0.97      0.95        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for Rename Package_mixed:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        59
           1       0.00      0.00      0.00         1

    accuracy                           0.98        60
   macro avg       0.49      0.50      0.50        60
weighted avg       0.97      0.98      0.98        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for Rename Package_single:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        59
           1       0.00      0.00      0.00         1

    accuracy                           0.98        60
   macro avg       0.49      0.50      0.50        60
weighted avg       0.97      0.98      0.98        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TypeError: classify_new_script() missing 1 required positional argument: 'vectorizer'

In [21]:
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

def load_data_and_labels(data_folder):
    # Load all files and their labels
    refactoring_types = [d for d in os.listdir(data_folder) if os.path.isdir(os.path.join(data_folder, d))]
    documents = []
    labels = []

    for ref_type in refactoring_types:
        subfolder_path = os.path.join(data_folder, ref_type)
        for filename in os.listdir(subfolder_path):
            if filename.endswith('.md'):
                file_path = os.path.join(subfolder_path, filename)
                with open(file_path, 'r') as file:
                    content = file.read()
                    documents.append(content)
                    labels.append(ref_type)
    return documents, labels, refactoring_types

def extract_features(documents):
    # Fit the CountVectorizer to all documents to create a global vocabulary
    vectorizer = CountVectorizer()
    features = vectorizer.fit_transform(documents).toarray()
    return features, vectorizer

data_folder = "dataset_clean\\"
documents, labels, refactoring_types = load_data_and_labels(data_folder)
features, vectorizer = extract_features(documents)

# Convert labels to a binary format for each refactoring type
classifiers = {}
for ref_type in refactoring_types:
    binary_labels = [1 if label == ref_type else 0 for label in labels]
    X_train, X_test, y_train, y_test = train_test_split(features, binary_labels, test_size=0.3, random_state=42)
    
    # Train a RandomForest classifier
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    classifiers[ref_type] = clf
    
    # Evaluate the classifier
    predictions = clf.predict(X_test)
    print(f"Classification report for {ref_type}:")
    print(classification_report(y_test, predictions))

# Function to classify new scripts
def classify_new_script(script_path, ref_type, vectorizer):
    with open(script_path, 'r') as file:
        content = file.read()
    file_features = vectorizer.transform([content]).toarray()
    clf = classifiers[ref_type]
    prediction = clf.predict(file_features)
    return bool(prediction[0])

# Example usage
new_script_path = "dataset\\Extract Method_mixed\\1107345_diff.md"  # This should be replaced by the actual path you want to use.
for ref_type in refactoring_types:
    result = classify_new_script(new_script_path, ref_type, vectorizer)
    print(f"Is the new script {ref_type}? {result}")

Classification report for Extract Interface_mixed:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        60
           1       0.00      0.00      0.00         0

    accuracy                           0.98        60
   macro avg       0.50      0.49      0.50        60
weighted avg       1.00      0.98      0.99        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for Extract Interface_single:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        60

    accuracy                           1.00        60
   macro avg       1.00      1.00      1.00        60
weighted avg       1.00      1.00      1.00        60

Classification report for Extract Method_mixed:
              precision    recall  f1-score   support

           0       0.96      0.95      0.96        58
           1       0.00      0.00      0.00         2

    accuracy                           0.92        60
   macro avg       0.48      0.47      0.48        60
weighted avg       0.93      0.92      0.92        60

Classification report for Extract Method_single:
              precision    recall  f1-score   support

           0       0.81      0.41      0.54        32
           1       0.57      0.89      0.69        28

    accuracy                           0.63        60
   macro avg       0.69      0.65

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for Extract Superclass_single:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        59
           1       0.00      0.00      0.00         1

    accuracy                           0.98        60
   macro avg       0.49      0.50      0.50        60
weighted avg       0.97      0.98      0.98        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for Inline Method_mixed:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        59
           1       0.00      0.00      0.00         1

    accuracy                           0.98        60
   macro avg       0.49      0.50      0.50        60
weighted avg       0.97      0.98      0.98        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for Inline Method_single:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95        54
           1       0.00      0.00      0.00         6

    accuracy                           0.90        60
   macro avg       0.45      0.50      0.47        60
weighted avg       0.81      0.90      0.85        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for Move Attribute_mixed:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        59
           1       0.00      0.00      0.00         1

    accuracy                           0.97        60
   macro avg       0.49      0.49      0.49        60
weighted avg       0.97      0.97      0.97        60

Classification report for Move Attribute_single:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        58
           1       0.00      0.00      0.00         2

    accuracy                           0.97        60
   macro avg       0.48      0.50      0.49        60
weighted avg       0.93      0.97      0.95        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for Move Class_mixed:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        58
           1       0.00      0.00      0.00         2

    accuracy                           0.93        60
   macro avg       0.48      0.48      0.48        60
weighted avg       0.93      0.93      0.93        60

Classification report for Move Class_single:
              precision    recall  f1-score   support

           0       0.87      1.00      0.93        52
           1       0.00      0.00      0.00         8

    accuracy                           0.87        60
   macro avg       0.43      0.50      0.46        60
weighted avg       0.75      0.87      0.80        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for Move Method_mixed:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        58
           1       0.00      0.00      0.00         2

    accuracy                           0.93        60
   macro avg       0.48      0.48      0.48        60
weighted avg       0.93      0.93      0.93        60

Classification report for Move Method_single:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        59
           1       0.00      0.00      0.00         1

    accuracy                           0.98        60
   macro avg       0.49      0.50      0.50        60
weighted avg       0.97      0.98      0.98        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for Pull Up Method_mixed:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        60

    accuracy                           1.00        60
   macro avg       1.00      1.00      1.00        60
weighted avg       1.00      1.00      1.00        60

Classification report for Pull Up Method_single:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        58
           1       0.00      0.00      0.00         2

    accuracy                           0.97        60
   macro avg       0.48      0.50      0.49        60
weighted avg       0.93      0.97      0.95        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for Rename Package_mixed:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        59
           1       0.00      0.00      0.00         1

    accuracy                           0.98        60
   macro avg       0.49      0.50      0.50        60
weighted avg       0.97      0.98      0.98        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for Rename Package_single:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        59
           1       0.00      0.00      0.00         1

    accuracy                           0.98        60
   macro avg       0.49      0.50      0.50        60
weighted avg       0.97      0.98      0.98        60

Is the new script Extract Interface_mixed? False
Is the new script Extract Interface_single? False
Is the new script Extract Method_mixed? True
Is the new script Extract Method_single? False
Is the new script Extract Superclass_mixed? False
Is the new script Extract Superclass_single? False


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Is the new script Inline Method_mixed? False
Is the new script Inline Method_single? False
Is the new script Move Attribute_mixed? False
Is the new script Move Attribute_single? False
Is the new script Move Class_mixed? False
Is the new script Move Class_single? False
Is the new script Move Method_mixed? False
Is the new script Move Method_single? False
Is the new script Pull Up Method_mixed? False
Is the new script Pull Up Method_single? False
Is the new script Rename Package_mixed? False
Is the new script Rename Package_single? False


In [22]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

def load_data_and_labels(data_folder, refactoring_details_path):
    ref_details = pd.read_csv(refactoring_details_path)
    documents = []
    labels = []

    # Load documents and initially label them based on folder name
    for ref_type in os.listdir(data_folder):
        subfolder_path = os.path.join(data_folder, ref_type)
        for filename in os.listdir(subfolder_path):
            file_id = filename.split('_')[0]
            if filename.endswith('.md'):
                file_path = os.path.join(subfolder_path, filename)
                with open(file_path, 'r') as file:
                    content = file.read()
                    documents.append(content)
                    # Determine label from CSV, 1 if positive, 0 if negative for the ref_type without '_single' or '_mixed'
                    ref_type_clean = ref_type.split('_')[0]
                    is_positive = float(ref_details.loc[ref_details['id'] == int(file_id), ref_type_clean]) > 0
                    labels.append(is_positive)
    return documents, labels

def extract_features(documents):
    vectorizer = CountVectorizer()
    features = vectorizer.fit_transform(documents).toarray()
    return features, vectorizer

data_folder = "dataset_clean\\"
refactoring_details_path = 'analysis_results\\refactoring_details_post.csv'
documents, labels = load_data_and_labels(data_folder, refactoring_details_path)
features, vectorizer = extract_features(documents)

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=0)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

predictions = clf.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, predictions))

def classify_new_script(script_path, vectorizer):
    with open(script_path, 'r') as file:
        content = file.read()
    file_features = vectorizer.transform([content]).toarray()
    prediction = clf.predict(file_features)
    return bool(prediction[0])

new_script_path = "dataset\\Extract Method_mixed\\1107345_diff.md"  # Update with actual path to new script
result = classify_new_script(new_script_path, vectorizer)
print(f"Is the new script a specified refactoring type? {result}")

  is_positive = float(ref_details.loc[ref_details['id'] == int(file_id), ref_type_clean]) > 0


Classification Report:
              precision    recall  f1-score   support

        True       1.00      1.00      1.00        60

    accuracy                           1.00        60
   macro avg       1.00      1.00      1.00        60
weighted avg       1.00      1.00      1.00        60

Is the new script a specified refactoring type? True


In [23]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

def load_data(data_folder, refactoring_details_path):
    ref_details = pd.read_csv(refactoring_details_path)
    documents = []
    training_labels = {}
    testing_labels = {}
    testing_documents = []

    # Load training data
    for ref_type in os.listdir(data_folder):
        if '_single' in ref_type:
            ref_type_clean = ref_type.replace('_single', '')
            training_labels[ref_type_clean] = []
            subfolder_path = os.path.join(data_folder, ref_type)
            for filename in os.listdir(subfolder_path):
                if filename.endswith('.md'):
                    file_path = os.path.join(subfolder_path, filename)
                    with open(file_path, 'r') as file:
                        content = file.read()
                        documents.append(content)
                        training_labels[ref_type_clean].append(1)  # All files in _single are true examples

    # Load testing data
    for ref_type in os.listdir(data_folder):
        if '_mixed' in ref_type:
            subfolder_path = os.path.join(data_folder, ref_type)
            for filename in os.listdir(subfolder_path):
                if filename.endswith('.md'):
                    file_id = filename.split('_')[0]
                    file_path = os.path.join(subfolder_path, filename)
                    with open(file_path, 'r') as file:
                        content = file.read()
                        testing_documents.append((file_id, content))
    
    # Assign labels based on CSV for testing data
    for file_id, content in testing_documents:
        for refactoring_type in training_labels.keys():
            count = ref_details.loc[ref_details['id'] == int(file_id), refactoring_type].values[0]
            if count > 1:
                if refactoring_type not in testing_labels:
                    testing_labels[refactoring_type] = []
                testing_labels[refactoring_type].append((content, 1))
            elif count <= 1:
                if refactoring_type not in testing_labels:
                    testing_labels[refactoring_type] = []
                testing_labels[refactoring_type].append((content, 0))
    
    return documents, training_labels, testing_labels

def train_and_evaluate(documents, training_labels, testing_labels):
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(documents)
    
    classifiers = {}
    results = {}
    
    for refactoring_type, labels in training_labels.items():
        y_train = labels
        clf = RandomForestClassifier()
        clf.fit(X_train, y_train)
        classifiers[refactoring_type] = clf

        # Prepare testing data
        X_test = [vectorizer.transform([doc]) for doc, _ in testing_labels[refactoring_type]]
        y_test = [label for _, label in testing_labels[refactoring_type]]
        X_test = np.vstack(X_test)

        # Evaluate classifier
        predictions = clf.predict(X_test)
        report = classification_report(y_test, predictions, target_names=['False', 'True'])
        results[refactoring_type] = report

    return classifiers, results

data_folder = "dataset_clean\\"
refactoring_details_path = 'analysis_results\\refactoring_details_post.csv'
documents, training_labels, testing_labels = load_data(data_folder, refactoring_details_path)
classifiers, results = train_and_evaluate(documents, training_labels, testing_labels)

# Display results
for refactoring_type, report in results.items():
    print(f"Classification report for {refactoring_type}:")
    print(report)

ValueError: Found input variables with inconsistent numbers of samples: [157, 2]

In [11]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

def load_data(data_folder, refactoring_details_path):
    ref_details = pd.read_csv(refactoring_details_path)
    documents = []
    training_labels = {}
    testing_labels = {}
    testing_documents = []

    # Load training data
    for ref_type in os.listdir(data_folder):
        if '_single' in ref_type:
            ref_type_clean = ref_type.replace('_single', '')
            training_labels[ref_type_clean] = []
            subfolder_path = os.path.join(data_folder, ref_type)
            for filename in os.listdir(subfolder_path):
                if filename.endswith('.md'):
                    file_path = os.path.join(subfolder_path, filename)
                    with open(file_path, 'r') as file:
                        content = file.read()
                        documents.append(content)
                        training_labels[ref_type_clean].append(1)  # All files in _single are true examples

    # Load testing data
    for ref_type in os.listdir(data_folder):
        if '_mixed' in ref_type:
            subfolder_path = os.path.join(data_folder, ref_type)
            for filename in os.listdir(subfolder_path):
                if filename.endswith('.md'):
                    file_id = filename.split('_')[0]
                    file_path = os.path.join(subfolder_path, filename)
                    with open(file_path, 'r') as file:
                        content = file.read()
                        testing_documents.append((file_id, content))
    
    # Assign labels based on CSV for testing data
    for file_id, content in testing_documents:
        for refactoring_type in training_labels.keys():
            count = ref_details.loc[ref_details['id'] == int(file_id), refactoring_type].values[0]
            if count > 1:
                if refactoring_type not in testing_labels:
                    testing_labels[refactoring_type] = []
                testing_labels[refactoring_type].append((content, 1))
            elif count <= 1:
                if refactoring_type not in testing_labels:
                    testing_labels[refactoring_type] = []
                testing_labels[refactoring_type].append((content, 0))
    
    return documents, training_labels, testing_labels

def train_and_evaluate(documents, training_labels, testing_labels):
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(documents)
    
    classifiers = {}
    results = {}
    
    for refactoring_type, labels in training_labels.items():
        y_train = labels
        clf = RandomForestClassifier()
        clf.fit(X_train, y_train)
        classifiers[refactoring_type] = clf

        # Prepare testing data
        X_test = [vectorizer.transform([doc]) for doc, _ in testing_labels[refactoring_type]]
        y_test = [label for _, label in testing_labels[refactoring_type]]
        X_test = np.vstack(X_test)

        # Evaluate classifier
        predictions = clf.predict(X_test)
        report = classification_report(y_test, predictions, target_names=['False', 'True'])
        results[refactoring_type] = report

    return classifiers, results

data_folder = "dataset_clean\\"
refactoring_details_path = 'analysis_results\\refactoring_details_post.csv'
documents, training_labels, testing_labels = load_data(data_folder, refactoring_details_path)
print('labels')
print(training_labels)
classifiers, results = train_and_evaluate(documents, training_labels, testing_labels)

# Display results
for refactoring_type, report in results.items():
    print(f"Classification report for {refactoring_type}:")
    print(report)

labels
{'Extract Interface': [1, 1], 'Extract Method': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'Extract Superclass': [1, 1, 1, 1, 1], 'Inline Method': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'Move Attribute': [1, 1, 1, 1, 1, 1, 1], 'Move Class': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'Move Method': [1, 1, 1, 1, 1], 'Pull Up Method': [1, 1, 1, 1], 'Rename Package': [1, 1, 1, 1]}


In [15]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

def load_data(data_folder, refactoring_details_path):
    ref_details = pd.read_csv(refactoring_details_path)
    train_documents = []
    test_documents = []
    train_ids = []
    test_ids = []
    training_labels = {}
    testing_labels = {}

    # Load training data from _single directories
    for ref_type in os.listdir(data_folder):
        if '_single' in ref_type:
            ref_type_clean = ref_type.replace('_single', '')
            training_labels[ref_type_clean] = []
            subfolder_path = os.path.join(data_folder, ref_type)
            for filename in os.listdir(subfolder_path):
                if filename.endswith('.md'):
                    file_path = os.path.join(subfolder_path, filename)
                    file_id = filename.split('_')[0]
                    with open(file_path, 'r') as file:
                        content = file.read()
                        train_documents.append(content)
                        train_ids.append(file_id)
                        # Initialize labels for all refactoring types
                        for key in training_labels:
                            training_labels[key].append(0)
                        training_labels[ref_type_clean][-1] = 1  # Set the correct label for this refactoring type

    # Load testing data from _mixed directories
    for ref_type in os.listdir(data_folder):
        if '_mixed' in ref_type:
            subfolder_path = os.path.join(data_folder, ref_type)
            for filename in os.listdir(subfolder_path):
                if filename.endswith('.md'):
                    file_path = os.path.join(subfolder_path, filename)
                    file_id = filename.split('_')[0]
                    with open(file_path, 'r') as file:
                        content = file.read()
                        test_documents.append(content)
                        test_ids.append(file_id)

    # Assign labels for testing based on CSV data
    for idx, doc in enumerate(test_documents):
        file_id = test_ids[idx]
        for refactoring_type in training_labels.keys():
            count = ref_details.loc[ref_details['id'] == int(file_id), refactoring_type].values[0]
            if refactoring_type not in testing_labels:
                testing_labels[refactoring_type] = []
            testing_labels[refactoring_type].append(1 if count > 1 else 0)
    
    return train_documents, train_ids, test_documents, test_ids, training_labels, testing_labels

def train_and_evaluate(train_docs, train_labels, test_docs, test_labels):
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(train_docs)
    X_test = vectorizer.transform(test_docs)
    
    classifiers = {}
    results = {}
    
    for refactoring_type, labels in train_labels.items():
        clf = RandomForestClassifier()
        clf.fit(X_train, labels)
        classifiers[refactoring_type] = clf

        # Evaluate classifier
        predictions = clf.predict(X_test)
        report = classification_report(test_labels[refactoring_type], predictions, target_names=['False', 'True'])
        results[refactoring_type] = report

    return classifiers, results

data_folder = "dataset_clean\\"
refactoring_details_path = 'analysis_results\\refactoring_details_post.csv'
train_docs, train_ids, test_docs, test_ids, train_labels, test_labels = load_data(data_folder, refactoring_details_path)
print(train_labels)
print(test_labels)
classifiers, results = train_and_evaluate(train_docs, train_labels, test_docs, test_labels)

# Display results
for refactoring_type, report in results.items():
    print(f"Classification report for {refactoring_type}:")
    print(report)

{'Extract Interface': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'Extract Method': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'Extract Superclass'

ValueError: Number of classes, 1, does not match size of target_names, 2. Try specifying the labels parameter

In [24]:
from sklearn.metrics import confusion_matrix
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

def load_data(data_folder, refactoring_details_path):
    ref_details = pd.read_csv(refactoring_details_path)
    train_documents = []
    test_documents = []
    train_ids = []
    test_ids = []
    training_labels = {}
    testing_labels = {}

    # Initialize training labels for all refactoring types
    ref_types = [d.replace('_single', '') for d in os.listdir(data_folder) if '_single' in d]
    for ref_type in ref_types:
        training_labels[ref_type] = []
        testing_labels[ref_type] = []

    # Load training data from _single directories and assign labels
    for ref_type in os.listdir(data_folder):
        if '_single' in ref_type:
            ref_type_clean = ref_type.replace('_single', '')
            subfolder_path = os.path.join(data_folder, ref_type)
            for filename in os.listdir(subfolder_path):
                if filename.endswith('.md'):
                    file_path = os.path.join(subfolder_path, filename)
                    with open(file_path, 'r') as file:
                        content = file.read()
                        train_documents.append(content)
                        train_ids.append(filename.split('_')[0])
                        # Initialize labels for all refactoring types as 0
                        for key in training_labels:
                            training_labels[key].append(0)
                        # Set the current refactoring type label to 1
                        training_labels[ref_type_clean][-1] = 1

    # Load testing data from _mixed directories and assign labels based on CSV
    for ref_type in os.listdir(data_folder):
        if '_mixed' in ref_type:
            ref_type_clean = ref_type.replace('_mixed', '')
            subfolder_path = os.path.join(data_folder, ref_type)
            for filename in os.listdir(subfolder_path):
                if filename.endswith('.md'):
                    file_path = os.path.join(subfolder_path, filename)
                    with open(file_path, 'r') as file:
                        content = file.read()
                        test_documents.append(content)
                        test_ids.append(filename.split('_')[0])
                        file_id = filename.split('_')[0]
                        # Initialize labels for all refactoring types as 0
                        for key in testing_labels:
                            testing_labels[key].append(0)
                        # Set labels based on occurrence counts from the CSV
                        for key in ref_types:
                            count = ref_details.loc[ref_details['id'] == int(file_id), key].values[0]
                            if count > 0:
                                testing_labels[key][-1] = 1

    return train_documents, train_ids, test_documents, test_ids, training_labels, testing_labels

def train_and_evaluate(train_docs, train_labels, test_docs, test_labels):
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(train_docs)
    X_test = vectorizer.transform(test_docs)
    
    classifiers = {}
    results = {}
    
    for refactoring_type, labels in train_labels.items():
        clf = RandomForestClassifier()
        clf.fit(X_train, labels)
        classifiers[refactoring_type] = clf

        # Evaluate classifier
        predictions = clf.predict(X_test)
        actual_labels = np.array(test_labels[refactoring_type])
        
        # Handle cases where only one class is present in the test labels
        if len(set(actual_labels)) == 1:
            print(f"Only one class present for {refactoring_type}. Cannot generate a full classification report.")
            cm = confusion_matrix(actual_labels, predictions, labels=[0, 1])
            accuracy = np.mean(predictions == actual_labels)
            results[refactoring_type] = {'confusion_matrix': cm, 'accuracy': accuracy}
        elif len(set(predictions)) == 1 and 1 not in predictions:
            print(f"No positive predictions for {refactoring_type}. All predictions are negative.")
            report = classification_report(test_labels[refactoring_type], predictions, target_names=['False', 'True'], zero_division=0)
            results[refactoring_type] = report
        else:
            report = classification_report(actual_labels, predictions, target_names=['False', 'True'])
            results[refactoring_type] = report

    return classifiers, results

data_folder = "dataset_clean\\"
refactoring_details_path = 'analysis_results\\refactoring_details_post.csv'
train_docs, train_ids, test_docs, test_ids, train_labels, test_labels = load_data(data_folder, refactoring_details_path)
classifiers, results = train_and_evaluate(train_docs, train_labels, test_docs, test_labels)

# Display results
for refactoring_type, result in results.items():
    print(f"Results for {refactoring_type}:")
    if isinstance(result, dict) and 'confusion_matrix' in result:
        print("Confusion Matrix:")
        print(result['confusion_matrix'])
        print("Accuracy:", result['accuracy'])
    else:
        print(result)

No positive predictions for Extract Interface. All predictions are negative.
No positive predictions for Extract Superclass. All predictions are negative.
No positive predictions for Inline Method. All predictions are negative.
No positive predictions for Move Attribute. All predictions are negative.
No positive predictions for Move Class. All predictions are negative.
No positive predictions for Move Method. All predictions are negative.
No positive predictions for Pull Up Method. All predictions are negative.
No positive predictions for Rename Package. All predictions are negative.
Results for Extract Interface:
              precision    recall  f1-score   support

       False       0.88      1.00      0.94        37
        True       0.00      0.00      0.00         5

    accuracy                           0.88        42
   macro avg       0.44      0.50      0.47        42
weighted avg       0.78      0.88      0.83        42

Results for Extract Method:
              precision

In [25]:
from sklearn.metrics import confusion_matrix
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.metrics import confusion_matrix, accuracy_score

def load_data(data_folder, refactoring_details_path):
    ref_details = pd.read_csv(refactoring_details_path)
    train_documents = []
    test_documents = []
    train_ids = []
    test_ids = []
    training_labels = {}
    testing_labels = {}

    # Initialize training labels for all refactoring types
    ref_types = [d.replace('_single', '') for d in os.listdir(data_folder) if '_single' in d]
    for ref_type in ref_types:
        training_labels[ref_type] = []
        testing_labels[ref_type] = []

    # Load training data from _single directories and assign labels
    for ref_type in os.listdir(data_folder):
        if '_single' in ref_type:
            ref_type_clean = ref_type.replace('_single', '')
            subfolder_path = os.path.join(data_folder, ref_type)
            for filename in os.listdir(subfolder_path):
                if filename.endswith('.md'):
                    file_path = os.path.join(subfolder_path, filename)
                    with open(file_path, 'r') as file:
                        content = file.read()
                        train_documents.append(content)
                        train_ids.append(filename.split('_')[0])
                        # Initialize labels for all refactoring types as 0
                        for key in training_labels:
                            training_labels[key].append(0)
                        # Set the current refactoring type label to 1
                        training_labels[ref_type_clean][-1] = 1

    # Load testing data from _mixed directories and assign labels based on CSV
    for ref_type in os.listdir(data_folder):
        if '_mixed' in ref_type:
            ref_type_clean = ref_type.replace('_mixed', '')
            subfolder_path = os.path.join(data_folder, ref_type)
            for filename in os.listdir(subfolder_path):
                if filename.endswith('.md'):
                    file_path = os.path.join(subfolder_path, filename)
                    with open(file_path, 'r') as file:
                        content = file.read()
                        test_documents.append(content)
                        test_ids.append(filename.split('_')[0])
                        file_id = filename.split('_')[0]
                        # Initialize labels for all refactoring types as 0
                        for key in testing_labels:
                            testing_labels[key].append(0)
                        # Set labels based on occurrence counts from the CSV
                        for key in ref_types:
                            count = ref_details.loc[ref_details['id'] == int(file_id), key].values[0]
                            if count > 0:
                                testing_labels[key][-1] = 1

    return train_documents, train_ids, test_documents, test_ids, training_labels, testing_labels

def train_and_evaluate(train_docs, train_labels, test_docs, test_labels, path_save='analysis_results\\'):
    vectorizer=CountVectorizer()
    X_train = vectorizer.fit_transform(train_docs)
    X_test = vectorizer.transform(test_docs)
    classifiers = {}
    results = {}
    summary = {}
    
    for refactoring_type, labels in train_labels.items():
        clf = RandomForestClassifier()
        clf.fit(X_train, labels)
        classifiers[refactoring_type] = clf

        # Evaluate classifier
        predictions = clf.predict(X_test)
        cm = confusion_matrix(test_labels[refactoring_type], predictions)
        acc = accuracy_score(test_labels[refactoring_type], predictions)
        
        # Store results
        results[refactoring_type] = {
            'confusion_matrix': cm,
            'accuracy': acc
        }
        summary[refactoring_type] = [acc, *cm.ravel()]  # Flatten confusion matrix and prepend accuracy

    # Serialize results and summary
    with open(path_save + 'classification_results.pkl', 'wb') as f:
        pickle.dump(results, f)
    with open(path_save + 'classification_summary.pkl', 'wb') as f:
        pickle.dump(summary, f)
    
    return classifiers, results

data_folder = "dataset_clean\\"
refactoring_details_path = 'analysis_results\\refactoring_details_post.csv'
train_docs, train_ids, test_docs, test_ids, train_labels, test_labels = load_data(data_folder, refactoring_details_path)
classifiers, results = train_and_evaluate(train_docs, train_labels, test_docs, test_labels)

# Display results
for refactoring_type, result in results.items():
    print(f"Results for {refactoring_type}:")
    if isinstance(result, dict) and 'confusion_matrix' in result:
        print("Confusion Matrix:")
        print(result['confusion_matrix'])
        print("Accuracy:", result['accuracy'])
    else:
        print(result)

Results for Extract Interface:
Confusion Matrix:
[[37  0]
 [ 5  0]]
Accuracy: 0.8809523809523809
Results for Extract Method:
Confusion Matrix:
[[ 7 15]
 [ 8 12]]
Accuracy: 0.4523809523809524
Results for Extract Superclass:
Confusion Matrix:
[[36  0]
 [ 6  0]]
Accuracy: 0.8571428571428571
Results for Inline Method:
Confusion Matrix:
[[37  0]
 [ 5  0]]
Accuracy: 0.8809523809523809
Results for Move Attribute:
Confusion Matrix:
[[31  0]
 [11  0]]
Accuracy: 0.7380952380952381
Results for Move Class:
Confusion Matrix:
[[23  0]
 [19  0]]
Accuracy: 0.5476190476190477
Results for Move Method:
Confusion Matrix:
[[31  0]
 [11  0]]
Accuracy: 0.7380952380952381
Results for Pull Up Method:
Confusion Matrix:
[[36  0]
 [ 6  0]]
Accuracy: 0.8571428571428571
Results for Rename Package:
Confusion Matrix:
[[35  0]
 [ 7  0]]
Accuracy: 0.8333333333333334
