In [4]:
import pandas as pd

# Convert CSV files into dataframes to extract methods from
methods1 = pd.read_csv('/content/extracted_methods__ether_pad.csv', encoding='latin-1')
methods2 = pd.read_csv('/content/extracted_methods__haraldk_twelvemonkeys.csv', encoding='latin-1')
methods3 = pd.read_csv('/content/extracted_methods__sirthias_parboiled.csv', encoding='latin-1')
methods4 = pd.read_csv('/content/extracted_methods__tinkerpop_blueprints.csv', encoding='latin-1')
methods5 = pd.read_csv('/content/extracted_methods__hierynomus_sshj.csv', encoding='latin-1')

# Combine dataframes into one
methods = pd.concat([methods1, methods2], ignore_index=True)
methods = pd.concat([methods, methods3], ignore_index=True)
methods = pd.concat([methods, methods4], ignore_index=True)
methods = pd.concat([methods, methods5], ignore_index=True)

In [5]:
# Code taken from Pre-processing lab notebook - refines data into usable methods
def remove_duplicates(data):
    """Remove duplicate methods based on method content.
    Almost Type-1 with the exception of comments
    """
    return data.drop_duplicates(subset="Method Java", keep="first")

def filter_ascii_methods(data):
    """Filter methods to include only those with ASCII characters."""
    data = data[data["Method Java"].apply(lambda x: all(ord(char) < 128 for char in x))]
    return data

def remove_outliers(data, lower_percentile=5, upper_percentile=95):
    """Remove outliers based on method length."""
    method_lengths = data["Method Java"].apply(len)
    lower_bound = method_lengths.quantile(lower_percentile / 100)
    upper_bound = method_lengths.quantile(upper_percentile / 100)
    return data[(method_lengths >= lower_bound) & (method_lengths <= upper_bound)]

In [6]:
from pygments.lexers.jvm import JavaLexer
from pygments.lexers import get_lexer_by_name
from pygments.token import Token
import re
import random

In [7]:
# Code taken from Pre-processing lab notebook - refines data into usable methods
def remove_boilerplate_methods(data):
    """Remove boilerplate methods like setters and getters."""
    boilerplate_patterns = [
        r"\bset[A-Z][a-zA-Z0-9_]*\(.*\)\s*{",  # Setter methods
        r"\bget[A-Z][a-zA-Z0-9_]*\(.*\)\s*{",  # Getter methods
    ]
    boilerplate_regex = re.compile("|".join(boilerplate_patterns))
    data = data[~data["Method Java"].apply(lambda x: bool(boilerplate_regex.search(x)))]
    return data

def remove_comments_from_dataframe(df: pd.DataFrame, method_column: str, language: str) -> pd.DataFrame:
    """
    Removes comments from Java methods in a DataFrame and adds a new column with cleaned methods.

    Args:
        df (pd.DataFrame): DataFrame containing the methods.
        method_column (str): Column name containing the raw Java methods.
        language (str): Programming language for the lexer (e.g., 'java').

    Returns:
        pd.DataFrame: Updated DataFrame with a new column 'Java Method No Comments'.
    """
    # Define a function to remove comments from a single method
    def remove_comments(code):
        lexer = get_lexer_by_name(language)
        tokens = lexer.get_tokens(code)
        # Filter out comments using a lambda function
        clean_code = ''.join(token[1] for token in tokens if not (lambda t: t[0] in Token.Comment)(token))
        return clean_code

    # Apply the function to the specified column and add a new column with the results
    df["Method Java No Comments"] = df[method_column].apply(remove_comments)
    return df

In [8]:
# Code taken from Pre-processing lab notebook - refines data into usable methods
data = methods
data = data.drop("Commit Hash", axis=1)
data = data.drop("File Name", axis=1)
data = data.drop("Method Name", axis=1)
data = data.drop("Commit Link", axis=1)

print("Initial dataset size:", len(data))
data = remove_duplicates(data)
print("After removing duplicates:", len(data))

data = filter_ascii_methods(data)
print("After filtering ASCII methods:", len(data))

data = remove_outliers(data)
print("After removing outliers:", len(data))

data = remove_boilerplate_methods(data)
print("After removing boilerplate methods:", len(data))

data = remove_comments_from_dataframe(data, "Method Java", "Java")
print("After cleaning comments:", len(data))

# Create training/eval/test sets for student dataset
student_methods = list(data["Method Java No Comments"])
# Randomize method order to avoid repository bias in sets
random.Random(42).shuffle(student_methods)
# These slice values correspond to an 80-10-10 set split
training_student_methods = student_methods[:int(len(student_methods) * 0.8)]
eval_student_methods = student_methods[int(len(student_methods) * 0.8 + 1):int(len(student_methods) * 0.9)]
test_student_methods = student_methods[int(len(student_methods) * 0.9):]

Initial dataset size: 228174
After removing duplicates: 42516
After filtering ASCII methods: 42432
After removing outliers: 38339
After removing boilerplate methods: 29373
After cleaning comments: 29373


In [9]:
"""
Tokenizes all methods within a given dataframe

Args:
data (dataframe): Dataframe containing methods to tokenize

Returns: A list of extracted tokens
"""
def tokenize(data):
    lexer = JavaLexer(encoding='latin-1')
    lexer.add_filter('whitespace', spaces=' ')
    datas = data["Method Java No Comments"].to_list()
    datas = ' '.join(datas)
    tokens = [t[1] for t in lexer.get_tokens(datas) if t[1] != ' ' and t[1] != '\n' and t[1] != '\t']
    return tokens

"""
Tokenizes a given method

Args:
method (str): Method to tokenize

Returns: A list of extracted tokens
"""
def tokenize_method(method):
    lexer = JavaLexer(encoding='latin-1')
    lexer.add_filter('whitespace', spaces=' ')
    tokens = [t[1] for t in lexer.get_tokens(method) if t[1] != ' ' and t[1] != '\n' and t[1] != '\t']
    return tokens

In [10]:
# nltk and collections modules are used for creating ngrams
import nltk
from nltk import ngrams
from collections import defaultdict

In [11]:
"""
Creates ngrams for a given set of tokens

Args:
tokens (list of str): Tokenized data
n (int): Window size

Returns: Model containing n-grams and probabilities of next token
"""
# CODE DERIVED FROM https://www.geeksforgeeks.org/n-gram-language-modelling-with-nltk
def create_ngrams(tokens, n):
  n_gram = ngrams(tokens, n)
  # Model will contain all ngrams along with the
  # potentially predictable tokens and their probabilities
  model_n = defaultdict(lambda: defaultdict(lambda: 0))
  # General form to increment the count of a given predictable token
  for gram in n_gram:
      model_n[gram[:-1]][gram[-1]] += 1

  # Transform the counts into probabilities
  for gram in model_n:
      total_count = float(sum(model_n[gram].values()))
      for pred_token in model_n[gram]:
          model_n[gram][pred_token] /= total_count

  return model_n

In [12]:
"""
Predicts the next word based on the previous two words using the trained 5-gram model.
Args:
model (dictionary): Model with all known n-grams and their probabilities
gram (tuple): Tokens to use to predict next token

Returns: Tuple containing predicted next token and its probability.
"""
# CODE DERIVED FROM https://www.geeksforgeeks.org/n-gram-language-modelling-with-nltk
def predict_next_word(model, gram):
    next_word = model[gram]
    if next_word:
        # Choose the most likely next word
        predicted_word = max(next_word, key=next_word.get)
        predicted_prob = next_word[predicted_word]
        return (predicted_word, predicted_prob)
    # If no next valid word can be predicted, return an UNK token
    else:
        return ("<UNK>", 1)

"""
Iteratively creates a method based on predictions from given tokens

Args:
gram (tuple): First n-1 tokens to predict method with n-gram
tokens (list): Tokens extracted from methods
model_train (ngrams): The ngrams of the given training data
include_probability (boolean): Determines whether to include probability of predicted token

Returns: List of tokens (str) in predicted method
(or tuple containing tokens (str) of method and their probabilities (str))
"""
def iterative_prediction(gram, tokens, model_train, include_probability=True):
    # Counts number of predictions in the method so far
    token_count = 0
    context_size = len(gram) + 1
    # Converts the n-gram to an iterable form (for unpacking in method args)
    symbols = [gram]
    symbols_iterable = list(*symbols)
    # Stores what will become the finished predicted method
    predicted_method = []
    next_word = predict_next_word(model_train, *symbols)
    brackets = 0
    # If an opening bracket is present in the original tokens, need to account
    # for it if we want to attempt to balance delimiters
    for symbol in gram:
        if symbol == '{':
            brackets += 1
    while token_count < 100:
        # Sanity check to prevent potentially endless loops
        token_count += 1
        # Attempt to count delimiters for balancing
        if next_word[0] == '{':
            brackets += 1
        elif next_word[0] == '}':
            brackets -= 1
        # Check whether to include probability in returned value
        if include_probability:
            predicted_method += [next_word]
        else:
            predicted_method += [next_word[0]]
        # Check if delimiters are balanced - if they are, stop iterating
        if next_word[0] == '}' and brackets == 0:
            break
        # Accounts for abstract methods, e.g.
        if next_word[0] == ';' and brackets == 0:
            break
        # Shift tokens over to include new predicted token for next iteration
        for i in range(len(symbols_iterable) - 1):
            symbols_iterable[i] = symbols_iterable[i + 1]
        symbols_iterable[len(symbols_iterable) - 1] = next_word[0]
        symbols = tuple(symbols_iterable)
        next_word = predict_next_word(model_train, symbols)

    return predicted_method

In [13]:
import math
"""
Calculates the perplexity of a given method prediction

Args:
prediction (list): The generated method from an iterative prediction

Returns: The perplexity of a given method prediction (float)
"""
def perplexity(prediction):
    # Check if perplexity can be calculated. If not, won't calculate
    if len(prediction) == 0:
        return "Unknown"
    elif isinstance(prediction[0], tuple):
        perplexity_score = 1
        for t in prediction:
            perplexity_score *= 1/t[1]
        perplexity_score = math.pow(perplexity_score, 1/len(prediction))
        return perplexity_score

In [14]:
"""
Calculates the n-gram model's perplexity for a given evaluation set

Args:
dataset (list): List of holdout evaluation/testing methods obtained from dataset
tokens (list): List of tokens/vocabulary
model_train (ngrams): N-grams of given data
context_size (int): Size of n-gram context
"""
def evaluate_model_perplexity(dataset, tokens, model_train, context_size):
    valid_methods = 0
    perplexity_sum = 0
    for method in dataset:
        method = tokenize_method(method)
        first_n_tokens = method[0:context_size-1]
        #print("Original method:\n" + str(' '.join(method)))
        #pred = ' '.join(iterative_prediction(tuple(first_n_tokens), teacher_tokens, model_train, include_probability=False))
        #print(' '.join(first_n_tokens) + " " + pred)
        pred_verbose = iterative_prediction(tuple(first_n_tokens), tokens, model_train, include_probability=True)
        #print(pred_verbose)
        method_perplexity = perplexity(pred_verbose)
        if method_perplexity != "Unknown" and pred_verbose[0][0] != "<UNK>":
            valid_methods += 1
            perplexity_sum += method_perplexity
        #print("Method Perplexity: " + str(perplexity(pred_verbose)))
        #print('\n')
    average_perplexity = perplexity_sum/valid_methods
    print(f'AVERAGE PERPLEXITY OF MODEL WITH CONTEXT SIZE {context}: {average_perplexity}')

In [15]:
# Begin curating dataset for instructor-provided corpus
method_file = 'training.txt'

teacher_methods = list(open(method_file).read().splitlines())

#Randomize methods for train/test/validation split
random.Random(50).shuffle(teacher_methods)
#80-10-10 split
training_teacher_methods = teacher_methods[:80001]
eval_teacher_methods = teacher_methods[80001:90001]
test_teacher_methods = teacher_methods[90001:]

In [16]:
# Tokenize teacher dataset
teacher_tokens = []
for method in training_teacher_methods:
    teacher_tokens += tokenize_method(method)

In [17]:
# Tokenize student dataset
student_tokens = []
for method in training_student_methods:
    student_tokens += tokenize_method(method)

In [18]:
# Size of student vocabulary
print(len(set(student_tokens)))

27517


In [19]:
#Finds and displays average perplexity of n-gram models for n=3, n=5, n=7, n=9
context = 3
model_train = create_ngrams(student_tokens, context)
evaluate_model_perplexity(eval_student_methods, student_tokens, model_train, context)

AVERAGE PERPLEXITY OF MODEL WITH CONTEXT SIZE 3: 3.4814875263359037


In [20]:
context = 5
model_train = create_ngrams(student_tokens, context)
evaluate_model_perplexity(eval_student_methods, student_tokens, model_train, context)

AVERAGE PERPLEXITY OF MODEL WITH CONTEXT SIZE 5: 1.6179664078909373


In [21]:
context = 7
model_train = create_ngrams(student_tokens, context)
evaluate_model_perplexity(eval_student_methods, student_tokens, model_train, context)

AVERAGE PERPLEXITY OF MODEL WITH CONTEXT SIZE 7: 1.2561000162692226


In [29]:
# Testing best-performing student model on test set
optimal_student_context = 7
model_train = create_ngrams(student_tokens, optimal_student_context)
evaluate_model_perplexity(test_student_methods, student_tokens, model_train, optimal_student_context)

AVERAGE PERPLEXITY OF MODEL WITH CONTEXT SIZE 7: 1.2497774207404262


In [30]:
# WRITES OPTIMAL MODEL'S TEST METHOD OUTPUT TO A .TXT FILE; NOT NECESSARY TO RUN
# If running, make sure the previous block has been run before it, or model_train will be incorrect
"""
i = 0
dict_to_dump = dict()
for method in test_student_methods[:100]:
    method = tokenize_method(method)
    dict_to_dump[str(i)] = str(iterative_prediction(tuple(method[0:optimal_student_context-1]), student_tokens, model_train, include_probability=True))
    i += 1

out_file = open("results_student_model.txt", "w")
for i in range(0, len(dict_to_dump)):
    out_file.write('"' + str(i) + '": ' + str(dict_to_dump[str(i)]) + '\n\n')
out_file.close()
"""

In [23]:
# Finds and displays average perplexity of n-gram models for n=3, n=5, n=7, n=9
# WARNING: This code block takes around 10 minutes to fully execute
context = 3
model_train = create_ngrams(teacher_tokens, context)
evaluate_model_perplexity(eval_teacher_methods, teacher_tokens, model_train, context)

AVERAGE PERPLEXITY OF MODEL WITH CONTEXT SIZE 3: 1.762809392841246


In [24]:
context = 5
model_train = create_ngrams(teacher_tokens, context)
evaluate_model_perplexity(eval_teacher_methods, teacher_tokens, model_train, context)

AVERAGE PERPLEXITY OF MODEL WITH CONTEXT SIZE 5: 1.965038245542253


In [25]:
context = 7
model_train = create_ngrams(teacher_tokens, context)
evaluate_model_perplexity(eval_teacher_methods, teacher_tokens, model_train, context)

AVERAGE PERPLEXITY OF MODEL WITH CONTEXT SIZE 7: 1.8768182841564058


In [26]:
# Testing best-performing instructor-provided model on test set
optimal_teacher_context = 3
model_train = create_ngrams(teacher_tokens, optimal_teacher_context)
evaluate_model_perplexity(test_student_methods, teacher_tokens, model_train, optimal_teacher_context)

AVERAGE PERPLEXITY OF MODEL WITH CONTEXT SIZE 7: 1.761250418620468


In [27]:
# WRITES OPTIMAL MODEL'S TEST METHOD OUTPUT TO A .TXT FILE; NOT NECESSARY TO RUN
# If running, make sure the previous block has been run before it, or model_train will be incorrect
"""
i = 0
dict_to_dump = dict()
for method in test_student_methods[:100]:
    method = tokenize_method(method)
    dict_to_dump[str(i)] = str(iterative_prediction(tuple(method[0:optimal_teacher_context-1]), teacher_tokens, model_train, include_probability=True))
    i += 1

out_file = open("results_teacher_model.txt", "w")
for i in range(0, len(dict_to_dump)):
    out_file.write('"' + str(i) + '": ' + str(dict_to_dump[str(i)]) + '\n\n')
out_file.close()
"""