In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, confusion_matrix
# import the transformers library
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from sklearn.model_selection import train_test_split
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# read csv data/processed/bug_data.csv
data = pd.read_csv('data/processed/bug_data.csv')

In [3]:
data.columns

Index(['repo_name', 'issue_number', 'issue_title', 'issue_body',
       'issue_created_at', 'issue_closed_at', 'issue_comments_count',
       'issue_url', 'pr_number', 'pr_merged_at', 'pr_url',
       'config_files_changed', 'app_code_files_changed', 'other_files_changed',
       'total_files_changed', 'lines_added', 'lines_deleted',
       'config_files_lines_changed', 'app_code_files_lines_changed',
       'resolution_time_hours', 'labels', 'has_config_changes',
       'has_code_changes', 'bug_severity', 'bug_type', 'changed_files',
       'services_affected', 'is_cross_service_bug'],
      dtype='object')

In [4]:
data.head(10)

Unnamed: 0,repo_name,issue_number,issue_title,issue_body,issue_created_at,issue_closed_at,issue_comments_count,issue_url,pr_number,pr_merged_at,...,app_code_files_lines_changed,resolution_time_hours,labels,has_config_changes,has_code_changes,bug_severity,bug_type,changed_files,services_affected,is_cross_service_bug
0,GoogleCloudPlatform/microservices-demo,2873,secCompProfile without securityContext enabled...,### Describe the bug when .Values.securityCon...,2025-01-20T20:09:02+00:00,2025-01-29T22:07:54+00:00,0,https://github.com/GoogleCloudPlatform/microse...,2874,2025-01-29T22:07:53+00:00,...,0,217.980833,type: bug;priority: p2,True,False,normal,configuration,"[{""filename"": ""helm-chart/templates/adservice....",templates;templates;templates;templates;templa...,False
1,GoogleCloudPlatform/microservices-demo,2873,secCompProfile without securityContext enabled...,### Describe the bug when .Values.securityCon...,2025-01-20T20:09:02+00:00,2025-01-29T22:07:54+00:00,0,https://github.com/GoogleCloudPlatform/microse...,2874,2025-01-29T22:07:53+00:00,...,0,217.980833,type: bug;priority: p2,True,False,normal,configuration,"[{""filename"": ""helm-chart/templates/adservice....",templates;templates;templates;templates;templa...,False
2,GoogleCloudPlatform/microservices-demo,2872,securityContext opt-in not working for payment...,### Describe the bug when setting .Values.sec...,2025-01-20T20:06:25+00:00,2025-01-29T22:07:54+00:00,0,https://github.com/GoogleCloudPlatform/microse...,2874,2025-01-29T22:07:53+00:00,...,0,218.024444,type: bug;priority: p2,True,False,normal,security,"[{""filename"": ""helm-chart/templates/adservice....",templates;templates;templates;templates;templa...,False
3,aws-samples/aws-microservices-deploy-options,230,Health checks are failing Fargate deployment p...,https://github.com/aws-samples/aws-microservic...,2018-04-14T02:11:47+00:00,2018-04-16T04:04:28+00:00,5,https://github.com/aws-samples/aws-microservic...,244,2018-04-16T04:03:48+00:00,...,0,49.866944,,True,False,normal,security,"[{""filename"": ""apps/ecs/deployment/webapp.yaml...",apps,False
4,GoogleCloudPlatform/microservices-demo,2688,pods crash on aarch64,### Describe the bug <!-- A clear and concise...,2024-08-22T02:54:08+00:00,2024-08-22T19:01:52+00:00,3,https://github.com/GoogleCloudPlatform/microse...,2584,2024-06-10T11:33:19+00:00,...,0,-1743.346944,type: bug;priority: p3,True,False,critical,ui,"[{""filename"": ""src/adservice/build.gradle"", ""l...",src;src;src,False
5,GoogleCloudPlatform/microservices-demo,2677,GKE v1.29.7-gke.1174000: rpc error: code = Una...,### Describe the bug On GKE version `v1.29.7-...,2024-08-13T19:53:42+00:00,2024-08-14T21:36:31+00:00,2,https://github.com/GoogleCloudPlatform/microse...,2429,2024-03-18T01:50:38+00:00,...,0,-3570.051111,,True,False,critical,configuration,"[{""filename"": ""src/currencyservice/package-loc...",src;src;src;src,False
6,aws-samples/aws-microservices-deploy-options,146,Lambda functions are failing on AWS,Invoking the Lambda function on AWS is giving ...,2018-03-30T21:14:38+00:00,2018-03-31T00:10:05+00:00,1,https://github.com/aws-samples/aws-microservic...,152,2018-03-31T00:10:05+00:00,...,0,2.924167,lambda,True,False,normal,database,"[{""filename"": ""services/greeting/pom.xml"", ""li...",services;services;services;services,False
7,spring-petclinic/spring-petclinic-microservices,246,Docker build with the --load option,Fix Docker build by adding the --load options ...,2023-12-22T18:57:50+00:00,2023-12-23T14:50:01+00:00,0,https://github.com/spring-petclinic/spring-pet...,246,2023-12-23T14:50:01+00:00,...,0,19.869722,bug,True,False,normal,ui,"[{""filename"": ""pom.xml"", ""lines_added"": 1, ""li...",,False
8,spring-petclinic/spring-petclinic-microservices,242,Support for Docker and Podman build on mac sil...,This makes it possible to run the container bu...,2023-12-08T11:12:00+00:00,2023-12-21T07:54:56+00:00,4,https://github.com/spring-petclinic/spring-pet...,242,2023-12-21T07:54:56+00:00,...,117,308.715556,bug,True,True,normal,ui,"[{""filename"": "".mvn/wrapper/MavenWrapperDownlo...",.mvn;.mvn;.mvn;spring-petclinic-admin-server;s...,True
9,aws-samples/aws-microservices-deploy-options,53,ECS cluster creation failed using CloudFormati...,Creating an ECS cluster using CloudFormation g...,2018-03-08T21:06:14+00:00,2018-03-09T18:31:18+00:00,4,https://github.com/aws-samples/aws-microservic...,61,2018-03-09T18:06:57+00:00,...,0,21.011944,,True,False,normal,configuration,"[{""filename"": ""apps/ecs/ec2/templates/master.y...",apps;apps,False


In [5]:
# we train a model based on bert embedding of issue title and issue bogy to predict 2 things, if this issue will contain config file change, and if this issue will contain code file change

# we will use the following columns as features
# 'issue_title', 'issue_body'
# we will use the following columns as labels
# 'has_config_changes', 'has_code_changes'
feature_columns = ['issue_title', 'issue_body']
label_columns = ['has_config_changes', 'has_code_changes']


In [6]:
feature_pd = data[feature_columns]
label_pd = data[label_columns]

In [7]:
# check if there are any null values in the feature columns
feature_pd.isnull().sum()

issue_title     0
issue_body     62
dtype: int64

In [8]:
# check if there are any null values in the label columns
label_pd.isnull().sum()

has_config_changes    0
has_code_changes      0
dtype: int64

In [9]:
model_name = "bert-base-uncased"  # or any other BERT model you want to use
# Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModelForMaskedLM.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)
bert_model.eval()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [10]:
def get_bert_embedding(text_list, tokenizer, model, max_length=128):
    """
    For a list of text strings, return a numpy array of shape (N, hidden_dim),
    where hidden_dim is typically 768 for base BERT models.
    """
    embeddings = []

    for text in text_list:
        inputs = tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=max_length
        )
        for k, v in inputs.items():
            inputs[k] = v.to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            # outputs[0] -> last hidden state, shape: (batch_size, seq_len, hidden_size)
            # outputs[1] -> pooled output (if the model supports it)
            # For BERTModel, outputs[1] is sometimes not returned by default.
            # If that’s the case, we can manually pool by taking outputs[0][:, 0] (the CLS token).
            # But "answerdotai/ModernBERT-base" might return the pooled output in outputs[1].
            # Double-check if it does. If not, we do:
            last_hidden_state = outputs[0]
            cls_embedding = last_hidden_state[:, 0, :]  # [CLS] token is first
            embeddings.append(cls_embedding.cpu().numpy().flatten())

    return np.array(embeddings)

In [11]:

combined_texts = (feature_pd['issue_title'] + " " + feature_pd['issue_body']).fillna("")

embeddings = get_bert_embedding(
    text_list=combined_texts.tolist(),
    tokenizer=tokenizer,
    model=bert_model,
    max_length=128
)

In [12]:
print("Embeddings shape:", embeddings.shape)

Embeddings shape: (1637, 30522)


In [13]:
print(label_pd['has_config_changes'].value_counts())
print(label_pd['has_code_changes'].value_counts())


has_config_changes
False    1132
True      505
Name: count, dtype: int64
has_code_changes
True     1339
False     298
Name: count, dtype: int64


In [14]:
from sklearn.utils import resample

def oversample_minority(X, y, random_state=42):
    """
    Oversample the minority class in (X, y) to match the majority class size.
    Returns the oversampled X and y as NumPy arrays.
    """
    X = np.array(X)
    y = np.array(y)

    # Split majority and minority classes
    majority_class = (y == 0)
    minority_class = (y == 1)

    X_majority = X[majority_class]
    y_majority = y[majority_class]

    X_minority = X[minority_class]
    y_minority = y[minority_class]

    # If there's no imbalance, just return original
    if len(X_minority) == 0 or len(X_majority) == 0:
        print("No real imbalance detected or no minority samples found.")
        return X, y

    # Oversample minority to match the majority size
    X_minority_upsampled, y_minority_upsampled = resample(
        X_minority,
        y_minority,
        replace=True,
        n_samples=len(X_majority),
        random_state=random_state
    )

    # Combine majority and upsampled minority
    X_oversampled = np.vstack((X_majority, X_minority_upsampled))
    y_oversampled = np.concatenate((y_majority, y_minority_upsampled))

    return X_oversampled, y_oversampled


In [15]:
# After you do your train_test_split:
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    label_pd,
    test_size=0.2,
    random_state=42
)

y_train_config = y_train['has_config_changes'].values
y_test_config = y_test['has_config_changes'].values

y_train_code = y_train['has_code_changes'].values
y_test_code = y_test['has_code_changes'].values

# -----------------------------
# Oversample for has_config_changes
# -----------------------------
X_train_config_os, y_train_config_os = oversample_minority(
    X_train,
    y_train_config
)

# -----------------------------
# Oversample for has_code_changes
# -----------------------------
X_train_code_os, y_train_code_os = oversample_minority(
    X_train,
    y_train_code
)


In [16]:


class SimpleNN(nn.Module):
    def __init__(self, input_dim, hidden_dim=128):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [17]:
def plot_confusion_matrix(cm, title='Confusion Matrix'):
    """
    Plot a confusion matrix using seaborn's heatmap.
    """
    plt.figure()
    sns.heatmap(cm, annot=True, fmt='d', cbar=False)
    plt.title(title)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

In [18]:
def train_model(model, X_train, y_train, X_test, y_test, epochs=5, lr=1e-3):
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    X_train_tensor = torch.tensor(X_train, dtype=torch.float).to(device)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float).to(device)

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        outputs = model(X_train_tensor).squeeze()  # raw logits
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()

        test_acc, _ = evaluate_model(model, X_test, y_test)
        # print(
        #     f"Epoch [{epoch + 1}/{epochs}], "
        #     f"Loss: {loss.item():.4f}, "
        #     f"Test Accuracy: {test_acc:.4f}"
        # )


In [19]:
def evaluate_model(model, X_test, y_test):
    model.eval()
    X_tensor = torch.tensor(X_test, dtype=torch.float).to(device)
    with torch.no_grad():
        outputs = model(X_tensor).squeeze()  # (batch_size,) raw logits
        preds = torch.sigmoid(outputs)
        preds = (preds > 0.5).long().cpu().numpy()
    accuracy = accuracy_score(y_test, preds)
    cm = confusion_matrix(y_test, preds)
    return accuracy, cm

In [20]:
# use logistic regression to predict if the issue will contain config file change
# increase the epochs to 10
from sklearn.linear_model import LogisticRegression
config_lr = LogisticRegression(max_iter=100)
config_lr.fit(X_train_config_os, y_train_config_os)
config_lr_acc = config_lr.score(X_test, y_test_config)
print("Logistic Regression Accuracy (has_config_changes):", config_lr_acc)

Logistic Regression Accuracy (has_config_changes): 0.676829268292683


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
# 1) Model for `has_config_changes`
input_dim = X_train.shape[1]
config_model = SimpleNN(input_dim=input_dim, hidden_dim=128).to(device)

print("Training model for `has_config_changes` (oversampled)...")
train_model(
    config_model,
    X_train_config_os,   # oversampled X
    y_train_config_os,   # oversampled y
    X_test,
    y_test_config,
    epochs=10,
    lr=1e-5
)
acc_config, cm_config = evaluate_model(config_model, X_test, y_test_config)
print("Final Test Accuracy (has_config_changes):", acc_config)



Training model for `has_config_changes` (oversampled)...
Final Test Accuracy (has_config_changes): 0.7195121951219512


In [30]:
# 2) Model for `has_code_changes`
code_model = SimpleNN(input_dim=input_dim, hidden_dim=128).to(device)

print("\nTraining model for `has_code_changes` (oversampled)...")
train_model(
    code_model,
    X_train_code_os,   # oversampled X
    y_train_code_os,   # oversampled y
    X_test,
    y_test_code,
    epochs=5,
    lr=1e-5
)
acc_code, cm_code = evaluate_model(code_model, X_test, y_test_code)
print("Final Test Accuracy (has_code_changes):", acc_code)



Training model for `has_code_changes` (oversampled)...
Final Test Accuracy (has_code_changes): 0.8170731707317073
