In [73]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/reviewdataset/val.csv
/kaggle/input/reviewdataset/train.csv
/kaggle/input/reviewdataset/test.csv


In [74]:
import pandas as pd
import re  
from transformers import RobertaTokenizer, RobertaModel, AdamW
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load datasets
train_df = pd.read_csv('/kaggle/input/reviewdataset/train.csv')
val_df = pd.read_csv('/kaggle/input/reviewdataset/val.csv')
test_df = pd.read_csv('/kaggle/input/reviewdataset/test.csv')
print(train_df)

       Unnamed: 0                                            comment  \
0               0  String.isEmpty() is avaible only as of JDK 1.6...   
1               1  Does \s include zero-width non-joiner and stuf...   
2               2  Should we log or re-throw any other surprise e...   
3               3  You could collapse this and the new constructo...   
4               4  Do we really need a toast for "bookmarking..."...   
...           ...                                                ...   
13751       13751                                       Oops, right.   
13752       13752  if (position == 0) probably better expresses w...   
13753       13753                             `if (!this.hasNext())`   
13754       13754  I modified the test to ensure that if `interse...   
13755       13755                                     Que es este 1?   

                                                  before  \
0      \tprotected static String commentFormat(String...   
1          publ

In [75]:
# Function to clean the text data
def clean_text(text):
    # Remove special characters and numbers (optional: you may want to keep some symbols)
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)

    # Convert to lowercase
    text = text.lower()

    # Remove extra spaces
    text = text.strip()

    # Tokenization (splitting text into words)
    tokens = text.split()

    # Joining tokens back to string
    text = ' '.join(tokens)

    return text

In [76]:
# Apply the cleaning function to your text columns
for df in [train_df, val_df, test_df]:
    df['comment'] = df['comment'].apply(clean_text)
    df['before'] = df['before'].apply(clean_text)
    df['before_marked'] = df['before_marked'].apply(clean_text)
    df['after'] = df['after'].apply(clean_text)

    # Optionally, drop the 'Unnamed: 0' column if not needed
    if 'Unnamed: 0' in df.columns:
        df.drop(columns=['Unnamed: 0'], inplace=True)
print(df)

                                                comment  \
0                      i think library are no more used   
1            consider using systemlineseparator instead   
2     changedetaileditoregerritipvalue has no more a...   
3                             should be called resource   
4                                      just return rule   
...                                                 ...   
1714  shouldnt you force the put otherwise what is r...   
1715  openssl also supports it in a similar way on t...   
1716  a beforeclass has to be static it doesnt tell ...   
1717  would it make more sense to mark the brand par...   
1718                       id rather throw an exception   

                                                 before  \
0     private static setstring getrootqualifiersreso...   
1     public runtimefiltermanagerqueryworkunit worku...   
2     private void activatemarkers if fgerritclientg...   
3     public void testcreate throws stripeexception ...

In [77]:
# Initialize the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize a text column (example: 'comment')
def tokenize_text(df, text_column):
    return tokenizer(
        list(df[text_column]), 
        padding=True, 
        truncation=True, 
        return_tensors="pt"
    )

In [78]:
from transformers import RobertaForSequenceClassification, RobertaConfig

num_classes = 3 

# Load the configuration from the pre-trained RoBERTa model
config = RobertaConfig.from_pretrained('roberta-base', num_labels=num_classes)

# Instantiate the model
model = RobertaForSequenceClassification(config)

# Print the model's architecture
print(model)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [79]:
# Tokenize the 'comment' column for each dataset
train_tokens = tokenize_text(train_df, 'comment')
val_tokens = tokenize_text(val_df, 'comment')
test_tokens = tokenize_text(test_df, 'comment')

# Example of how tokens look like
print(train_tokens)

{'input_ids': tensor([[    0, 20951,   354,  ...,     1,     1,     1],
        [    0, 26692,   579,  ...,     1,     1,     1],
        [    0, 17276,    52,  ...,     1,     1,     1],
        ...,
        [    0,  1594,    42,  ...,     1,     1,     1],
        [    0,   118, 10639,  ...,     1,     1,     1],
        [    0,  3407,  2714,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [80]:
# Logic to decide if a comment is relevant or not
def is_comment_relevant(comment):
    comment_words = comment.split()
    comment_size = len(comment_words)

    is_relevant = True

    # Useless comments, no content after removing stopwords
    if len(comment_words) == 0:
        is_relevant = False

    # Useless comments, one word, no action required or unclear action
    elif comment_size == 1:
        if any(word in comment for word in ['nice', 'please', 'ditto', 'thank', 'ditto2', 'fine', 'agew', 'hahaha', 'yeh', 'lol']):
            is_relevant = False

    elif comment_size == 2:
        if 'ack' in comment:
            is_relevant = False

    # Request to change formatting, no impact on code
    elif 'indent' in comment and comment_size < 5:
        is_relevant = False

    # Likely a thank you message
    elif any(word in comment for word in ['works for me', 'sounds good', 'makes sense', 'smile', 'approv']) and comment_size < 5:
        is_relevant = False

    # Request to add test code, no impact on the reviewed code
    elif ('test' in comment and comment_size < 5) or ('add' in comment and 'test' in comment):
        is_relevant = False

    # Request for clarification
    elif (any(word in comment for word in ['please explain', 'explan', 'wat', 'what']) and comment_size < 5) or \
         (any(word in comment for word in ['understand', 'meant']) and 'not sure' in comment):
        is_relevant = False

    # Refers to previous comment or external resource with unclear action point
    elif (any(word in comment for word in ['same as', 'same remark', 'said above', 'do the same']) and comment_size < 5):
        is_relevant = False

    # Refers to web pages
    elif (any(word in comment for word in ['like', 'see']) and 'http' in comment):
        is_relevant = False

    # Request to add comment
    elif any(word in comment for word in ['document', 'javadoc', 'comment']):
        is_relevant = False

    # Feedback about reorganizing the PR
    elif 'pr' in comment and comment_size < 5:
        is_relevant = False

    # Comment contains a +1 to support previous comment. It may be accompanied by another word, like agree or a smile.
    elif '+1' in comment and comment_size < 3:
        is_relevant = False

    # The code is ok for now
    elif 'for now' in comment and comment_size < 5:
        is_relevant = False

    # Answers
    elif any(word in comment for word in ['fixed', 'thank', 'youre right']) and comment_size < 3:
        is_relevant = False

    return is_relevant

In [81]:
import nltk
from nltk.corpus import stopwords

# Download the NLTK stopwords if you haven't already
nltk.download('stopwords')

# Define the stopwords as a list
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [82]:
# Apply the relevance logic to label your data
train_labels = [1 if is_comment_relevant(comment) else 0 for comment in train_df['comment']]
val_labels = [1 if is_comment_relevant(comment) else 0 for comment in val_df['comment']]
test_labels = [1 if is_comment_relevant(comment) else 0 for comment in test_df['comment']]

# 1 indicates a relevant comment, 0 indicates a non-relevant comment

# Example usage:
print(train_labels)  

[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [83]:
from torch.utils.data import Dataset

class CodeReviewDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
    
train_dataset = CodeReviewDataset(train_tokens, train_labels)
val_dataset = CodeReviewDataset(val_tokens, val_labels)
test_dataset = CodeReviewDataset(test_tokens, test_labels)

from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


from transformers import AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    # Training
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    
    # Validation
    model.eval()
    for batch in val_loader:
        pass

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [85]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Define a function to calculate evaluation metrics
def evaluate_model(model, data_loader, device):
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
            label_ids = labels.to('cpu').numpy()

            predictions.extend(preds)
            true_labels.extend(label_ids)

    # Calculate metrics
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')
    return accuracy, precision, recall, f1

In [86]:
# Evaluate on validation set
val_accuracy, val_precision, val_recall, val_f1 = evaluate_model(model, val_loader, device)
print(f"Validation - Accuracy: {val_accuracy}, Precision: {val_precision}, Recall: {val_recall}, F1 Score: {val_f1}")

# Save the model
torch.save(model.state_dict(), 'roberta_code_review_model.pth')

# Evaluate on test set
test_accuracy, test_precision, test_recall, test_f1 = evaluate_model(model, test_loader, device)
print(f"Test - Accuracy: {test_accuracy}, Precision: {test_precision}, Recall: {test_recall}, F1 Score: {test_f1}")

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  _warn_prf(average, modifier, msg_start, len(result))


Validation - Accuracy: 0.9744037230948226, Precision: 0.9494626155810517, Recall: 0.9744037230948226, F1 Score: 0.9617715004029628


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Test - Accuracy: 0.9767306573589296, Precision: 0.9540027770248068, Recall: 0.9767306573589296, F1 Score: 0.9652329450886656


  _warn_prf(average, modifier, msg_start, len(result))


In [95]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, classification_report

X_train, y_train = train_df['comment'], train_labels
X_val, y_val = val_df['comment'], val_labels
X_test, y_test = test_df['comment'], test_labels

In [88]:
# Convert text data into TF-IDF features
vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

In [96]:
# Define the models
models = {
    "RandomForest": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(),
    "DecisionTree": DecisionTreeClassifier(),
    "NaiveBayes": MultinomialNB(),
    "SVM": SVC()
}

In [97]:
# Train and evaluate each model
for name, model in models.items():
    # Training
    model.fit(X_train_tfidf, y_train)

    # Validation
    y_val_pred = model.predict(X_val_tfidf)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print(f"{name} Validation Accuracy: {val_accuracy}")
    report = classification_report(y_val, y_val_pred)
    print(f"Classification Report for {model}:")
    print(report)

RandomForest Validation Accuracy: 0.9773123909249564
Classification Report for RandomForestClassifier():
              precision    recall  f1-score   support

           0       0.73      0.18      0.29        44
           1       0.98      1.00      0.99      1675

    accuracy                           0.98      1719
   macro avg       0.85      0.59      0.64      1719
weighted avg       0.97      0.98      0.97      1719

LogisticRegression Validation Accuracy: 0.9744037230948226
Classification Report for LogisticRegression():
              precision    recall  f1-score   support

           0       0.50      0.02      0.04        44
           1       0.97      1.00      0.99      1675

    accuracy                           0.97      1719
   macro avg       0.74      0.51      0.52      1719
weighted avg       0.96      0.97      0.96      1719

DecisionTree Validation Accuracy: 0.9738219895287958
Classification Report for DecisionTreeClassifier():
              precision    re

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SVM Validation Accuracy: 0.9749854566608493
Classification Report for SVC():
              precision    recall  f1-score   support

           0       1.00      0.02      0.04        44
           1       0.97      1.00      0.99      1675

    accuracy                           0.97      1719
   macro avg       0.99      0.51      0.52      1719
weighted avg       0.98      0.97      0.96      1719

