# Solution based on AST (on old dataset)

In [35]:
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import tree_sitter_python as tspython
from sklearn.metrics import (
    f1_score,
    mean_absolute_error,
    mean_squared_error,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from tree_sitter import Language, Parser

PY_LANGUAGE = Language(tspython.language())
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda')

## Utils

In [36]:
def set_seed(seed: int = 420):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

## Data Loading

In [37]:
set_seed()
df = pd.read_csv("../../data/generated/dataset_old.csv")

print(f"Total size: {len(df)}\n")

train_df, val_prep = train_test_split(df, test_size=0.3, stratify=df["generated"])
valid_df, test_df = train_test_split(
    val_prep, test_size=0.3, stratify=val_prep["generated"]
)
train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print(f"Train size: {len(train_df)}")
print(f"Validation size: {len(valid_df)}")
print(f"Test size: {len(test_df)}")

Total size: 9251

Train size: 6475
Validation size: 1943
Test size: 833


Check that the data is balanced

In [38]:
print(f"Mean generated (train): {train_df['generated'].mean()}")
print(f"Mean generated (validation): {valid_df['generated'].mean()}")
print(f"Mean generated (test): {test_df['generated'].mean()}")

Mean generated (train): 0.3567567567567568
Mean generated (validation): 0.3566649511065363
Mean generated (test): 0.3565426170468187


## Dataset building

In [39]:
parser = Parser(PY_LANGUAGE)
node_types = set()


def walk_tree(node, types):
    types.append(node.type)
    for child in node.children:
        walk_tree(child, types)


def code_to_feature_vector(code: bytes, device=DEVICE) -> torch.Tensor:
    tree = parser.parse(code)
    types = []
    walk_tree(tree.root_node, types)
    counts = Counter(types)
    feature_vector = [counts.get(typ, 0) for typ in node_types]
    return torch.tensor(feature_vector, dtype=torch.float32, device=device)


# Gather all node types
for _, row in train_df.iterrows():
    tree = parser.parse(str.encode(row["code"]))
    types = []
    walk_tree(tree.root_node, types)
    node_types.update(types)

node_types = sorted(node_types)
type_to_idx = {typ: i for i, typ in enumerate(node_types)}

Save node types for inference

In [40]:
with open("../../data/ast/node_types_old.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(node_types))

In [41]:
class ASTDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __getitem__(self, index) -> tuple[str, torch.Tensor, float]:
        raw_code = self.dataframe["code"].iloc[index]
        return (
            raw_code,
            code_to_feature_vector(raw_code.encode("utf-8")),
            float(self.dataframe["generated"].iloc[index]),
        )

    def __len__(self):
        return len(self.dataframe)


data_train = ASTDataset(dataframe=train_df)
dataloader_train = DataLoader(data_train, batch_size=32)
data_val = ASTDataset(dataframe=valid_df)
dataloader_val = DataLoader(data_val, batch_size=128)
data_test = ASTDataset(dataframe=test_df)
dataloader_test = DataLoader(data_test, batch_size=128)

## Model definition

In [42]:
class AIDetector(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int = 32):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.LeakyReLU(),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid(),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.layers(x)

## Training

In [43]:
def compute_metrics(predictions: list[float], labels: list[float]) -> dict:
    predictions_rounded = [round(x) for x in predictions]
    labels_rounded = [round(x) for x in labels]

    return {
        "recall": recall_score(labels_rounded, predictions_rounded),
        "roc_auc": roc_auc_score(labels_rounded, predictions_rounded),
        "f1": f1_score(labels_rounded, predictions_rounded),
        "mae": mean_absolute_error(labels, predictions),
        "mse": mean_squared_error(labels, predictions),
    }


def metrics_str(metrics: dict) -> str:
    return " | ".join([f"{key.upper()}: {value:.4f}" for key, value in metrics.items()])


def train_model(
    model: nn.Module, dataloader: DataLoader, criterion, optimizer: optim.Optimizer
):
    losses = []
    model.train()
    for _, code, label in tqdm(dataloader, desc="Training"):
        code, label = code.float().to(DEVICE), label.float().to(DEVICE)
        outputs = model(code)
        outputs = outputs.squeeze()
        loss = criterion(outputs, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    return np.mean(losses)


def evaluate_model(model: nn.Module, dataloader: DataLoader) -> dict:
    model.eval()
    with torch.no_grad():
        all_predictions = []
        all_truths = []
        for real_code, code, label in tqdm(dataloader, desc="Validation"):
            code, label = code.float().to(DEVICE), label.float().to(DEVICE)
            outputs = model(code)
            outputs = outputs.squeeze()

            all_predictions.extend(outputs.detach().cpu().numpy().tolist())
            all_truths.extend(label.detach().cpu().numpy().tolist())

        return compute_metrics(all_predictions, all_truths)


def train_eval_loop(
    model: nn.Module,
    dataloader_train: DataLoader,
    dataloader_val: DataLoader,
    criterion,
    optimizer: optim.Optimizer,
    epochs: int = 5,
    early_stopping: int = 3,
    maximize: str = "recall",
    save_path: str = "../../data/ast/best_model_old.pth",
):
    best_score = 0 if maximize == "recall" else float("inf")
    no_improvement = 0
    mean_losses = []
    for epoch in range(1, epochs + 1):
        print(f"Epoch {epoch}/{epochs}")
        mean_loss = train_model(model, dataloader_train, criterion, optimizer)
        metrics = evaluate_model(model, dataloader_val)

        mean_losses.append(mean_loss)
        print(f"\n{metrics_str(metrics)}\n")

        score = metrics[maximize]
        if (maximize == "recall" and score > best_score) or (
            maximize == "mae" and score < best_score
        ):
            no_improvement = 0
            best_score = score
            torch.save(model.state_dict(), save_path)
        else:
            no_improvement += 1
        if no_improvement >= early_stopping:
            print("Early stopping triggered.")
            break

    return mean_losses

In [44]:
set_seed()
model = AIDetector(input_dim=len(node_types)).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-2)

losses = train_eval_loop(
    model,
    dataloader_train,
    dataloader_val,
    criterion,
    optimizer,
    epochs=10,
    early_stopping=5,
    maximize="recall",
)

Epoch 1/10


Training: 100%|██████████| 203/203 [00:05<00:00, 35.39it/s]
Validation: 100%|██████████| 16/16 [00:01<00:00,  9.47it/s]



RECALL: 0.4545 | ROC_AUC: 0.6921 | F1: 0.5748 | MAE: 0.2554 | MSE: 0.2126

Epoch 2/10


Training: 100%|██████████| 203/203 [00:06<00:00, 30.90it/s]
Validation: 100%|██████████| 16/16 [00:01<00:00,  9.84it/s]



RECALL: 0.5094 | ROC_AUC: 0.7091 | F1: 0.6086 | MAE: 0.2435 | MSE: 0.1991

Epoch 3/10


Training: 100%|██████████| 203/203 [00:07<00:00, 28.85it/s]
Validation: 100%|██████████| 16/16 [00:02<00:00,  7.80it/s]



RECALL: 0.5830 | ROC_AUC: 0.7099 | F1: 0.6211 | MAE: 0.2683 | MSE: 0.2149

Epoch 4/10


Training: 100%|██████████| 203/203 [00:06<00:00, 29.98it/s]
Validation: 100%|██████████| 16/16 [00:01<00:00,  9.63it/s]



RECALL: 0.6205 | ROC_AUC: 0.7202 | F1: 0.6380 | MAE: 0.2660 | MSE: 0.2082

Epoch 5/10


Training: 100%|██████████| 203/203 [00:07<00:00, 28.40it/s]
Validation: 100%|██████████| 16/16 [00:01<00:00, 12.29it/s]



RECALL: 0.5325 | ROC_AUC: 0.7134 | F1: 0.6181 | MAE: 0.2449 | MSE: 0.1973

Epoch 6/10


Training: 100%|██████████| 203/203 [00:05<00:00, 34.46it/s]
Validation: 100%|██████████| 16/16 [00:01<00:00,  8.80it/s]



RECALL: 0.5945 | ROC_AUC: 0.7213 | F1: 0.6363 | MAE: 0.2506 | MSE: 0.2025

Epoch 7/10


Training: 100%|██████████| 203/203 [00:06<00:00, 33.73it/s]
Validation: 100%|██████████| 16/16 [00:01<00:00, 10.04it/s]



RECALL: 0.5440 | ROC_AUC: 0.7272 | F1: 0.6379 | MAE: 0.2280 | MSE: 0.1902

Epoch 8/10


Training: 100%|██████████| 203/203 [00:06<00:00, 33.42it/s]
Validation: 100%|██████████| 16/16 [00:01<00:00, 10.75it/s]



RECALL: 0.4589 | ROC_AUC: 0.6910 | F1: 0.5745 | MAE: 0.2468 | MSE: 0.2121

Epoch 9/10


Training: 100%|██████████| 203/203 [00:05<00:00, 37.20it/s]
Validation: 100%|██████████| 16/16 [00:01<00:00,  8.21it/s]



RECALL: 0.7388 | ROC_AUC: 0.6990 | F1: 0.6278 | MAE: 0.3237 | MSE: 0.2490

Epoch 10/10


Training: 100%|██████████| 203/203 [00:06<00:00, 32.30it/s]
Validation: 100%|██████████| 16/16 [00:01<00:00, 10.35it/s]


RECALL: 0.7403 | ROC_AUC: 0.6813 | F1: 0.6114 | MAE: 0.3356 | MSE: 0.2752






## Testing

In [45]:
def test_model(model: nn.Module, dataloader: DataLoader) -> tuple[pd.DataFrame, dict]:
    model.eval()
    with torch.no_grad():
        all_predictions = []
        all_truths = []
        all_codes = []
        for real_code, code, label in tqdm(dataloader, desc="Validation"):
            code, label = code.float().to(DEVICE), label.float().to(DEVICE)
            outputs = model(code)
            outputs = outputs.squeeze()

            all_predictions.extend(outputs.detach().cpu().numpy().tolist())
            all_truths.extend(label.detach().cpu().numpy().tolist())
            all_codes.extend(real_code)

        test_df = pd.DataFrame(
            {"code": all_codes, "real": all_truths, "predicted": all_predictions}
        )
        return test_df, compute_metrics(all_predictions, all_truths)

In [46]:
best_model = AIDetector(input_dim=len(node_types)).to(DEVICE)
best_model.load_state_dict(torch.load("../../data/ast/best_model_old.pth"))
test_df, test_metrics = test_model(best_model, dataloader_test)
print(metrics_str(test_metrics))

test_df.to_csv("../../data/ast/test_results_old.csv")

Validation: 100%|██████████| 7/7 [00:00<00:00, 10.69it/s]

RECALL: 0.7172 | ROC_AUC: 0.6459 | F1: 0.5772 | MAE: 0.3725 | MSE: 0.3189





## Inference

In [47]:
def detect_ai_code(code: str) -> float:
    with open("../../data/ast/node_types_old.txt", "r", encoding="utf-8") as f:
        node_types_loaded = f.readlines()

    loaded_model = AIDetector(input_dim=len(node_types_loaded))
    loaded_model.load_state_dict(torch.load("../../data/ast/best_model_old.pth"))

    code_vectorized = code_to_feature_vector(
        code.encode("utf-8"), device=torch.device("cpu")
    ).unsqueeze(0)
    with torch.no_grad():
        prediction = loaded_model(code_vectorized).squeeze().cpu().item()
    return prediction

In [48]:
code1 = """
a,b = map(int, input().split())
if a > b:
    return 1
return 0
"""

code2 = """
x, y = map(int, input().split())
return int(x>y)
"""

code3 = """
l = map(int, input().split())
if l[0] > l[1] :
    return 1
else:
    return 0
"""
for c in [code1, code2, code3]:
    print(f"{'*' * 15}\nCODE:\n{c}\nPREDICTION: {detect_ai_code(c):.4f}\n")

***************
CODE:

a,b = map(int, input().split())
if a > b:
    return 1
return 0

PREDICTION: 0.9982

***************
CODE:

x, y = map(int, input().split())
return int(x>y)

PREDICTION: 0.9855

***************
CODE:

l = map(int, input().split())
if l[0] > l[1] :
    return 1
else:
    return 0

PREDICTION: 0.9903

