In [8]:
import tree_sitter_python as tspython
from tree_sitter import Language, Parser

# Wrap the grammar handle returned by tree_sitter_python in a tree-sitter Language object.
PY_LANGUAGE = Language(tspython.language())

In [None]:
from collections import Counter

import pandas as pd
import torch
from sklearn.model_selection import train_test_split

# Set up a tree-sitter parser for the PY_LANGUAGE grammar
parser = Parser(PY_LANGUAGE)

# Toy corpus of (source bytes, binary label) pairs (e.g., 0 = clean, 1 = vulnerable).
# The snippets are bytes literals because they are passed directly to parser.parse().
samples = [
    (b"def add(a, b): return a + b", 0),
    (b"def eval_input(): eval(input())", 1),  # potentially dangerous use
]

# Accumulator for every distinct node type found while walking the sample trees.
# It starts out as a set and is later rebound to a sorted list.
node_types = set()


def walk_tree(node, types):
    """Append the type of ``node`` and of every descendant to ``types`` in pre-order.

    The traversal keeps its own stack instead of recursing, so a very deep
    syntax tree (for example a long chain of nested expressions) cannot
    exceed Python's recursion limit.

    Args:
        node: Root of the subtree to visit. Only its ``type`` and ``children``
            attributes are read.
        types: List that is extended in place, one entry per visited node.

    Returns:
        None. The result is delivered through ``types``.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        types.append(current.type)
        # Push children right-to-left so the leftmost child is popped first;
        # this reproduces the parent-first, left-to-right visiting order.
        pending.extend(reversed(current.children))


# Vocabulary pass: parse each sample and pool the node types found in its tree
for code, _ in samples:
    tree = parser.parse(code)
    types = []
    walk_tree(tree.root_node, types)
    node_types.update(types)

# Freeze the vocabulary: sorting fixes the position of every node type in the
# feature vector, and type_to_idx records that position per type.
node_types = list(node_types)
node_types.sort()
type_to_idx = dict(zip(node_types, range(len(node_types))))


def code_to_feature_vector(code):
    """Turn one code snippet into a vector of syntax-node-type counts.

    The snippet is parsed with the module-level ``parser``; every node of the
    resulting tree is visited and its type is tallied. The output has one
    entry per element of the module-level ``node_types`` list, in that order.
    Node types that are not in ``node_types`` are ignored, and listed types
    that do not occur in the snippet count as 0.

    Args:
        code: Source code as ``bytes`` or as ``str``. A ``str`` is encoded as
            UTF-8 before parsing, so text read from a DataFrame column can be
            passed directly.

    Returns:
        A 1-D ``torch.float32`` tensor of length ``len(node_types)``.
    """
    if isinstance(code, str):
        # parser.parse() takes bytes; accept text as well by encoding it.
        code = code.encode("utf-8")
    tree = parser.parse(code)
    types = []
    walk_tree(tree.root_node, types)
    counts = Counter(types)
    # Counter returns 0 for missing keys without inserting them.
    feature_vector = [counts[typ] for typ in node_types]
    return torch.tensor(feature_vector, dtype=torch.float32)


# Generated solutions: keep (spec, solution) as (task, text) and label them generated = 1
df_gen = (
    pd.read_csv("../../data/generated/gen_solutions.csv")[["spec", "solution"]]
    .rename(columns={"spec": "task", "solution": "text"})
    .assign(generated=1)
)

# Attempts from the database export: keep (task, programText) as (task, text), generated = 0
df_real = (
    pd.read_csv("../../data/db_attempts.csv")[["task", "programText"]]
    .rename(columns={"programText": "text"})
    .assign(generated=0)
)

# Combine both sources, drop rows without text, and remove exact duplicate rows.
# drop_duplicates() returns a new frame, so its result has to be kept; it runs
# before the id column exists, otherwise the unique ids would hide duplicates.
# ids are taken from the pre-reset index, so they are unique but may have gaps.
df = (
    pd.concat([df_gen, df_real], axis=0, ignore_index=True)
    .dropna(subset=["text"])
    .astype({"text": str})
    .drop_duplicates()
    .assign(id=lambda frame: frame.index + 1)
    .reset_index(drop=True)
)

# Fixed seed so the stratified splits are the same on every run
SPLIT_SEED = 42

# 80% train; the remaining 20% is split 75/25 into validation and test
# (15% and 5% of the full data), each split stratified on the label.
train_df, val_prep = train_test_split(
    df, test_size=0.2, stratify=df["generated"], random_state=SPLIT_SEED
)
valid_df, test_df = train_test_split(
    val_prep,
    test_size=0.25,
    stratify=val_prep["generated"],
    random_state=SPLIT_SEED,
)
train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# NOTE: X and y are built from the toy `samples` list, not from the DataFrames above.
X = torch.stack([code_to_feature_vector(code) for code, _ in samples])
y = torch.tensor([label for _, label in samples], dtype=torch.long)

In [12]:
# Display the node-type vocabulary (one feature-vector entry per listed type)
node_types

['(',
 ')',
 '+',
 ',',
 ':',
 'argument_list',
 'binary_operator',
 'block',
 'call',
 'def',
 'expression_statement',
 'function_definition',
 'identifier',
 'module',
 'parameters',
 'return',
 'return_statement']

In [10]:
import torch.nn as nn
import torch.optim as optim


class CodeClassifier(nn.Module):
    """Two-layer perceptron that maps a feature vector to two class logits."""

    def __init__(self, input_dim, hidden_dim=32):
        """Create the layers.

        Args:
            input_dim: Length of each input feature vector.
            hidden_dim: Width of the hidden layer.
        """
        super().__init__()
        # Input -> hidden projection, non-linearity, hidden -> 2 logits.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, 2)  # binary classification

    def forward(self, x):
        """Return the raw (un-normalised) logits for ``x``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        logits = self.fc2(activated)
        return logits


# Classifier sized to the node-type vocabulary, trained with cross-entropy and Adam
model = CodeClassifier(input_dim=len(node_types))
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Full-batch training: each epoch is a single gradient step on all of X
for epoch in range(30):
    optimizer.zero_grad()  # clear gradients left over from the previous step
    outputs = model(X)
    loss = criterion(outputs, y)
    loss.backward()
    optimizer.step()

# Loss of the last epoch's forward pass (computed before the final parameter update)
print(f"Final Loss: {loss.item():.4f}")

Final Loss: 0.0004
