# Solution based on AST

In [None]:
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import tree_sitter_python as tspython
from sklearn.metrics import (
    f1_score,
    mean_absolute_error,
    mean_squared_error,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from tree_sitter import Language, Parser

from transformers import T5Tokenizer, T5ForConditionalGeneration

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree

from sklearn.metrics import recall_score, roc_auc_score, f1_score, mean_squared_error, mean_absolute_error

import uuid
import matplotlib.pyplot as plt


PY_LANGUAGE = Language(tspython.language())
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

  from .autonotebook import tqdm as notebook_tqdm


device(type='cpu')

## Utils

In [None]:
def set_seed(seed: int = 420):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def calculate_metrics_classifier(model:str, depth, true_label, predicted):

    f1 = f1_score(true_label, predicted)
    roc_auc = roc_auc_score(true_label, predicted)
    recall = recall_score(true_label, predicted)
    # mse = mean_squared_error(true_label, predicted)
    # mae = mean_absolute_error(true_label, predicted)

    print(f"Model : {model} with depth {depth} | F1 : {f1} | ROC/AUC : {roc_auc} | RECALL : {recall} ")

def calculate_metrics_regressor(model:str, depth, true_label, predicted):

    # f1 = f1_score(true_label, predicted)
    # roc_auc = roc_auc_score(true_label, predicted)
    # recall = recall_score(true_label, predicted)
    mse = mean_squared_error(true_label, predicted)
    mae = mean_absolute_error(true_label, predicted)

    print(f"Model : {model} with depth {depth} | MAE : {mae} | MSE : {mse}  ")


def uuid_to_int(uuid_str):
    return uuid.UUID(uuid_str).int % (2**64) 




## Data Loading

In [3]:
set_seed()
df = pd.read_csv("../../data/generated/dataset.csv")

print(f"Total size: {len(df)}\n")

train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["generated"])

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")

Total size: 12428

Train size: 9942
Test size: 2486


Check that the data is balanced

In [4]:
print(f"Mean generated (train): {train_df['generated'].mean()}")
print(f"Mean generated (test): {test_df['generated'].mean()}")

Mean generated (train): 0.5211225105612552
Mean generated (test): 0.5213193885760258


## Dataset building

In [5]:
parser = Parser(PY_LANGUAGE)
node_types = set()


def walk_tree(node, types):
    types.append(node.type)
    for child in node.children:
        walk_tree(child, types)


def code_to_feature_vector(code, device=DEVICE) :
    code = code.encode('utf-8')
    tree = parser.parse(code)
    types = []
    walk_tree(tree.root_node, types)
    counts = Counter(types)
    feature_vector = [counts.get(typ, 0) for typ in node_types]
    
    return feature_vector


# Gather all node types
for _, row in train_df.iterrows():
    tree = parser.parse(str.encode(row["code"]))
    types = []
    walk_tree(tree.root_node, types)
    node_types.update(types)

node_types = sorted(node_types)
type_to_idx = {typ: i for i, typ in enumerate(node_types)}

Save node types for inference

In [6]:
with open("../../data/ast/node_types.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(node_types))

In [7]:
dataset_train = train_df
dataset_test = test_df

In [8]:
for df, i in zip([train_df, test_df], [1, 2]):
    features_df = pd.DataFrame(df['code'].apply(code_to_feature_vector).apply(pd.Series))
    df.drop('code', axis=1, inplace=True)
    df = pd.concat([df, features_df], axis=1)

    if i == 1:
        dataset_train = df

    else:
        dataset_test = df

In [9]:
dataset_train_target = dataset_train['generated']
dataset_train = dataset_train.drop(['generated'], axis=1) 


dataset_test_target = dataset_test['generated']
dataset_test = dataset_test.drop(['generated'], axis=1)

In [10]:
dataset_train['task'] =  dataset_train['task'].apply(uuid_to_int)
dataset_test['task'] = dataset_test['task'].apply(uuid_to_int)

In [11]:
dataset_test.columns = dataset_test.columns.astype(str)

dataset_train.columns = dataset_train.columns.astype(str)

In [12]:
full_dataset = pd.concat([dataset_test, dataset_train])
full_target = pd.concat([dataset_test_target, dataset_train_target])

## Model definition

In [13]:
RFC = RandomForestClassifier(n_estimators=20)
RFR = RandomForestRegressor(n_estimators=20)

DTC = DecisionTreeClassifier()
DTR = DecisionTreeRegressor()

## Training

In [14]:
RFC.fit(dataset_train, dataset_train_target)

pred_val = RFC.predict(dataset_test)

calculate_metrics_classifier("Random Forest Classifier",pred_val, dataset_test_target)

Model : Random Forest Classifier | F1 : 0.8595679012345679 | ROC/AUC : 0.85331336238199 | RECALL : 0.8595679012345679 


In [15]:
RFR.fit(dataset_train, dataset_train_target)

pred_val = RFR.predict(dataset_test)

calculate_metrics_regressor("Random Forest Regressor", pred_val, dataset_test_target)

Model : Random Forest Regressor | MAE : 0.23202935865554866 | MSE : 0.11094360495046442  


## Testing

In [16]:
DTC.fit(dataset_train, dataset_train_target)

pred_val = DTC.predict(dataset_test)

calculate_metrics_classifier("Decision Tree Classifier", pred_val, dataset_test_target)

Model : Decision Tree Classifier | F1 : 0.8098207326578332 | ROC/AUC : 0.803380776004973 | RECALL : 0.8181102362204724 


In [17]:
DTR.fit(dataset_train, dataset_train_target)

pred_val = DTR.predict(dataset_test)

calculate_metrics_regressor("Decision Tree Regressor", pred_val, dataset_test_target)

Model : Decision Tree Regressor | MAE : 0.19591178901665451 | MSE : 0.18174165604474157  


In [None]:
node_types.insert(0, 'task')

In [None]:
def plot_dtree(tree, model_name, depth):
  plt.figure(figsize=(20,10))
  plot_tree(tree, 
            feature_names=node_types, 
            class_names=['1','0'], 
            rounded=True,  
            max_depth=depth,
            filled=True)
  
  plt.savefig(f"../../data/picture/{model_name}_with_depth_{depth}.svg")  
  plt.close() 

In [76]:
def compare_depth(model, max_depth:int, classifier: bool = True):
    for depth in range(2, max_depth+1):
        model.max_depth = depth

        model.fit(dataset_train, dataset_train_target)

        pred_val = model.predict(dataset_test)
        if classifier:
            model_name = "Decision Tree Classifier"
            calculate_metrics_classifier(model_name, depth, pred_val, dataset_test_target)
        else:
            model_name = "Decision Tree Regressor"
            calculate_metrics_regressor(model_name,depth,  pred_val, dataset_test_target)

        plot_dtree(model, model_name, depth)

In [77]:
RFC_dd = RandomForestClassifier(n_estimators=20)
RFR_dd = RandomForestRegressor(n_estimators=20)

DTC_dd = DecisionTreeClassifier()
DTR_dd = DecisionTreeRegressor()

In [78]:
compare_depth(DTC_dd, 10, False)

Model : Decision Tree Classifier with depth 2 | F1 : 0.6906593406593406 | ROC/AUC : 0.6308074556554343 | RECALL : 0.5362627986348123 
Model : Decision Tree Classifier with depth 3 | F1 : 0.64375 | ROC/AUC : 0.5880681818181818 | RECALL : 0.5852272727272727 
Model : Decision Tree Classifier with depth 4 | F1 : 0.6461312797946461 | ROC/AUC : 0.6111442301641712 | RECALL : 0.6156533892382949 
Model : Decision Tree Classifier with depth 5 | F1 : 0.650805270863836 | ROC/AUC : 0.6157308661626211 | RECALL : 0.6190807799442897 
Model : Decision Tree Classifier with depth 6 | F1 : 0.6455493183640738 | ROC/AUC : 0.6453710376507429 | RECALL : 0.6719532554257095 
Model : Decision Tree Classifier with depth 7 | F1 : 0.66796875 | ROC/AUC : 0.6577701267894509 | RECALL : 0.6764240506329114 
Model : Decision Tree Classifier with depth 8 | F1 : 0.6862442040185471 | ROC/AUC : 0.6727989794172099 | RECALL : 0.6873065015479877 
Model : Decision Tree Classifier with depth 9 | F1 : 0.697585281717133 | ROC/AUC :

In [79]:
compare_depth(DTR_dd, 10, False)

Model : Decision Tree Regressor with depth 2 | MAE : 0.47215734296690737 | MSE : 0.2390081422403207  
Model : Decision Tree Regressor with depth 3 | MAE : 0.45933385912814995 | MSE : 0.23330632788582395  
Model : Decision Tree Regressor with depth 4 | MAE : 0.4475400315300263 | MSE : 0.22796712998766205  
Model : Decision Tree Regressor with depth 5 | MAE : 0.43586947286415123 | MSE : 0.2239957107289368  
Model : Decision Tree Regressor with depth 6 | MAE : 0.4211537148443366 | MSE : 0.21820743134274243  
Model : Decision Tree Regressor with depth 7 | MAE : 0.4098182878124214 | MSE : 0.2166879699921979  
Model : Decision Tree Regressor with depth 8 | MAE : 0.3971072461244457 | MSE : 0.2147967503388616  
Model : Decision Tree Regressor with depth 9 | MAE : 0.38221743474014896 | MSE : 0.211329675285538  
Model : Decision Tree Regressor with depth 10 | MAE : 0.36258203793549204 | MSE : 0.2049071493984801  
