In [17]:
import pandas as pd 
import re
import transformers
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig



In [10]:
# Load the raw spreadsheet of test cases from the working directory.
data = pd.read_excel(
    'flaky_data.xlsx',
    index_col=False,
)

In [11]:
# Keep only the three columns the rest of the notebook reads, and give the
# test-source column the shorter name 'text'.
df = pd.DataFrame(
    data,
    columns=['Language', 'label', 'test case content'],
).rename(columns={'test case content': 'text'})

# Placeholder column: assigning an empty Series leaves every row's value missing.
df['tokens'] = pd.Series()

In [12]:
# Number of samples per programming language.
df['Language'].value_counts()

Language
Java      1904
Python     210
go          90
C++         78
JS          61
PHP          2
Name: count, dtype: int64

In [13]:
# Remove the PHP rows from `df` in place, then show the refreshed
# per-language counts.
df.drop(index=df.loc[df['Language'].eq('PHP')].index, inplace=True)
df['Language'].value_counts()

Language
Java      1904
Python     210
go          90
C++         78
JS          61
Name: count, dtype: int64

In [14]:
# Class balance: number of samples per label.
df['label'].value_counts()

label
NonFlaky    1321
Flaky       1022
Name: count, dtype: int64

In [15]:
### Per-language subsets of the dataset

# Single-language subsets.
java_df = df.loc[df['Language'].eq('Java')]
python_df = df.loc[df['Language'].eq('Python')]
go_df = df.loc[df['Language'].eq('go')]
cpp_df = df.loc[df['Language'].eq('C++')]
js_df = df.loc[df['Language'].eq('JS')]

# Multi-language combinations.
java_js_df = df.loc[df['Language'].isin(['Java', 'JS'])]
java_py_cpp = df.loc[df['Language'].isin(['Java', 'Python', 'C++'])]

# Every row whose language is not Java.
no_java = df.loc[~df['Language'].isin(['Java'])]

In [None]:
def classification(data, model):
    """Ask a causal language model for a yes/no verdict on every row of ``data``.

    Each row's ``text`` is tokenized and passed to ``generate``; the first
    standalone "yes" or "no" (case-insensitive) found in the newly generated
    tokens becomes that row's prediction.

    Args:
        data: DataFrame with a ``text`` column (fed to the model as the
            prompt) and a ``label`` column (stored unchanged as ground truth).
        model: Checkpoint name or path accepted by ``from_pretrained``. It is
            used to load both the language model and its tokenizer.

    Returns:
        A tuple ``(predictions, yes_prob, gt, n_skipped_functions)``:
        ``predictions`` is a list of ``'yes'``/``'no'`` strings, ``yes_prob``
        is a parallel list of booleans (``True`` when the prediction is
        ``'yes'``), ``gt`` is the parallel list of ``label`` values, and
        ``n_skipped_functions`` counts rows that produced no prediction
        (unusable text, or no yes/no in the completion).
    """
    # Keep the loaded network under its own name: the tokenizer must be loaded
    # from the checkpoint identifier, not from the model object.
    tokenizer = AutoTokenizer.from_pretrained(model)
    lm = AutoModelForCausalLM.from_pretrained(model)
    lm.eval()

    predictions = []
    yes_prob = []
    gt = []
    n_skipped_functions = 0

    # NOTE(review): sampling is enabled, so answers can differ between runs;
    # seed torch or disable sampling if reproducible predictions are needed.
    generation_config = GenerationConfig(
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        repetition_penalty=1.2,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        min_new_tokens=1,
        max_new_tokens=3,
    )

    for text, label in tqdm(zip(data['text'], data['label']), total=len(data)):
        # Missing cells arrive as NaN (a float); the tokenizer needs real text.
        if not isinstance(text, str) or not text.strip():
            n_skipped_functions += 1
            continue

        # One sequence per call, so no padding is requested; this also avoids
        # depending on the tokenizer having a pad token.
        inputs = tokenizer(text, return_tensors="pt", truncation=True).to(lm.device)
        prompt_length = inputs['input_ids'].shape[1]

        with torch.no_grad():
            outputs = lm.generate(**inputs, generation_config=generation_config)

        # A causal LM returns prompt + completion. Decode only the completion,
        # otherwise a "yes"/"no" inside the prompt would be read as the answer.
        completion = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )

        answer = re.search(r'\b(yes|no)\b', completion, re.IGNORECASE)
        if answer is None:
            n_skipped_functions += 1
            continue

        pred_label = answer.group(0).lower()
        predictions.append(pred_label)
        yes_prob.append(pred_label == 'yes')
        gt.append(label)

    return predictions, yes_prob, gt, n_skipped_functions
            
  
        
    
    
    
    
    