In [27]:
from datasets import load_dataset, Dataset
import pandas as pd

# Load the 'train' split of the "christopher/rosetta-code" dataset and convert
# it to a pandas DataFrame; later cells filter this frame by its
# 'language_name' and 'task_name' columns.
ds = load_dataset("christopher/rosetta-code", split="train")
df = ds.to_pandas()

In [152]:
def get_language_subsets(df: pd.DataFrame, lang1: str, lang2: str):
    """Return the rows of two languages restricted to the tasks they share.

    Args:
        df: Frame with at least 'language_name' and 'task_name' columns.
        lang1: Value of 'language_name' selecting the first language.
        lang2: Value of 'language_name' selecting the second language.

    Returns:
        A tuple ``(subset1, subset2)``. ``subset1`` holds the ``lang1`` rows
        whose 'task_name' also occurs among the ``lang2`` rows, and
        ``subset2`` is the mirror image. Both are sorted by 'task_name' and
        exclude the task literally named 'Comments'.
    """
    language_column = df['language_name']
    rows_lang1 = df.loc[language_column == lang1]
    rows_lang2 = df.loc[language_column == lang2]

    def _shared_tasks(own_rows: pd.DataFrame, other_rows: pd.DataFrame) -> pd.DataFrame:
        # Keep only tasks that the other language implements as well.
        in_both = own_rows['task_name'].isin(other_rows['task_name'])
        ordered = own_rows.loc[in_both].sort_values('task_name')
        # Drop the task whose name is 'Comments' (a task, not code comments).
        return ordered.loc[ordered['task_name'] != 'Comments']

    return _shared_tasks(rows_lang1, rows_lang2), _shared_tasks(rows_lang2, rows_lang1)


def select_code_clones(df: pd.DataFrame, lang1: str, lang2: str, n_clones=10, random_state=0) -> pd.DataFrame:
    """Sample cross-language code pairs that implement the same task.

    Args:
        df: Frame with 'language_name', 'task_name' and 'code' columns.
        lang1: Language whose code ends up in the 'code_1' column.
        lang2: Language whose code ends up in the 'code_2' column.
        n_clones: Number of tasks (and therefore pairs) to sample.
        random_state: Seed forwarded to ``DataFrame.sample``.

    Returns:
        A frame with columns 'task_name', 'code_1' and 'code_2', one row per
        sampled task, sorted by 'task_name' with a fresh 0..n-1 index.

    Raises:
        ValueError: If fewer than ``n_clones`` shared tasks are available.
    """
    subset_1, subset_2 = get_language_subsets(df, lang1, lang2)

    # Keep a single row per task on each side. Without this, a task that has
    # several rows for one language could be sampled more than once, and the
    # merge below would emit the cartesian product of the matching rows, so
    # the result would no longer be one pair per sampled task.
    # When tasks are already unique this is a no-op, so sampling is unchanged.
    subset_1 = subset_1.drop_duplicates(subset='task_name')
    subset_2 = subset_2.drop_duplicates(subset='task_name')

    # Fail with an explicit message instead of pandas' generic sampling error.
    if n_clones is not None and n_clones > len(subset_1):
        raise ValueError(
            f"Requested {n_clones} clone pairs for {lang1!r}/{lang2!r}, "
            f"but only {len(subset_1)} shared tasks are available."
        )

    sampled_1 = subset_1.sample(n=n_clones, random_state=random_state)
    sampled_2 = subset_2[subset_2['task_name'].isin(sampled_1['task_name'])]

    sampled_1 = sampled_1[['task_name', 'code']].rename(columns={'code': 'code_1'})
    sampled_2 = sampled_2[['task_name', 'code']].rename(columns={'code': 'code_2'})

    # validate= asserts the one-row-per-task invariant established above.
    combined_df = pd.merge(sampled_1, sampled_2, on='task_name', validate='one_to_one')

    combined_df = combined_df.sort_values('task_name').reset_index(drop=True)

    return combined_df


In [153]:
# Build one sampled clone set per language pair, using the default
# n_clones and random_state of select_code_clones.
java_fortran_clones = select_code_clones(df, lang1='Java', lang2='Fortran')
python_cobol_clones = select_code_clones(df, lang1='Python', lang2='COBOL')
js_pascal_clones = select_code_clones(df, lang1='JavaScript', lang2='Free Pascal')

In [2]:
# Experimental setup

# Reference links:
# - Salesforce/codet5p-770m on the Hugging Face Hub: https://huggingface.co/Salesforce/codet5p-770m
# - ZC3 GitHub repository: https://github.com/lairikeqiA/ZC3

from transformers import RobertaTokenizer, RobertaModel

# Load the tokenizer and the model from the same pretrained checkpoint.
# NOTE(review): the links at the top of this cell point to CodeT5+ and ZC3,
# while the checkpoint loaded here is CodeBERT — confirm the intended model.
CODEBERT_CHECKPOINT = "microsoft/codebert-base"
tokenizer = RobertaTokenizer.from_pretrained(CODEBERT_CHECKPOINT)
model = RobertaModel.from_pretrained(CODEBERT_CHECKPOINT)

In [None]:
# Zero-shot evaluation