In [2]:
import json
import pandas as pd
import re
from tqdm import tqdm
import os

def clean_comment(s):
    """Strip comment-decoration characters from a string.

    Every '#', '-' and '=' character is deleted wherever it occurs; all
    other characters, including whitespace, are left untouched.

    Args:
        s: Text to clean, e.g. a raw comment line such as "# ---- Setup ====".

    Returns:
        The text with all '#', '-' and '=' characters removed.
    """
    # A single translation table deletes the three characters in one pass.
    return s.translate(str.maketrans("", "", "#-="))

def remove_empty_lines(file_path, encoding='latin-1'):
    """Rewrite a text file in place, dropping empty and whitespace-only lines.

    The file is read and written with the same encoding so that non-ASCII
    characters survive the round trip unchanged. Writing with a different
    encoding than the one used for reading would re-encode those characters
    and corrupt them for any later latin-1 reader.

    Args:
        file_path: Path of the file to rewrite.
        encoding: Text encoding used for both reading and writing.
            Defaults to 'latin-1', which can decode any byte sequence.

    Raises:
        OSError: If the file cannot be opened for reading or writing.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        # Keep only lines that contain something other than whitespace.
        lines = [line for line in file if line.strip() != '']

    with open(file_path, 'w', encoding=encoding) as file:
        file.writelines(lines)

def create_comment_code_pairs(file_path):
    """Extract comment -> code pairs from a script file.

    Each line is stripped of surrounding whitespace and then processed in
    order:

    * Lines starting with 'library(' are never recorded; the first one
      switches pairing on. If the file has no such line at all, pairing is
      on from the first line.
    * Consecutive lines starting with '#' are concatenated (space separated)
      and cleaned with ``clean_comment`` to form the current comment.
    * Any other line is appended (followed by a space) to the code stored
      under the current comment, provided ``has_multiple_words`` accepts that
      comment. Code lines keep attaching to the same comment until a new
      comment line appears.

    Args:
        file_path: Path of the script to parse; it is read as latin-1.

    Returns:
        dict mapping each cleaned comment string to the space-joined code
        lines that followed it. A comment that occurs more than once has all
        of its code accumulated under a single key.
    """
    with open(file_path, 'r', encoding='latin-1') as file:
        lines = [raw_line.strip() for raw_line in file]

    # Use exactly the same test as the main loop below. Checking for
    # 'library(' anywhere in the line would flag files where the call only
    # appears mid-line (e.g. wrapped in another call or commented out); the
    # loop would then never switch pairing on and the file would silently
    # produce no pairs.
    has_library_call = any(line.startswith('library(') for line in lines)

    comment_code_pairs = {}
    current_comment = ''   # raw comment text being accumulated
    actual = ''            # cleaned form of the comment currently in effect
    library_found = False

    for line in lines:
        if line.startswith('library('):
            # Found a library calling line
            library_found = True
        elif library_found or not has_library_call:
            if line.startswith('#'):
                # Comment line: extend the running comment block.
                current_comment = current_comment + line + " "
                actual = clean_comment(current_comment)
            elif actual and has_multiple_words(actual):
                # Code line following an accepted comment.
                if actual not in comment_code_pairs:
                    comment_code_pairs[actual] = ''

                comment_code_pairs[actual] += line + ' '
                # The next comment line starts a fresh comment block.
                current_comment = ''
            else:
                current_comment = ''

    return comment_code_pairs

def remove_items_without_code(json_file_path):
    """Drop entries with empty code from a JSON file, rewriting it in place.

    The file must contain a JSON list of objects. An object is removed when
    its 'code' value is exactly ``[""]`` or ``["", ""]``; every other object
    (including one with no 'code' key) is kept. The result is written back
    with an indent of 2.

    Args:
        json_file_path: Path of the JSON file to filter.
    """
    # Read the JSON file
    with open(json_file_path, 'r') as json_file:
        data = json.load(json_file)

    # Compare against each empty form separately. Combining the two lists
    # with `and` evaluates to just the second list, which lets [""] through.
    empty_code_values = ([""], ["", ""])
    data = [item for item in data if item.get('code') not in empty_code_values]

    # Write the updated JSON file
    with open(json_file_path, 'w') as json_file:
        json.dump(data, json_file, indent=2)

def has_multiple_words(text):
    """Report whether the text contains at least two alphabetic words.

    A word is a run of ASCII letters with a word boundary on both sides, so
    tokens that mix letters with digits or underscores (e.g. 'abc123',
    'foo_bar') do not count.

    Args:
        text: String to inspect.

    Returns:
        True if two or more such words are found, otherwise False.
    """
    seen = 0
    for _ in re.finditer(r'\b[a-zA-Z]+\b', text):
        seen += 1
        if seen > 1:
            return True
    return False

In [3]:
directory_path = '/content/R-code-data/R_code_files/OSF_R_files'  # Replace with the path to your R code directory

# Get a list of all files in the directory
file_list = os.listdir(directory_path)

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; inserting into the DataFrame one row at a time re-allocates it on every
# insert and dominates the run time for large corpora.
records = []

# Iterate over the files and read them
for file_name in tqdm(file_list):
    if not file_name.endswith(('.R', '.r')):
        continue

    file_path = os.path.join(directory_path, file_name)
    try:
        remove_empty_lines(file_path)
    except (OSError, UnicodeError) as err:
        # Best effort: the file is still parsed as-is, but the failure is
        # reported instead of being silently discarded.
        tqdm.write("Could not strip empty lines from {}: {}".format(file_path, err))

    pairs = create_comment_code_pairs(file_path)
    # One row per comment; idx is "<file name> <position within the file>".
    for j, (comment, code_lines) in enumerate(pairs.items()):
        records.append((file_name + ' ' + str(j), comment, code_lines))

df = pd.DataFrame(records, columns=['idx', 'comment', 'code'])

100%|██████████| 9844/9844 [19:29<00:00,  8.42it/s]


In [4]:
# Number of rows in df (one per extracted comment/code pair).
df.shape[0]

122198

In [6]:
# Preview the first ten rows of df.
df.iloc[:10]

Unnamed: 0,idx,comment,code
0,7953 model_selection.R 0,Model Selection EXPERIMENT ONE,"rm(list=ls()) your_directory_here <- ""/home/lu..."
1,7953 model_selection.R 1,Model Selection Note the manuscript reports W...,"load_model (""LBA"", ""lbaN_B_old.R"") load(""sampl..."
2,7953 model_selection.R 2,Summed DIC [1] 107341.3,"rm(OnlyCapacity_Ex1) load(""samples/OnlyProacti..."
3,7953 model_selection.R 3,Summed DIC [1] 109934.2,"rm(OnlyProactive_Ex1) load(""samples/PMDCex.RDa..."
4,7953 model_selection.R 4,Summed DIC [1] 110007.1,"rm(PMDCex_Ex1) load(""samples/PMDCinh.RData"") h..."
5,7953 model_selection.R 5,Summed DIC [1] 110246.3,"rm(PMDCinh_Ex1) load(""samples/PMDC.RData"") h.I..."
6,7953 model_selection.R 6,Summed DIC [1] 110334.6,"rm(PMDC_Ex1) load(""samples/top_samples_Ex1.RDa..."
7,7953 model_selection.R 7,Summed DIC [1] 110528.4,rm(top_samples_Ex1)
8,7953 model_selection.R 8,EXPERIMENT TWO,"rm(list=ls()) your_directory_here <- ""/home/lu..."
9,7953 model_selection.R 9,Summed DIC [1] 43336.96,"rm(OnlyCapacity_Ex2) load(""samples/OnlyProacti..."


In [7]:
# Export the first 10,000 rows as a sample file, then the complete table.
# Both files are written without the index column and with ';' as the CSV
# escape character.
exports = (
    ('/content/drive/MyDrive/CodeInspector/Data/Comment-code pairs/code-comment-pairs-m-words-10K.csv', df.iloc[:10000]),
    ('/content/drive/MyDrive/CodeInspector/Data/Comment-code pairs/code-comment-pairs-m-words-All.csv', df),
)
for csv_path, frame in exports:
    frame.to_csv(csv_path, index=False, escapechar=';')