**Author:** Lisa Wallner  
**Description:** This file contains the code to clean the generated scores of the README files. The output of each LLM has to be treated differently because it does not necessarily follow the required format.  
There is a dedicated cleaning file for each LLM.  

*Hint: If lines were created with the support of a Large Language Model or the code was taken from another source, you will find the following hint at the end of the line: (generated with Microsoft Copilot) or (source: link_to_source)*

In [None]:
import json # package to work with .json
import pandas as pd # package for data manipulation
import re # package for regex pattern
from pathlib import Path # package to work with paths

In [None]:
# NOTE: the input directory below is set manually
path_range = Path('../data/output_evaluation_data_jamba_mod/model1') # directory that is scanned for evaluation files
all_files_range = [entry.name for entry in filter(Path.is_file, path_range.iterdir())] # names of all regular files directly inside path_range (sub-directories are skipped)

In [None]:
# column layout of the dataframe: two repo identifiers, then for the generated ('g') and the
# original ('o') readme the raw score text followed by a score / text pair for each of the five questions
cols = ['repo_owner', 'repo_name'] + [
    column
    for readme_kind in ('g', 'o')
    for column in (
        f'readme_{readme_kind}_score',
        *(
            f'readme_{readme_kind}_score_q{question}{suffix}'
            for question in range(1, 6)
            for suffix in ('', '_txt')
        ),
    )
]
df_clean_scores = pd.DataFrame(columns=cols) # empty dataframe that already carries the final column order

In [None]:
# Build one flat record per evaluation file and create the dataframe once at the end.
# Collecting plain dictionaries first avoids one pd.concat call per file, and re-running
# this cell rebuilds the dataframe instead of appending the same rows a second time.
records = [] # one dictionary per processed JSON file
question_numbers = range(1, 6) # questions q1 - q5, matching the q1..q5 columns of the dataframe

for i in all_files_range: # iterate over all file names in all_files_range
    # path_range is reused here so the manually set directory only has to be changed in one place
    with open(path_range / i, 'r', encoding='utf-8') as f: # open each file and parse its JSON content
        loaded_data = json.load(f)

    generated = loaded_data['readme_genereated'] # (sic) this is the key spelling the data is read with
    original = loaded_data['readme_original']

    new_entry = {
        'repo_owner': loaded_data['repo_owner'], # specify repo_owner
        'repo_name': loaded_data['repo_name'], # specify repo_name
        'readme_g_score': generated['evaluation'], # raw evaluation text for the generated readme
    }

    # generated readme
    g_scores = generated['score']
    if g_scores == [] or g_scores == {}: # value of key 'score' is an empty list or dictionary
        # nothing structured available -> store the placeholder '0' for every question
        for q in question_numbers:
            new_entry[f'readme_g_score_q{q}'] = '0'
            new_entry[f'readme_g_score_q{q}_txt'] = '0'
    else: # value of key 'score' is not empty -> take score and explanation per question
        for q in question_numbers:
            new_entry[f'readme_g_score_q{q}'] = g_scores[q - 1]['score']
            new_entry[f'readme_g_score_q{q}_txt'] = g_scores[q - 1]['explanation']

    # original readme
    new_entry['readme_o_score'] = original['evaluation'] # raw evaluation text for the original readme
    # The per-question values of the original readme are always stored as the placeholder '0' here:
    # reading them from original['score'][n]['score'] / ['explanation'] is only required for the
    # results of the first evaluation of the original readme files; that data is already processed.
    for q in question_numbers:
        new_entry[f'readme_o_score_q{q}'] = '0'
        new_entry[f'readme_o_score_q{q}_txt'] = '0'

    records.append(new_entry)

# dtype=object because the score columns may hold integers as well as the '0' placeholder string;
# the existing column order of df_clean_scores is kept
df_clean_scores = pd.DataFrame(records, columns=df_clean_scores.columns, dtype=object)

In [None]:
len(df_clean_scores) # check length of dataframe

52

In [6]:
df_clean_scores.head(5)

Unnamed: 0,repo_owner,repo_name,readme_g_score,readme_g_score_q1,readme_g_score_q1_txt,readme_g_score_q2,readme_g_score_q2_txt,readme_g_score_q3,readme_g_score_q3_txt,readme_g_score_q4,...,readme_o_score_q1,readme_o_score_q1_txt,readme_o_score_q2,readme_o_score_q2_txt,readme_o_score_q3,readme_o_score_q3_txt,readme_o_score_q4,readme_o_score_q4_txt,readme_o_score_q5,readme_o_score_q5_txt
0,Z4nzu,hackingtool,"### ""q1"": [\n ##""score"": 4##,\n ##""expla...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Bitwise-01,Instagram-,"### ""q1"": [ \n ##""score"": 5##,\n ##""expl...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,slavfox,Cozette,"### ""q1"": [\n ##""score"": 5##,\n ##""expla...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,huggingface,alignment-handbook,"### ""q1"": [\n ##""score"": 4##,\n ##""expla...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,grapeot,devin.cursorrules,"### ""q1"": [\n ##""score"": 5##,\n ##""expla...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df_model1 = df_clean_scores.copy() # work on a copy so that df_clean_scores keeps the values as loaded, before any cleaning

In [None]:
def search_score_num(txt): # function to find score value in given text
    """Return the single digit found between the words 'score' and 'explanation' in txt.

    The digit is returned as an int. If txt does not contain the pattern,
    the placeholder string '0' is returned instead.
    """
    # pattern to identify the score value # (generated with Microsoft Copilot)
    found = re.search(r'[sS]core\b\W*(\d)\W*\b[eE]xplanation\b', txt)

    if found is None: # pattern not present -> placeholder string
        return '0'

    return int(found.group(1)) # first capture group holds the digit


def search_explanation(txt): # function to find score text value in given text
    """Return everything that follows the word 'explanation' on the same line of txt.

    Non-word characters directly after 'explanation' are skipped. If txt does
    not contain the word, the placeholder string '0' is returned.
    """
    # pattern to identify the score text # (generated with Microsoft Copilot)
    found = re.search(r'\b[eE]xplanation\W*(.*)', txt)

    # first capture group holds the explanation text; '0' marks "nothing found"
    return '0' if found is None else found.group(1)


def clean_score(df, types=('g',)): # function to clean scores in provided dataframe
    """Parse the raw LLM answers in df and store score and explanation per question.

    For every row and every readme type t in types, the text in column
    'readme_{t}_score' is stripped of newlines and double spaces and split at
    '###'. Each non-empty part is passed to search_score_num() and
    search_explanation(); the results are written to the columns
    'readme_{t}_score_q{n}' and 'readme_{t}_score_q{n}_txt' for n = 1..5.
    Parts after the fifth non-empty one are ignored. Rows whose raw text is not
    a string or contains no '###' are left unchanged.

    Args:
        df: dataframe holding the 'readme_{t}_score*' columns; it is modified in place.
        types: readme types to process. The default ('g',) handles only the
            generated readmes; pass ('g', 'o') to process the original readmes as well.

    Returns:
        None.
    """
    max_questions = 5 # there are no more scores than 5

    for idx, row in df.iterrows(): # iterate through all index and rows from dataframe
        for t in types: # iterate through the requested readme types
            score = row[f'readme_{t}_score'] # extract raw answer of the llm
            if not isinstance(score, str): # e.g. missing value -> nothing to parse
                continue

            score = score.replace('\n', '').replace('  ', '') # remove newlines and double spaces
            score_list = score.split('###') # split answer into one part per question
            if len(score_list) == 1: # no '###' in the string -> split did not work, skip
                continue

            # keep only parts with content and at most max_questions of them # (generated with Microsoft Copilot)
            parts = [part for part in score_list if part.strip() != ''][:max_questions]

            for num, part in enumerate(parts, start=1): # num is the question number
                df.at[idx, f'readme_{t}_score_q{num}'] = search_score_num(txt=part) # save score digit
                df.at[idx, f'readme_{t}_score_q{num}_txt'] = search_explanation(txt=part) # save explanation text

In [None]:
clean_score(df=df_model1) # fills the q1-q5 score / text columns of df_model1 in place; clean_score() has no return value

In [None]:
len(df_model1) # check length of df_model1 after cleaning of scores

52

In [None]:
# NOTE: the output path is hard-coded in this call
df_model1.to_json('../data/df_score_jamba_mod/df_score_model1.json', orient='records') # save df_model1 in JSON file, one record per dataframe row

In [None]:
# select the rows in which the score of question 4 is still the placeholder string '0',
# i.e. no digit was extracted for it; an empty result means the cleaning was successful
df_model1[df_model1['readme_g_score_q4'] == '0'] # check if cleaning was successful

Unnamed: 0,repo_owner,repo_name,readme_g_score,readme_g_score_q1,readme_g_score_q1_txt,readme_g_score_q2,readme_g_score_q2_txt,readme_g_score_q3,readme_g_score_q3_txt,readme_g_score_q4,...,readme_o_score_q1,readme_o_score_q1_txt,readme_o_score_q2,readme_o_score_q2_txt,readme_o_score_q3,readme_o_score_q3_txt,readme_o_score_q4,readme_o_score_q4_txt,readme_o_score_q5,readme_o_score_q5_txt


In [None]:
df_model1[df_model1['repo_name'] == 'Tensorflow-Cookbook'] # check values for specific GitHub repository

Unnamed: 0,repo_owner,repo_name,readme_g_score,readme_g_score_q1,readme_g_score_q1_txt,readme_g_score_q2,readme_g_score_q2_txt,readme_g_score_q3,readme_g_score_q3_txt,readme_g_score_q4,...,readme_o_score_q1,readme_o_score_q1_txt,readme_o_score_q2,readme_o_score_q2_txt,readme_o_score_q3,readme_o_score_q3_txt,readme_o_score_q4,readme_o_score_q4_txt,readme_o_score_q5,readme_o_score_q5_txt
13,taki0112,Tensorflow-Cookbook,"### ""q1"": [\n ##""score"": 4##,\n ##""expla...",4,The README clearly states the purpose of the p...,5,The README clearly explains why the project is...,4,The README provides a clear step-by-step guide...,2,...,0,0,0,0,0,0,0,0,0,0


In [None]:
print(df_model1.loc[13, 'readme_g_score']) # print scores for specific index 

### "q1": [
    ##"score": 4##,
    ##"explanation": The README clearly states the purpose of the project, which is a TensorFlow repository for deep learning model, specifically designed for image processing and generation tasks. However, it would be even better if the goal of the project was more specific, e.g., "This project aims to provide a comprehensive collection of neural network architectures and functions for image processing and generation tasks, with a focus on convolutional neural networks (CNNs) and generative adversarial networks (GANs)".##
]

### "q2": [
    ##"score": 5##,
    ##"explanation": The README clearly explains why the project is useful, which is that it provides a collection of various neural network architectures and functions for image processing and generation tasks. This is a very good answer, as it highlights the benefits of the project and its potential applications.##
]

### "q3": [
    ##"score": 4##,
    ##"explanation": The README provides a clear s