In [1]:
import ast
import os
import random
import pandas as pd

from datasets import Dataset, load_dataset

In [2]:
def read_files_to_dict(directory):
    """Load every CSV found under ``directory`` into a dict of DataFrames.

    The directory tree is walked recursively. A file is treated as a CSV when
    its name ends in ``.csv`` (case-insensitive). Each file is parsed as
    comma-separated text with the single quote as the quote character.

    Args:
        directory: Root folder to search.

    Returns:
        dict mapping each CSV's path relative to ``directory`` to the
        DataFrame read from it. Files that fail to load are reported with
        ``print`` and left out of the result.
    """
    frames_by_path = {}

    for current_dir, _subdirs, filenames in os.walk(directory):
        csv_names = (name for name in filenames if name.lower().endswith('.csv'))
        for name in csv_names:
            file_path = os.path.join(current_dir, name)
            try:
                frame = pd.read_csv(file_path, sep=',', quotechar="'", low_memory=False)
                # Key by the path relative to the search root.
                frames_by_path[os.path.relpath(file_path, directory)] = frame
            except Exception as e:
                print(f"Error reading {file_path}: {e}")

    return frames_by_path

# Directory containing the CSV files.
# The location can be overridden with the TESTMAP_OUTPUT_DIR environment
# variable so the notebook is not tied to one machine's absolute path; the
# original path is kept as the default.
directory_path = os.environ.get("TESTMAP_OUTPUT_DIR", "D:/Projects/TestMap/TestMap/Output/")

# Read all files and store in dictionary (keys are paths relative to directory_path)
data_dict = read_files_to_dict(directory_path)

In [3]:
def combine_csvs(data_dict, keyword):
    """Stack every DataFrame whose key contains ``keyword`` into one frame.

    Args:
        data_dict: Mapping of key (string) -> DataFrame.
        keyword: Substring that a key must contain for its frame to be used.

    Returns:
        The row-wise concatenation of the matching frames with a fresh
        0..n-1 index, or ``None`` (after printing a notice) when no key
        contains ``keyword``.
    """
    matching = []
    for key, df in data_dict.items():
        if keyword in key:
            matching.append(df)

    # Nothing matched: report it and hand back None.
    if not matching:
        print(f"No files found with keyword '{keyword}'")
        return None

    return pd.concat(matching, ignore_index=True)
        
# Combine all CSVs whose relative path contains 'test_methods' into one
# DataFrame. Nothing is written to disk here. combine_csvs only prints a
# notice and returns no DataFrame when no key matches.
test_methods_df = combine_csvs(data_dict, 'test_methods')

# Combine all CSVs whose relative path contains 'test_classes' into one
# DataFrame (same caveats as above).
test_classes_df = combine_csvs(data_dict, 'test_classes')

## Initial Formatting

### Test Method Formatting

The CSV format from CSharp needs to be reformatted for Python.

Test Method records are different than the Test Class records.

They need to be formatted differently.

#### Converting Fields to List of Strings

Our list of fields from the testing class is separated with `<<SEP>>`.

In [4]:
def convert_class_fields_to_list(s):
    """Split a ``<<SEP>>``-delimited class-field string into a list of strings.

    ``<<NEWLINE>>`` and ``<<SINGLE-QUOTE>>`` placeholders are replaced with a
    real newline and a real single quote before splitting.

    Args:
        s: Raw cell value from the ``ClassFields`` column.

    Returns:
        list[str]: One entry per ``<<SEP>>``-separated field. An empty list
        is returned when ``s`` is not a string (such as a missing value).
    """
    if not isinstance(s, str):
        return []

    # str.replace / str.split cannot raise here, so no try/except is needed,
    # and split already returns a new list.
    decoded = s.replace("<<NEWLINE>>", "\n").replace("<<SINGLE-QUOTE>>", "'")
    return decoded.split("<<SEP>>")

# Apply the function to the column.
# This overwrites ClassFields in place. The converter returns [] for any value
# that is not a str, so running this cell a second time (when the column
# already holds lists) would replace every value with an empty list.
test_methods_df['ClassFields'] = test_methods_df['ClassFields'].apply(convert_class_fields_to_list)

#### Converting Using Statements to List of Strings

In [5]:
def convert_usings_to_list(s):
    """Split a ``<<SEP>>``-delimited using-statement string into a list.

    ``<<NEWLINE>>`` and ``<<SINGLE-QUOTE>>`` placeholders are replaced with a
    real newline and a real single quote before splitting.

    Args:
        s: Raw cell value from the ``UsingStatements`` column.

    Returns:
        list[str]: One entry per ``<<SEP>>``-separated statement, or an empty
        list when ``s`` is not a string (such as a missing value).
    """
    # Plain string operations cannot raise ValueError/SyntaxError, so the
    # former try/except was dead code; split() already yields a fresh list.
    return (
        s.replace("<<NEWLINE>>", "\n").replace("<<SINGLE-QUOTE>>", "'").split("<<SEP>>")
        if isinstance(s, str)
        else []
    )

# Apply the function to the column.
# Use the using-statement converter defined in this cell rather than the
# class-field converter, so the column is tied to the function written for it.
test_methods_df['UsingStatements'] = test_methods_df['UsingStatements'].apply(convert_usings_to_list)

#### Converting Method Invocations to a List of Tuples

When creating the CSV from the original program, we had a list of tuples in CSharp.

CSharp doesn't print lists to strings like Python would. 

So we had to add keywords and special formatting so we
could convert to a list that Python would understand.

In [6]:
# Parse the <<SEP>> / <<TUPLE>> delimited invocation string into a list of tuples
def convert_method_invocations(s):
    """Convert a serialized list of invocation pairs into a list of 2-tuples.

    The input is a sequence of entries separated by ``<<SEP>>``. Within an
    entry the two members are separated by ``<<TUPLE>>``. ``<<NEWLINE>>`` and
    ``<<SINGLE-QUOTE>>`` placeholders are decoded first.

    Args:
        s: Raw cell value from the ``MethodInvocations`` column.

    Returns:
        list[tuple[str, str]]: ``(first, last)`` for every entry whose last
        member is not a single space. The first member has trailing commas
        and spaces removed. An empty list is returned when ``s`` is not a
        string (such as a missing value), instead of raising AttributeError.
    """
    if not isinstance(s, str):
        return []

    s = s.replace("<<NEWLINE>>", "\n").replace("<<SINGLE-QUOTE>>", "'")

    invocations = []
    for entry in s.split("<<SEP>>"):
        # NOTE(review): lstrip/rstrip remove *every* leading "(" and trailing
        # ")", not just the wrapping pair -- confirm members never end in ")".
        parts = entry.lstrip("(").rstrip(")").split('<<TUPLE>>')
        # An entry whose last member is a lone space is treated as empty.
        if parts[-1] != ' ':
            invocations.append((parts[0].rstrip(', '), parts[-1]))
    return invocations

# Apply the function to the column.
# The raw delimited strings in MethodInvocations are overwritten in place
# with the parsed lists of tuples.
test_methods_df['MethodInvocations'] = test_methods_df['MethodInvocations'].apply(convert_method_invocations)

#### Formatting Test Method

In [7]:
# Decode the placeholder tokens in a test method body
def format_test_method(s):
    """Replace the CSV placeholder tokens in a test-method body.

    ``<<NEWLINE>>`` becomes a real newline and ``<<SINGLE-QUOTE>>`` becomes a
    real single quote.

    Args:
        s: Raw cell value from the ``MethodBody`` column.

    Returns:
        str: The decoded method body, or an empty string when ``s`` is not a
        string (such as a missing value).
    """
    if not isinstance(s, str):
        return ""

    # Local is named `text` so the builtin `str` is not shadowed.
    text = s.replace("<<NEWLINE>>", "\n")
    return text.replace("<<SINGLE-QUOTE>>", "'")

# Apply the function to the column.
# MethodBody holds source text, so it is decoded with format_test_method
# (placeholder replacement only) rather than the invocation parser, which
# would turn each body into a list of tuples.
test_methods_df['MethodBody'] = test_methods_df['MethodBody'].apply(format_test_method)

### Test Class Formatting

#### Converting Fields to List of Strings

Our list of fields from the testing class is separated with `<<SEP>>`.

In [8]:
def convert_class_fields_to_list(s):
    """Split a ``<<SEP>>``-delimited class-field string into a list of strings.

    Note: this cell re-defines a function of the same name that already
    exists in the Test Method Formatting section; the behaviour is the same.

    ``<<NEWLINE>>`` and ``<<SINGLE-QUOTE>>`` placeholders are replaced with a
    real newline and a real single quote before splitting.

    Args:
        s: Raw cell value from the ``ClassFields`` column.

    Returns:
        list[str]: One entry per ``<<SEP>>``-separated field, or an empty
        list when ``s`` is not a string (such as a missing value).
    """
    if isinstance(s, str):
        # The string methods used here cannot raise, so no try/except is
        # needed; split() already returns a new list.
        text = s.replace("<<NEWLINE>>", "\n")
        text = text.replace("<<SINGLE-QUOTE>>", "'")
        return text.split("<<SEP>>")
    return []

# Apply the function to the column.
# This overwrites ClassFields in place. The converter returns [] for any value
# that is not a str, so running this cell a second time (when the column
# already holds lists) would replace every value with an empty list.
test_classes_df['ClassFields'] = test_classes_df['ClassFields'].apply(convert_class_fields_to_list)

#### Converting Using Statements to List of Strings

In [9]:
def convert_usings_to_list(s):
    """Split a ``<<SEP>>``-delimited using-statement string into a list.

    Note: this cell re-defines a function of the same name that already
    exists in the Test Method Formatting section; the behaviour is the same.

    ``<<NEWLINE>>`` and ``<<SINGLE-QUOTE>>`` placeholders are replaced with a
    real newline and a real single quote before splitting.

    Args:
        s: Raw cell value from the ``UsingStatements`` column.

    Returns:
        list[str]: One entry per ``<<SEP>>``-separated statement, or an empty
        list when ``s`` is not a string (such as a missing value).
    """
    if not isinstance(s, str):
        return []

    # No try/except: nothing below can raise ValueError or SyntaxError.
    for token, replacement in (("<<NEWLINE>>", "\n"), ("<<SINGLE-QUOTE>>", "'")):
        s = s.replace(token, replacement)
    return s.split("<<SEP>>")

# Apply the function to the column.
# Use the using-statement converter defined in this cell rather than the
# class-field converter, so the column is tied to the function written for it.
test_classes_df['UsingStatements'] = test_classes_df['UsingStatements'].apply(convert_usings_to_list)

#### Format Code

In [10]:
# Decode the placeholder tokens in a block of source code
def format_code(s):
    """Replace the CSV placeholder tokens in a block of code.

    ``<<NEWLINE>>`` becomes a real newline and ``<<SINGLE-QUOTE>>`` becomes a
    real single quote. Non-string values are converted with ``str`` first.

    Args:
        s: Raw cell value (normally a string).

    Returns:
        str: The decoded text. Missing values (``None`` or a float NaN) yield
        an empty string rather than the literal text "None"/"nan", so that a
        later length-based empty filter can recognise them.
    """
    # NaN is the only float that is not equal to itself.
    if s is None or (isinstance(s, float) and s != s):
        return ""

    text = str(s)
    return text.replace("<<NEWLINE>>", "\n").replace("<<SINGLE-QUOTE>>", "'")

# Apply the function to both code columns.
# format_code converts its input to text itself, so no separate astype(str)
# is needed; both columns are handled the same way and the function sees the
# raw cell values.
test_classes_df['ClassBody'] = test_classes_df['ClassBody'].apply(format_code)
test_classes_df['SourceBody'] = test_classes_df['SourceBody'].apply(format_code)

### Remove Any Empties

In [11]:
# Keep only test methods with at least one parsed invocation, and only test
# classes whose SourceBody is not zero-length.
# NOTE(review): the SourceBody check only drops zero-length strings; if
# missing values were stringified upstream (e.g. to 'nan') they pass this
# check -- confirm.
test_methods_df_filtered = test_methods_df[test_methods_df['MethodInvocations'].apply(lambda x: len(x) > 0)]
test_classes_df_filtered = test_classes_df[test_classes_df['SourceBody'].apply(lambda x: len(x) > 0)]

In [12]:
# drop duplicates
df_methods_dropped = test_methods_df_filtered.drop_duplicates(subset=['MethodBody'])
df_classes_dropped = test_classes_df_filtered.drop_duplicates(subset=['ClassBody'])

In [13]:
# Find duplicates based on specific columns
test_methods_dup = df_methods_dropped.duplicated(subset=['MethodBody'])
test_classes_dup = df_classes_dropped.duplicated(subset=['ClassBody'])

# Count the number of duplicate rows based on specified columns
num_method_duplicates_based_on_columns = test_methods_dup.sum()
num_class_duplicates_based_on_columns = test_classes_dup.sum()


print(f"Number of duplicate rows based on specific columns: {num_method_duplicates_based_on_columns}")
print(f"Number of duplicate rows based on specific columns: {num_class_duplicates_based_on_columns}")

Number of duplicate rows based on specific columns: 0
Number of duplicate rows based on specific columns: 0


In [14]:

# Persist the de-duplicated frames and load them back from disk.
# Create the output folder first so to_csv does not fail when "data" is
# missing (e.g. on a fresh checkout).
os.makedirs("data", exist_ok=True)

df_methods_dropped.to_csv(os.path.join("data", "test_methods_full.csv"), index=False)
df_classes_dropped.to_csv(os.path.join("data", "test_classes_full.csv"), index=False)

# NOTE(review): columns that hold Python lists are presumably read back as
# plain strings after this CSV round trip -- confirm that is intended.
df_test_method_full = pd.read_csv(os.path.join("data", "test_methods_full.csv"))
df_test_classes_full = pd.read_csv(os.path.join("data", "test_classes_full.csv"))

In [15]:
# Wrap the test-method frame (as re-read from data/test_methods_full.csv) in a
# Hugging Face `datasets.Dataset`.
original_method_dataset = Dataset.from_pandas(df_test_method_full)

In [16]:
# Upload to huggingface
# upload originals
test_method_original_name = "Distaste1194/CSharpTestMethods"
original_method_dataset.push_to_hub(test_method_original_name, private=True)

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-6747653d-10373feb1ee5b0a76741f33e;f85c0f65-79bb-49ce-b2e4-417b7b7f1057)

Invalid username or password.

### Class Level

In [92]:

# Wrap the test-class frame (as re-read from data/test_classes_full.csv) in a
# Hugging Face `datasets.Dataset`.
original_class_dataset = Dataset.from_pandas(df_test_classes_full)

In [93]:
# Upload to huggingface
# upload originals
test_class_original_name = "Distaste1194/CSharpTestClasses"
original_class_dataset.push_to_hub(test_class_original_name, private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Distaste1194/csharp_test_classes_formatted_validation/commit/618acbda044b6bf1619d210a739774417f8ad707', commit_message='Upload dataset', commit_description='', oid='618acbda044b6bf1619d210a739774417f8ad707', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Distaste1194/csharp_test_classes_formatted_validation', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Distaste1194/csharp_test_classes_formatted_validation'), pr_revision=None, pr_num=None)