In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
def read_and_combine_data(data_pattern, data_type):
    """
    Reads every matching '.txt' file of one dataset and stacks them into a single
    DataFrame, tagging the rows of each file with its own class label.

    Files are looked up in ``data/<data_pattern>/``. A file is used when its name
    contains ``data_type`` and ends with '.txt'. The selected files are processed
    in sorted (lexicographic) file-name order and labelled 'Class 1', 'Class 2', ...
    in that order, so the label assigned to a file is reproducible across runs and
    across the train/val/test splits (provided the splits use parallel file names).

    Args:
        data_pattern (str): Name of the sub-directory under 'data'
            (e.g. 'linearlySeparable', 'nonLinearlySeparable', or 'overlapping').
        data_type (str): Substring that selects the split by file name
            (e.g. 'train', 'test', or 'val').

    Returns:
        pd.DataFrame: Rows of all selected files with columns 'X', 'Y' and 'Label',
        re-indexed from 0.

    Raises:
        ValueError: If no '.txt' file containing ``data_type`` exists in the directory.
        FileNotFoundError: If ``data/<data_pattern>`` does not exist (from os.listdir).
    """
    data_dir = os.path.join('data', data_pattern)

    # os.listdir returns entries in arbitrary order, so sort to keep the
    # file -> 'Class N' mapping deterministic. Filtering on the extension *before*
    # numbering ensures a stray non-.txt file cannot consume a class number.
    # NOTE: the sort is lexicographic ('class10' sorts before 'class2').
    matching_files = sorted(
        file_name
        for file_name in os.listdir(data_dir)
        if data_type in file_name and file_name.endswith('.txt')
    )

    if not matching_files:
        # pd.concat([]) would raise ValueError anyway; raise the same exception type
        # with a message that says what was actually being looked for.
        raise ValueError(
            f"No '.txt' files containing '{data_type}' found in '{data_dir}'"
        )

    dfs = []
    for class_index, file_name in enumerate(matching_files, start=1):
        # sep=r'\s+' is the documented replacement for the deprecated
        # delim_whitespace=True (any run of whitespace separates columns).
        df = pd.read_csv(
            os.path.join(data_dir, file_name),
            sep=r'\s+',
            names=['X', 'Y'],
        )
        df['Label'] = f'Class {class_index}'
        dfs.append(df)

    return pd.concat(dfs, ignore_index=True)

In [3]:
def save_combined_data_to_csv(data_pattern, data_type, output_filename):
    """
    Reads and combines data, then saves the combined DataFrame to a CSV file
    under the 'combined_data' directory.

    The output directory is created if it does not exist yet, so the function
    also works on a fresh checkout. An existing file with the same name is
    overwritten. The path of the written file is printed.

    Args:
        data_pattern (str): Type of data ('linearlySeparable', 'nonLinearlySeparable', or 'overlapping').
        data_type (str): Type of data ('train', 'test', or 'val').
        output_filename (str): Name of the output CSV file (relative to 'combined_data').
    """
    combined_df = read_and_combine_data(data_pattern, data_type)
    output_filepath = os.path.join('combined_data', output_filename)

    # DataFrame.to_csv does not create missing parent directories; without this
    # the first run fails with an OSError unless 'combined_data' was made by hand.
    os.makedirs(os.path.dirname(output_filepath), exist_ok=True)

    combined_df.to_csv(output_filepath, index=False)
    print(f'Data saved to {output_filepath}')

In [4]:
# One entry per CSV to produce: (dataset directory, split, output file name).
# Keeping the three values of a job side by side avoids having to line up
# positions across separate parallel lists.
conversion_jobs = [
    ('linearlySeparable', 'train', 'linear_train_data.csv'),
    ('linearlySeparable', 'val', 'linear_val_data.csv'),
    ('linearlySeparable', 'test', 'linear_test_data.csv'),
    ('nonLinearlySeparable', 'train', 'non_linear_train_data.csv'),
    ('nonLinearlySeparable', 'val', 'non_linear_val_data.csv'),
    ('nonLinearlySeparable', 'test', 'non_linear_test_data.csv'),
    ('overlapping', 'train', 'overlapping_train_data.csv'),
    ('overlapping', 'val', 'overlapping_val_data.csv'),
    ('overlapping', 'test', 'overlapping_test_data.csv'),
]

# Column-wise views of the jobs, kept under their original names (as lists).
data_patterns, data_types, output_filenames = (
    list(column) for column in zip(*conversion_jobs)
)

# Combine and export every dataset/split pair, in the order listed above.
for pattern, split, csv_name in conversion_jobs:
    save_combined_data_to_csv(pattern, split, csv_name)

Data saved to combined_data\linear_train_data.csv
Data saved to combined_data\linear_val_data.csv
Data saved to combined_data\linear_test_data.csv
Data saved to combined_data\non_linear_train_data.csv
Data saved to combined_data\non_linear_val_data.csv
Data saved to combined_data\non_linear_test_data.csv
Data saved to combined_data\overlapping_train_data.csv
Data saved to combined_data\overlapping_val_data.csv
Data saved to combined_data\overlapping_test_data.csv
