In [3]:
import json
from sklearn.model_selection import train_test_split, StratifiedKFold

# Load the datasets
with open('./train/labels.json', 'r') as file:
    labels_data = json.load(file)
with open('./train/logs.json', 'r') as file:
    logs_data = json.load(file)
with open('./train/logs_aspect.json', 'r') as file:
    logs_aspect_data = json.load(file)
with open('./train/triple.json', 'r') as file:
    triple_data = json.load(file)

# Ensure the datasets have the same length
assert len(labels_data) == len(logs_data), "The datasets do not match in length."

# Helper function to categorize the length of the knowledge field
def knowledge_length_category(entry):
    if 'knowledge' not in entry or not entry['knowledge']:
        return 'none'
    length = len(entry['knowledge'])
    if length <= 3:
        return 'between_1_and_3'
    elif 4 <= length <= 6:
        return 'between_4_and_6'
    else:
        return 'more_than_6'

# Prepare the stratification variable based on labels_data
knowledge_categories = [knowledge_length_category(entry) for entry in labels_data]
targets = [entry['target'] for entry in labels_data]

# Generate indices for the split using stratification based on the knowledge category and the target label
train_indices, test_indices = next(StratifiedKFold(n_splits=4, shuffle=True, random_state=42).split(labels_data, knowledge_categories))

# Split both datasets using the generated indices
train_labels = [labels_data[i] for i in train_indices]
test_labels = [labels_data[i] for i in test_indices]
train_logs = [logs_data[i] for i in train_indices]
test_logs = [logs_data[i] for i in test_indices]
train_logs_aspect = [logs_aspect_data[i] for i in train_indices]
test_logs_aspect = [logs_aspect_data[i] for i in test_indices]
train_triple = [triple_data[i] for i in train_indices]
test_triple = [triple_data[i] for i in test_indices]
# Save the split datasets
with open('./Dynamic_InstructKS_Dataset_triple_based/train/labels.json', 'w') as file:
    json.dump(train_labels, file, indent=4)
with open('./Dynamic_InstructKS_Dataset_triple_based/train/logs.json', 'w') as file:
    json.dump(train_logs, file, indent=4)
with open('./Dynamic_InstructKS_Dataset_triple_based/train/logs_aspect.json', 'w') as file:
    json.dump(train_logs_aspect, file, indent=4)
with open('./Dynamic_InstructKS_Dataset_triple_based/train/triple.json', 'w') as file:
    json.dump(train_triple, file, indent=4)

with open('./Dynamic_InstructKS_Dataset_triple_based/resource/labels.json', 'w') as file:
    json.dump(test_labels, file, indent=4)
with open('./Dynamic_InstructKS_Dataset_triple_based/resource/logs.json', 'w') as file:
    json.dump(test_logs, file, indent=4)
with open('./Dynamic_InstructKS_Dataset_triple_based/resource/logs_aspect.json', 'w') as file:
    json.dump(test_logs_aspect, file, indent=4)
with open('./Dynamic_InstructKS_Dataset_triple_based/resource/triple.json', 'w') as file:
    json.dump(test_triple, file, indent=4)




In [4]:
import json

# Make sure to define the knowledge_length_category function as provided above.

def calculate_statistics(dataset):
    stats = {
        'total_entries': len(dataset),
        'knowledge_length_counts': {
            'none': 0,
            'between_1_and_3': 0,
            'between_4_and_6': 0,
            'more_than_6': 0,
        }
    }
    
    for entry in dataset:
        category = knowledge_length_category(entry)
        stats['knowledge_length_counts'][category] += 1
    
    stats['knowledge_length_proportions'] = {
        k: v / stats['total_entries'] for k, v in stats['knowledge_length_counts'].items()
    }
    
    return stats

# Load the datasets
with open('./ynamic_InstructKS_Dataset_triple_based/train/train_labels.json', 'r') as f:
    train_data = json.load(f)
with open('./ynamic_InstructKS_Dataset_triple_based/resource/labels.json', 'r') as f:
    test_data = json.load(f)

# Calculate and print the statistics
train_stats = calculate_statistics(train_data)
test_stats = calculate_statistics(test_data)

print("Training Data Statistics:", train_stats)
print("Testing Data Statistics:", test_stats)


Training Data Statistics: {'total_entries': 21323, 'knowledge_length_counts': {'none': 10247, 'between_1_and_3': 5909, 'between_4_and_6': 3911, 'more_than_6': 1256}, 'knowledge_length_proportions': {'none': 0.4805608966843315, 'between_1_and_3': 0.2771186043239694, 'between_4_and_6': 0.18341696759367818, 'more_than_6': 0.05890353139802092}}
Testing Data Statistics: {'total_entries': 7108, 'knowledge_length_counts': {'none': 3416, 'between_1_and_3': 1970, 'between_4_and_6': 1304, 'more_than_6': 418}, 'knowledge_length_proportions': {'none': 0.4805852560495217, 'between_1_and_3': 0.2771525042205965, 'between_4_and_6': 0.18345526167698367, 'more_than_6': 0.05880697805289814}}


In [5]:
with open('./train/labels.json', 'r') as file:
    labels_data = json.load(file)
print(len(labels_data))

28431
