# Summary of the Dataset #

In [1]:
import pandas as pd
import os

def dataset_file_path(dataset_dir: str, dataset: str) -> str:
    """Return the path of the subtask-A TSV file for one dataset split.

    The file is expected at ``<dataset_dir>/<dataset>/subtask_a_<dataset>.tsv``.
    ``os.path.join`` is used instead of string concatenation so the result is
    correct whether or not ``dataset_dir`` ends with a path separator.
    """
    return os.path.join(dataset_dir, dataset, f"subtask_a_{dataset}.tsv")


def summarize_dataset(df: pd.DataFrame) -> dict:
    """Return the row count and ``sentence_type`` value counts of ``df``.

    Raises:
        KeyError: if ``df`` has no ``sentence_type`` column.
    """
    return {
        "dataset_length": len(df),
        "sentence_type_distribution": df["sentence_type"].value_counts().to_dict(),
    }


# Define dataset paths
dataset_dir = "../dataset/"  # Replace with the actual dataset directory
datasets = ["train", "test", "dev"]

# Dictionary to store dataset summaries, keyed by split name
summary = {}

# Only touch the filesystem when executed as a script / notebook cell
# (where __name__ is "__main__"), not when the helpers are imported.
if __name__ == "__main__":
    # Process each dataset
    for dataset in datasets:
        file_path = dataset_file_path(dataset_dir, dataset)  # Adjust filename if needed

        # Read the TSV file
        df = pd.read_csv(file_path, sep="\t")

        # Compute and store the required information
        summary[dataset] = summarize_dataset(df)

    # Print summary
    for dataset, stats in summary.items():
        print(f"Dataset: {dataset}")
        print(f"  Total samples: {stats['dataset_length']}")
        print(f"  Sentence type distribution: {stats['sentence_type_distribution']}")
        print("-" * 60)


Dataset: train
  Total samples: 70
  Sentence type distribution: {'idiomatic': 39, 'literal': 31}
------------------------------------------------------------
Dataset: test
  Total samples: 15
  Sentence type distribution: {'idiomatic': 8, 'literal': 7}
------------------------------------------------------------
Dataset: dev
  Total samples: 15
  Sentence type distribution: {'literal': 8, 'idiomatic': 7}
------------------------------------------------------------
