# 1. Data Import & Review

## Overview
This notebook loads and explores the **COGS, PropBank, and Universal Dependencies datasets**.

## Checkpoints
- 1A: Dataset format validation
- 1B: Exploratory statistics & visualizations


In [1]:
# Install required libraries (if not installed)
!pip install datasets pandas matplotlib seaborn

# Import standard libraries
import pandas as pd
import os

# Define path to COGS dataset
cogs_path = "content/meta-semantic-research/data/COGS"

# Load dataset splits
train_path = os.path.join(cogs_path, "train.tsv")
dev_path = os.path.join(cogs_path, "dev.tsv")
test_path = os.path.join(cogs_path, "test.tsv")
gen_path = os.path.join(cogs_path, "gen.tsv")

# Read files
train_df = pd.read_csv(train_path, sep="\t", names=["Sentence", "LogicalForm"])
dev_df = pd.read_csv(dev_path, sep="\t", names=["Sentence", "LogicalForm"])
test_df = pd.read_csv(test_path, sep="\t", names=["Sentence", "LogicalForm"])
gen_df = pd.read_csv(gen_path, sep="\t", names=["Sentence", "LogicalForm"])

# Display sample rows
train_df.head()

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

FileNotFoundError: [Errno 2] No such file or directory: 'content/meta-semantic-research/data/COGS/train.tsv'

In [None]:
# Row count of every loaded split, keyed by split name.
# The bare expression on the last line renders the mapping as the cell output.
dataset_sizes = dict(
    zip(
        ("Train", "Dev", "Test", "Generalization"),
        map(len, (train_df, dev_df, test_df, gen_df)),
    )
)
dataset_sizes


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def _sentence_token_count(text):
    """Return the number of whitespace-separated tokens in ``str(text)``."""
    return len(str(text).split())


# Store the per-example sentence length on both frames being compared
train_df["SentenceLength"] = train_df["Sentence"].map(_sentence_token_count)
gen_df["SentenceLength"] = gen_df["Sentence"].map(_sentence_token_count)

# Overlay the two length histograms (with KDE curves) on a single axes
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(train_df["SentenceLength"], label="Train", kde=True, bins=30, ax=ax)
sns.histplot(gen_df["SentenceLength"], label="Generalization", kde=True, bins=30, ax=ax)
ax.set_xlabel("Sentence Length")
ax.set_ylabel("Frequency")
ax.legend()
ax.set_title("Sentence Length Distribution: Train vs Generalization")
plt.show()


In [None]:
def _logical_form_token_count(logical_form):
    """Return the number of whitespace-separated tokens in ``str(logical_form)``."""
    return len(str(logical_form).split())


# Store the per-example logical form length on both frames being compared
train_df["LogicalFormLength"] = train_df["LogicalForm"].map(_logical_form_token_count)
gen_df["LogicalFormLength"] = gen_df["LogicalForm"].map(_logical_form_token_count)

# Overlay the train and generalization histograms (with KDE curves) on one axes
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(train_df["LogicalFormLength"], label="Train", kde=True, bins=30, ax=ax)
sns.histplot(gen_df["LogicalFormLength"], label="Generalization", kde=True, bins=30, ax=ax)
ax.set_xlabel("Logical Form Length")
ax.set_ylabel("Frequency")
ax.legend()
ax.set_title("Logical Form Length Distribution")
plt.show()
