# NLP Assignment 2 

In [17]:
import pandas as pd
import json
import os
import csv

## Loading the dataset

### ATIS

In [13]:

# CSV file name for each ATIS split, keyed by split name.
splits = dict(train="atis_train.csv", test="atis_test.csv")

In [14]:
# Read both ATIS splits as CSV from the "hf://datasets/tuetschek/atis/"
# location, using the file names stored in `splits`.
atis_train = pd.read_csv("hf://datasets/tuetschek/atis/" + splits["train"])
atis_test = pd.read_csv("hf://datasets/tuetschek/atis/" + splits["test"])

# Sanity check: shape of each split and the first rows of the training frame.
print("ATIS Train shape:", atis_train.shape)
print("ATIS Test shape:", atis_test.shape)
print(atis_train.head())

  from .autonotebook import tqdm as notebook_tqdm


ATIS Train shape: (4978, 4)
ATIS Test shape: (893, 4)
   id       intent                                               text  \
0   0       flight  i want to fly from boston at 838 am and arrive...   
1   1       flight  what flights are available from pittsburgh to ...   
2   2  flight_time  what is the arrival time in san francisco for ...   
3   3      airfare            cheapest airfare from tacoma to orlando   
4   4      airfare  round trip fares from pittsburgh to philadelph...   

                                               slots  
0  O O O O O B-fromloc.city_name O B-depart_time....  
1  O O O O O B-fromloc.city_name O B-toloc.city_n...  
2  O O O B-flight_time I-flight_time O B-fromloc....  
3  B-cost_relative O O B-fromloc.city_name O B-to...  
4  B-round_trip I-round_trip O O B-fromloc.city_n...  


### Saving the data locally

In [16]:

# Create the output folder; exist_ok=True makes a re-run of this cell harmless.
os.makedirs("Data/ATIS", exist_ok=True)

# Write both ATIS splits to CSV, leaving out the DataFrame index column.
atis_train.to_csv("Data/ATIS/atis_train.csv", index=False)
atis_test.to_csv("Data/ATIS/atis_test.csv", index=False)

# Single print call; sep="\n" puts each message on its own line.
print(
    "ATIS datasets saved successfully!",
    "- Train dataset saved to: Data/ATIS/atis_train.csv",
    "- Test dataset saved to: Data/ATIS/atis_test.csv",
    sep="\n",
)

ATIS datasets saved successfully!
- Train dataset saved to: Data/ATIS/atis_train.csv
- Test dataset saved to: Data/ATIS/atis_test.csv


### SLURP

In [None]:

def load_jsonl(path):
    """
    Read a JSON Lines file into a list of parsed records.

    path: location of a UTF-8 encoded file with one JSON document per line
    Returns: list with one parsed object per non-blank line, in file order

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped rather than handed to json.loads, which
    raises json.JSONDecodeError on them. Malformed non-blank lines still raise.
    """
    data = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # An empty string is not valid JSON, so ignore separator lines.
            if not line.strip():
                continue
            data.append(json.loads(line))
    return data


In [5]:

# Parse each SLURP split file into a list of records via load_jsonl.
slurp_train = load_jsonl("slurp_1/train.jsonl")
slurp_dev = load_jsonl("slurp_1/devel.jsonl")
slurp_test = load_jsonl("slurp_1/test.jsonl")
slurp_train_synth = load_jsonl("slurp_1/train_synthetic.jsonl")

# Report the number of training records and print the first one in full
# to show which fields a raw record carries.
print("SLURP Train size:", len(slurp_train))
print("Example SLURP entry:", slurp_train[0])

SLURP Train size: 11514
Example SLURP entry: {'slurp_id': 9024, 'sentence': 'event', 'sentence_annotation': 'event', 'intent': 'calendar_set', 'action': 'set', 'tokens': [{'surface': 'event', 'id': 0, 'lemma': 'event', 'pos': 'NN'}], 'scenario': 'calendar', 'recordings': [{'file': 'audio-1501754435.flac', 'wer': 0.0, 'ent_wer': None, 'status': 'correct'}, {'file': 'audio-1501407267-headset.flac', 'wer': 0.0, 'ent_wer': None, 'status': 'correct'}, {'file': 'audio-1501407267.flac', 'wer': 0.0, 'ent_wer': None, 'status': 'correct'}, {'file': 'audio-1501771798-headset.flac', 'wer': 0.0, 'ent_wer': None, 'status': 'correct'}, {'file': 'audio-1501771798.flac', 'wer': 0.0, 'ent_wer': None, 'status': 'correct'}, {'file': 'audio-1490705711-headset.flac', 'wer': 0.0, 'ent_wer': None, 'status': 'correct'}, {'file': 'audio-1490705711.flac', 'wer': 0.0, 'ent_wer': None, 'status': 'correct'}, {'file': 'audio-1494416970-headset.flac', 'wer': 0.0, 'ent_wer': None, 'status': 'correct'}, {'file': 'audio

### Removing the excess fields from the data and converting entities to BIO strings for slot prediction

### Converting the entities field to a BIO string

In [18]:
def entities_to_bio(tokens, entities):
    """
    Turn entity annotations into one BIO slot label per token.

    tokens: list of token dicts; only the number of tokens is used here
    entities: list of dicts, each with "span" (list of token indices) and "type"
    Returns: list of label strings with the same length as tokens, where the
        first index of a span is "B-<type>", the remaining indices of that
        span are "I-<type>", and every untouched position is "O"
    """
    labels = ["O" for _ in range(len(tokens))]
    for entity in entities:
        positions, slot_type = entity["span"], entity["type"]
        # Entities with an empty span label nothing.
        if not positions:
            continue
        for offset, token_idx in enumerate(positions):
            # Only the opening token of the span carries the B- prefix.
            prefix = "B" if offset == 0 else "I"
            labels[token_idx] = f"{prefix}-{slot_type}"
    return labels

### Preprocess raw SLURP JSONL files to CSV with BIO strings for slot prediction

In [19]:
def process_slurp_jsonl_to_csv(input_jsonl_paths, output_csv_path):
    """
    Flatten SLURP JSONL records into one CSV with a BIO slot string per row.

    input_jsonl_paths: list of paths (train, dev, test, etc.); their records
        are written in the order the paths are given
    output_csv_path: path to write the combined CSV; an existing file is
        overwritten and a missing parent directory is created
    Returns: None (prints the output path when finished)

    Each row keeps slurp_id, sentence and intent, stores the entities list as
    a JSON string, and adds slot_string: the space-joined BIO labels returned
    by entities_to_bio for that record.
    """
    # os.path.dirname() returns "" for a bare filename such as "out.csv", and
    # os.makedirs("") raises FileNotFoundError, so only create a directory
    # when the path actually names one.
    output_dir = os.path.dirname(output_csv_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    csv_columns = ["slurp_id", "sentence", "intent", "entities", "slot_string"]

    # newline='' lets the csv module control line endings itself.
    with open(output_csv_path, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writeheader()

        for jp in input_jsonl_paths:
            data = load_jsonl(jp)
            for rec in data:
                slurp_id = rec.get("slurp_id")
                sentence = rec.get("sentence")
                intent = rec.get("intent")
                entities = rec.get("entities", [])
                tokens = rec.get("tokens", [])

                # Build BIO slot string
                bio_labels = entities_to_bio(tokens, entities)
                # Join them with space (or some delimiter)
                slot_string = " ".join(bio_labels)

                # Prepare minimal record
                out = {
                    "slurp_id": slurp_id,
                    "sentence": sentence,
                    "intent": intent,
                    # We convert entities list to JSON string so we can store in CSV
                    "entities": json.dumps(entities, ensure_ascii=False),
                    "slot_string": slot_string
                }
                writer.writerow(out)

    print("Saved to CSV:", output_csv_path)

In [20]:
# (source JSONL path, destination CSV path) pairs, one per SLURP split.
mapping = [
    ("slurp_1/train.jsonl", "Data/SLURP/slurp_train.csv"),
    ("slurp_1/devel.jsonl", "Data/SLURP/slurp_dev.csv"),
    ("slurp_1/test.jsonl", "Data/SLURP/slurp_test.csv"),
    ("slurp_1/train_synthetic.jsonl", "Data/SLURP/slurp_train_synth.csv"),
]


In [21]:
# Convert each split on its own: every (input, output) pair in `mapping`
# is passed as a one-element input list, so each split gets its own CSV.
for in_path, out_csv in mapping:
    process_slurp_jsonl_to_csv([in_path], out_csv)

Saved to CSV: Data/SLURP/slurp_train.csv
Saved to CSV: Data/SLURP/slurp_dev.csv
Saved to CSV: Data/SLURP/slurp_test.csv
Saved to CSV: Data/SLURP/slurp_train_synth.csv
