In [1]:
from collections import Counter

from datasets import load_dataset

In [2]:
# Load the `tuetschek/atis` dataset; the object displayed below is a DatasetDict
# with a `train` and a `test` split, each exposing id / intent / text / slots.
dataset = load_dataset(path="tuetschek/atis")
# Bare trailing expression so the notebook renders the split summary.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'intent', 'text', 'slots'],
        num_rows: 4978
    })
    test: Dataset({
        features: ['id', 'intent', 'text', 'slots'],
        num_rows: 893
    })
})

In [5]:
# Every distinct raw intent string found in any split, in alphabetical order.
# Compound intents such as "flight+airfare" are kept as single labels here.
labels = sorted({row["intent"] for split in dataset.values() for row in split})
labels

['abbreviation',
 'aircraft',
 'aircraft+flight+flight_no',
 'airfare',
 'airfare+flight',
 'airfare+flight_time',
 'airline',
 'airline+flight_no',
 'airport',
 'capacity',
 'cheapest',
 'city',
 'day_name',
 'distance',
 'flight',
 'flight+airfare',
 'flight+airline',
 'flight_no',
 'flight_no+airline',
 'flight_time',
 'ground_fare',
 'ground_service',
 'ground_service+ground_fare',
 'meal',
 'quantity',
 'restriction']

In [6]:
# Frequency of each raw intent string over all splits; a compound label such as
# "flight+airfare" is tallied as its own class at this stage.
labels_count = Counter(
    row["intent"]
    for split in dataset.values()
    for row in split
)
labels_count

Counter({'flight': 4298,
         'airfare': 471,
         'ground_service': 291,
         'airline': 195,
         'abbreviation': 180,
         'aircraft': 90,
         'flight_time': 55,
         'quantity': 54,
         'airport': 38,
         'capacity': 37,
         'flight+airfare': 33,
         'distance': 30,
         'ground_fare': 25,
         'city': 25,
         'flight_no': 20,
         'meal': 12,
         'restriction': 6,
         'airline+flight_no': 2,
         'day_name': 2,
         'ground_service+ground_fare': 1,
         'airfare+flight_time': 1,
         'cheapest': 1,
         'aircraft+flight+flight_no': 1,
         'airfare+flight': 1,
         'flight+airline': 1,
         'flight_no+airline': 1})

In [8]:
# Re-count intents after breaking compound labels apart on "+", so that
# "flight+airfare" adds one to "flight" and one to "airfare".
# NOTE: this rebinds `labels_count`, replacing the raw-label tally computed earlier.
labels_count = Counter(
    part
    for split in dataset.values()
    for row in split
    for part in row["intent"].split("+")
)
labels_count

Counter({'flight': 4334,
         'airfare': 506,
         'ground_service': 292,
         'airline': 199,
         'abbreviation': 180,
         'aircraft': 91,
         'flight_time': 56,
         'quantity': 54,
         'airport': 38,
         'capacity': 37,
         'distance': 30,
         'ground_fare': 26,
         'city': 25,
         'flight_no': 24,
         'meal': 12,
         'restriction': 6,
         'day_name': 2,
         'cheapest': 1})

In [10]:
# Intent names only, in the order the Counter first encountered them (not sorted).
list(labels_count)

['flight',
 'flight_time',
 'airfare',
 'aircraft',
 'ground_service',
 'airport',
 'airline',
 'distance',
 'abbreviation',
 'ground_fare',
 'quantity',
 'city',
 'flight_no',
 'capacity',
 'meal',
 'restriction',
 'cheapest',
 'day_name']

In [11]:
# Training examples whose intent is compound, i.e. contains the "+" separator.
[
    row
    for row in dataset["train"]
    if "+" in row["intent"]
]

[{'id': 570,
  'intent': 'flight+airfare',
  'text': 'give me the flights and fares on december twenty seventh from indianapolis to orlando',
  'slots': 'O O O O O O O B-depart_date.month_name B-depart_date.day_number I-depart_date.day_number O B-fromloc.city_name O B-toloc.city_name'},
 {'id': 602,
  'intent': 'flight+airfare',
  'text': 'all flights and fares from atlanta to dallas round trip after 12 pm less than 1100 dollars',
  'slots': 'O O O O O B-fromloc.city_name O B-toloc.city_name B-round_trip I-round_trip B-depart_time.time_relative B-depart_time.time I-depart_time.time B-cost_relative O B-fare_amount I-fare_amount'},
 {'id': 719,
  'intent': 'flight+airfare',
  'text': 'first flights and fares from pittsburgh to atlanta on a thursday',
  'slots': 'B-flight_mod O O O O B-fromloc.city_name O B-toloc.city_name O O B-depart_date.day_name'},
 {'id': 859,
  'intent': 'flight+airfare',
  'text': 'all flights and fares from atlanta to dallas round trip after 12 pm less than 1100 d

In [12]:
# Training examples that carry the `flight_no` intent, either alone or as one
# component of a compound label such as "airline+flight_no".
# The intent is split on "+" and compared component-wise rather than with a bare
# substring test: a substring check is fragile (the same pattern applied to
# "flight" would also match "flight_no" and "flight_time").
[
    row
    for row in dataset["train"]
    if "flight_no" in row["intent"].split("+")
]

[{'id': 330,
  'intent': 'flight_no',
  'text': 'flight numbers from columbus to minneapolis tomorrow',
  'slots': 'O O O B-fromloc.city_name O B-toloc.city_name B-depart_date.today_relative'},
 {'id': 879,
  'intent': 'flight_no',
  'text': "i 'm trying to find the flight number from a flight from orlando to cleveland on us air and it arrives around 10 pm",
  'slots': 'O O O O O O O O O O O O B-fromloc.city_name O B-toloc.city_name O B-airline_name I-airline_name O O O B-arrive_time.time_relative B-arrive_time.time I-arrive_time.time'},
 {'id': 961,
  'intent': 'flight_no',
  'text': 'flight numbers from minneapolis to long beach on june twenty six',
  'slots': 'O O O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name O B-depart_date.month_name B-depart_date.day_number I-depart_date.day_number'},
 {'id': 1255,
  'intent': 'flight_no',
  'text': 'please show me the return flight number from toronto to st. petersburg',
  'slots': 'O O O O O O O O B-fromloc.city_name O B-toloc.cit

## Slots

In [17]:
# Distinct slot names over all splits, alphabetically sorted.
# Each example's "slots" field is a whitespace-separated tag sequence; "O" tags are
# skipped and the first two characters (the "B-" / "I-" prefix) are dropped.
entities = sorted(
    {
        tag[2:]
        for split in dataset.values()
        for row in split
        for tag in row["slots"].split()
        if tag != "O"
    }
)
entities

['aircraft_code',
 'airline_code',
 'airline_name',
 'airport_code',
 'airport_name',
 'arrive_date.date_relative',
 'arrive_date.day_name',
 'arrive_date.day_number',
 'arrive_date.month_name',
 'arrive_date.today_relative',
 'arrive_time.end_time',
 'arrive_time.period_mod',
 'arrive_time.period_of_day',
 'arrive_time.start_time',
 'arrive_time.time',
 'arrive_time.time_relative',
 'booking_class',
 'city_name',
 'class_type',
 'compartment',
 'connect',
 'cost_relative',
 'day_name',
 'day_number',
 'days_code',
 'depart_date.date_relative',
 'depart_date.day_name',
 'depart_date.day_number',
 'depart_date.month_name',
 'depart_date.today_relative',
 'depart_date.year',
 'depart_time.end_time',
 'depart_time.period_mod',
 'depart_time.period_of_day',
 'depart_time.start_time',
 'depart_time.time',
 'depart_time.time_relative',
 'economy',
 'fare_amount',
 'fare_basis_code',
 'flight',
 'flight_days',
 'flight_mod',
 'flight_number',
 'flight_stop',
 'flight_time',
 'fromloc.airport_

In [18]:
# Distinct slot names reduced to their leading component: after removing the
# two-character "B-" / "I-" prefix, only the text before the first "." is kept,
# so "fromloc.city_name" collapses to "fromloc". "O" tags are ignored.
# NOTE: this rebinds `entities`, replacing the full-name list built earlier.
entities = sorted(
    {
        tag[2:].partition(".")[0]
        for split in dataset.values()
        for row in split
        for tag in row["slots"].split()
        if tag != "O"
    }
)
entities

['aircraft_code',
 'airline_code',
 'airline_name',
 'airport_code',
 'airport_name',
 'arrive_date',
 'arrive_time',
 'booking_class',
 'city_name',
 'class_type',
 'compartment',
 'connect',
 'cost_relative',
 'day_name',
 'day_number',
 'days_code',
 'depart_date',
 'depart_time',
 'economy',
 'fare_amount',
 'fare_basis_code',
 'flight',
 'flight_days',
 'flight_mod',
 'flight_number',
 'flight_stop',
 'flight_time',
 'fromloc',
 'meal',
 'meal_code',
 'meal_description',
 'mod',
 'month_name',
 'or',
 'period_of_day',
 'restriction_code',
 'return_date',
 'return_time',
 'round_trip',
 'state_code',
 'state_name',
 'stoploc',
 'time',
 'time_relative',
 'today_relative',
 'toloc',
 'transport_type']

In [21]:
# Tag-level frequency of each leading slot component over all splits.
# Every non-"O" tag counts once, so a multi-token span (one "B-" tag followed by
# "I-" tags) contributes one count per token.
# NOTE: `entities` becomes a Counter here (it was a sorted list in earlier cells).
entities = Counter(
    tag[2:].partition(".")[0]
    for split in dataset.values()
    for row in split
    for tag in row["slots"].split()
    if tag != "O"
)
entities

Counter({'toloc': 6771,
         'fromloc': 6290,
         'depart_date': 2357,
         'depart_time': 2038,
         'airline_name': 1293,
         'arrive_time': 832,
         'round_trip': 831,
         'class_type': 441,
         'cost_relative': 439,
         'flight_mod': 370,
         'city_name': 364,
         'stoploc': 323,
         'arrive_date': 226,
         'flight_stop': 206,
         'airline_code': 170,
         'airport_name': 142,
         'fare_amount': 108,
         'flight_time': 105,
         'flight_number': 96,
         'fare_basis_code': 95,
         'transport_type': 82,
         'or': 69,
         'meal_description': 68,
         'aircraft_code': 64,
         'meal': 62,
         'economy': 52,
         'flight_days': 49,
         'connect': 46,
         'restriction_code': 44,
         'airport_code': 38,
         'mod': 33,
         'return_date': 33,
         'state_name': 12,
         'meal_code': 11,
         'state_code': 9,
         'period_of_day': 

In [22]:
# Slot component names only, in the order the Counter first encountered them.
list(entities)

['fromloc',
 'depart_time',
 'toloc',
 'arrive_time',
 'depart_date',
 'flight_time',
 'cost_relative',
 'round_trip',
 'fare_amount',
 'city_name',
 'stoploc',
 'class_type',
 'airline_name',
 'mod',
 'fare_basis_code',
 'transport_type',
 'flight_mod',
 'arrive_date',
 'meal',
 'meal_description',
 'return_date',
 'airline_code',
 'flight_stop',
 'time',
 'or',
 'economy',
 'flight_number',
 'flight_days',
 'state_code',
 'airport_code',
 'aircraft_code',
 'connect',
 'restriction_code',
 'airport_name',
 'days_code',
 'day_name',
 'period_of_day',
 'today_relative',
 'meal_code',
 'state_name',
 'time_relative',
 'return_time',
 'month_name',
 'day_number',
 'compartment',
 'booking_class',
 'flight']