<a href="https://colab.research.google.com/github/elephanti/NLPProject2024/blob/main/Dataset_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load Datasets

In [10]:
import pickle
import os
from sklearn.model_selection import train_test_split
import pandas as pd
from openai import AzureOpenAI
import random
import json
from dotenv import load_dotenv

load_dotenv()

True

## Utils

In [3]:
def sample_subset(df, n):
    """
    Given a dataframe with label, text and description columns,
    samples up to n samples for each label to a subset of the dataset.
    """
    return df.groupby('label').apply(lambda x: x.sample(n=min(n, len(x)))).reset_index(drop=True)

def create_json_from_df(df, n):
    json_list = []
    
    grouped = df.groupby('label')
    
    for label, group in grouped:
        texts = group['text'].tolist()
        
        selected_texts = random.sample(texts, min(n, len(texts)))
        
        json_dict = {'intent': label}
        
        for i, text in enumerate(selected_texts, 1):
            key = f'example_{i}'
            json_dict[key] = text
        
        json_list.append(json_dict)
    
    json_output = json.dumps(json_list, indent=4)
    
    return json_output


## ATIS

Pickle files can be downloaded from https://www.kaggle.com/code/siddhadev/atis-dataset-from-ms-cntk

In [4]:
!wget https://github.com/elephanti/NLPProject2024/raw/main/atis.test.pkl -P inputs/ATIS
!wget https://github.com/elephanti/NLPProject2024/raw/main/atis.train.pkl -P inputs/ATIS

zsh:1: command not found: wget
zsh:1: command not found: wget


### Load dataset

In [5]:
def load_atis(fname, exclude_joint_categories=False):
    with open(fname, 'rb') as stream:
        ds,dicts = pickle.load(stream)
    print('Done  loading: ', fname)
    print('      samples: {:4d}'.format(len(ds['query'])))
    print('   vocab_size: {:4d}'.format(len(dicts['token_ids'])))
    print('   slot count: {:4d}'.format(len(dicts['slot_ids'])))
    print(' intent count: {:4d}'.format(len(dicts['intent_ids'])))

    t2i, s2i, in2i = map(dicts.get, ['token_ids', 'slot_ids','intent_ids'])
    i2t, i2s, i2in = map(lambda d: {d[k]:k for k in d.keys()}, [t2i,s2i,in2i])
    query, slots, intent =  map(ds.get, ['query', 'slot_labels', 'intent_labels'])

    samples = []

    for i in range(len(query)):
        label = i2in[intent[i][0]]

        if exclude_joint_categories and "+" in label:
            continue
        text = list(map(i2t.get, query[i]))

        # Filter BOS and EOS
        if text[0] == 'BOS' and text[-1] == 'EOS':
            text = text[1:-1]

        samples.append({
            "label": label,
            "text": ' '.join(text),
            "description": None
        })

    # save samples csv
    df = pd.DataFrame(samples)
    return df

In [11]:
def description_generator(data, n_data_examples=1):
  OPENAI_ENDPOINT = os.environ.get("OPENAI_ENDPOINT")
  OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
  OPENAI_DEPLOYMENT = os.environ.get("OPENAI_DEPLOYMENT")

  client = AzureOpenAI(
    azure_endpoint=OPENAI_ENDPOINT,
    api_key=OPENAI_API_KEY,
    api_version="2024-02-15-preview"
  )

  data_json = create_json_from_df(data, n_data_examples)

  system_message = f"""
    You need to create an intent descriptions for each intent with provided sentence examples from the 'Data' below.
    Use your knowled the intent examples to create the description.

    Data: {data_json}.

    The output needs to be just a json where the key is the intent and the value is the description.

    Please provide only the json output without any comments.
    """

  response = client.chat.completions.create(
      model=OPENAI_DEPLOYMENT,
      temperature=0,
      seed=42,
      max_tokens=4095,
      messages=[
          {"role": "system", "content": system_message}
      ]
  )

  descriptions = response.choices[0].message.content
  descriptions = descriptions.replace('```', '')
  descriptions = descriptions.replace('json', '')

  return json.loads(descriptions)

In [12]:
train_df = load_atis(os.path.join('inputs/ATIS','atis.train.pkl'), True)
test_df = load_atis(os.path.join('inputs/ATIS','atis.test.pkl'), True)
atis_df = pd.concat([train_df, test_df])

os.makedirs('datasets/ATIS', exist_ok=True)
atis_df.to_csv('datasets/ATIS/atis.full.csv', index=False)

# add descriptions
descriptions = description_generator(atis_df, n_data_examples=5)
atis_df['description'] = atis_df['label'].map(descriptions)

# split to train 80%, validation 10% test 10%
atis_train_df, atis_test_df = train_test_split(atis_df, test_size=0.2)
atis_valid_df, atis_test_df = train_test_split(atis_test_df, test_size=0.5)

atis_train_df.to_csv('datasets/ATIS/atis.train.csv', index=False)
atis_valid_df.to_csv('datasets/ATIS/atis.valid.csv', index=False)
atis_test_df.to_csv('datasets/ATIS/atis.test.csv', index=False)

Done  loading:  inputs/ATIS/atis.train.pkl
      samples: 4978
   vocab_size:  943
   slot count:  129
 intent count:   26
Done  loading:  inputs/ATIS/atis.test.pkl
      samples:  893
   vocab_size:  943
   slot count:  129
 intent count:   26


In [13]:
atis_df.head()

Unnamed: 0,label,text,description
0,flight,i want to fly from boston at 838 am and arrive...,"Seeking information on specific flights, inclu..."
1,flight,what flights are available from pittsburgh to ...,"Seeking information on specific flights, inclu..."
2,flight_time,what is the arrival time in san francisco for ...,Requesting the schedule or times of flights on...
3,airfare,cheapest airfare from tacoma to orlando,"Seeking information on the cost of flights, in..."
4,airfare,round trip fares from pittsburgh to philadelph...,"Seeking information on the cost of flights, in..."


In [14]:
atis_df['label'].unique()

array(['flight', 'flight_time', 'airfare', 'aircraft', 'ground_service',
       'airport', 'airline', 'distance', 'abbreviation', 'ground_fare',
       'quantity', 'city', 'flight_no', 'capacity', 'meal', 'restriction',
       'cheapest', 'day_name'], dtype=object)

In [15]:
# Print for each intent how many samples there are
atis_df.groupby('label').count()

Unnamed: 0_level_0,text,description
label,Unnamed: 1_level_1,Unnamed: 2_level_1
abbreviation,180,180
aircraft,90,90
airfare,471,471
airline,195,195
airport,38,38
capacity,37,37
cheapest,1,1
city,25,25
day_name,2,2
distance,30,30


### Sample subsets



In [16]:
for ver in range(1, 6):
  os.makedirs(f'datasets/ATIS/sampled_subsets/ver{ver}', exist_ok=True)

  atis_5_subset = sample_subset(atis_train_df, 5)
  atis_5_subset.to_csv(f'datasets/ATIS/sampled_subsets/ver{ver}/atis_5_subset_ver.csv', index=False)
  atis_10_subset = sample_subset(atis_train_df, 10)
  atis_10_subset.to_csv(f'datasets/ATIS/sampled_subsets/ver{ver}/atis_10_subset.csv', index=False)
  atis_20_subset = sample_subset(atis_train_df, 20)
  atis_20_subset.to_csv(f'datasets/ATIS/sampled_subsets/ver{ver}/atis_20_subset.csv', index=False)
  atis_50_subset = sample_subset(atis_train_df, 50)
  atis_50_subset.to_csv(f'datasets/ATIS/sampled_subsets/ver{ver}/atis_50_subset.csv', index=False)
  atis_100_subset = sample_subset(atis_train_df, 100)
  atis_100_subset.to_csv(f'datasets/ATIS/sampled_subsets/ver{ver}/atis_100_subset.csv', index=False)

  return df.groupby('label').apply(lambda x: x.sample(n=min(n, len(x)))).reset_index(drop=True)
  return df.groupby('label').apply(lambda x: x.sample(n=min(n, len(x)))).reset_index(drop=True)
  return df.groupby('label').apply(lambda x: x.sample(n=min(n, len(x)))).reset_index(drop=True)
  return df.groupby('label').apply(lambda x: x.sample(n=min(n, len(x)))).reset_index(drop=True)
  return df.groupby('label').apply(lambda x: x.sample(n=min(n, len(x)))).reset_index(drop=True)
  return df.groupby('label').apply(lambda x: x.sample(n=min(n, len(x)))).reset_index(drop=True)
  return df.groupby('label').apply(lambda x: x.sample(n=min(n, len(x)))).reset_index(drop=True)
  return df.groupby('label').apply(lambda x: x.sample(n=min(n, len(x)))).reset_index(drop=True)
  return df.groupby('label').apply(lambda x: x.sample(n=min(n, len(x)))).reset_index(drop=True)
  return df.groupby('label').apply(lambda x: x.sample(n=min(n, len(x)))).reset_index(drop=True)
  return df.groupby('label').apply(lambd

## TREC

### Load dataset

In [17]:
!wget https://cogcomp.seas.upenn.edu/Data/QA/QC/train_1000.label -P inputs/TREC
!wget https://cogcomp.seas.upenn.edu/Data/QA/QC/train_2000.label -P inputs/TREC
!wget https://cogcomp.seas.upenn.edu/Data/QA/QC/train_3000.label -P inputs/TREC
!wget https://cogcomp.seas.upenn.edu/Data/QA/QC/train_4000.label -P inputs/TREC
!wget https://cogcomp.seas.upenn.edu/Data/QA/QC/train_5500.label -P inputs/TREC
!wget https://cogcomp.seas.upenn.edu/Data/QA/QC/TREC_10.label -P inputs/TREC

zsh:1: command not found: wget
zsh:1: command not found: wget
zsh:1: command not found: wget
zsh:1: command not found: wget
zsh:1: command not found: wget
zsh:1: command not found: wget


In [18]:
# Process into a df
def load_trec(filename):
    samples = []
    with open(filename, 'r', encoding='windows-1252') as f:
        for line in f:
            label, text = line.split(' ', 1)
            samples.append({
                "label": label,
                "text": text.strip(),
                "description": None
            })
    return pd.DataFrame(samples)

train_1000_df = load_trec('inputs/TREC/train_1000.label')
train_2000_df = load_trec('inputs/TREC/train_2000.label')
train_3000_df = load_trec('inputs/TREC/train_3000.label')
train_4000_df = load_trec('inputs/TREC/train_4000.label')
train_5500_df = load_trec('inputs/TREC/train_5500.label')
test_df = load_trec('inputs/TREC/TREC_10.label')

trec_df = pd.concat([train_1000_df, train_2000_df, train_3000_df, train_4000_df, train_5500_df, test_df])

descriptions = description_generator(trec_df, n_data_examples=5)
trec_df['description'] = trec_df['label'].map(descriptions)

os.makedirs('datasets/TREC', exist_ok=True)
trec_df.to_csv('datasets/TREC/trec.full.csv', index=False)

# split to train 80%, validation 10% test 10%
trec_train_df, trec_test_df = train_test_split(trec_df, test_size=0.2)
trec_valid_df, trec_test_df = train_test_split(trec_test_df, test_size=0.5)

trec_train_df.to_csv('datasets/TREC/trec.train.csv', index=False)
trec_valid_df.to_csv('datasets/TREC/trec.valid.csv', index=False)
trec_test_df.to_csv('datasets/TREC/trec.test.csv', index=False)

trec_df.head()


Unnamed: 0,label,text,description
0,DESC:manner,How did serfdom develop in and then leave Russ...,Questions about the process or method by which...
1,ENTY:cremat,What films featured the character Popeye Doyle ?,Inquiries about creative works such as TV show...
2,DESC:manner,How can I find a list of celebrities ' real na...,Questions about the process or method by which...
3,ENTY:animal,What fowl grabs the spotlight after the Chines...,"Questions related to animals, including their ..."
4,ABBR:exp,What is the full form of .com ?,Inquiries about the meaning or full form of an...


In [19]:
trec_df['label'].unique()

array(['DESC:manner', 'ENTY:cremat', 'ENTY:animal', 'ABBR:exp', 'HUM:ind',
       'HUM:gr', 'HUM:title', 'DESC:def', 'NUM:date', 'DESC:reason',
       'ENTY:event', 'LOC:state', 'DESC:desc', 'NUM:count', 'ENTY:other',
       'ENTY:letter', 'LOC:other', 'ENTY:religion', 'ENTY:food',
       'LOC:country', 'ENTY:color', 'ENTY:termeq', 'LOC:city',
       'ENTY:body', 'ENTY:dismed', 'LOC:mount', 'NUM:money',
       'ENTY:product', 'NUM:period', 'ENTY:substance', 'ENTY:sport',
       'ENTY:plant', 'ENTY:techmeth', 'NUM:volsize', 'HUM:desc',
       'ENTY:instru', 'ABBR:abb', 'NUM:other', 'NUM:speed', 'ENTY:word',
       'ENTY:lang', 'NUM:perc', 'NUM:code', 'NUM:dist', 'NUM:temp',
       'ENTY:symbol', 'NUM:ord', 'ENTY:veh', 'NUM:weight',
       'ENTY:currency'], dtype=object)

In [20]:
trec_df.groupby('label').count()

Unnamed: 0_level_0,text,description
label,Unnamed: 1_level_1,Unnamed: 2_level_1
ABBR:abb,47,47
ABBR:exp,203,203
DESC:def,1344,1344
DESC:desc,781,781
DESC:manner,768,768
DESC:reason,549,549
ENTY:animal,381,381
ENTY:body,56,56
ENTY:color,129,129
ENTY:cremat,595,595


### Sample subsets

In [21]:
for ver in range(1, 6):
  os.makedirs(f'datasets/TREC/sampled_subsets/ver{ver}', exist_ok=True)

  trec_5_subset = sample_subset(trec_train_df, 5)
  trec_5_subset.to_csv(f'datasets/TREC/sampled_subsets/ver{ver}/trec_5_subset.csv', index=False)
  trec_10_subset = sample_subset(trec_train_df, 10)
  trec_10_subset.to_csv(f'datasets/TREC/sampled_subsets/ver{ver}/trec_10_subset.csv', index=False)
  trec_20_subset = sample_subset(trec_train_df, 20)
  trec_20_subset.to_csv(f'datasets/TREC/sampled_subsets/ver{ver}/trec_20_subset.csv', index=False)
  trec_50_subset = sample_subset(trec_train_df, 50)
  trec_50_subset.to_csv(f'datasets/TREC/sampled_subsets/ver{ver}/trec_50_subset.csv', index=False)
  trec_100_subset = sample_subset(trec_train_df, 100)
  trec_100_subset.to_csv(f'datasets/TREC/sampled_subsets/ver{ver}/trec_100_subset.csv', index=False)

  return df.groupby('label').apply(lambda x: x.sample(n=min(n, len(x)))).reset_index(drop=True)
  return df.groupby('label').apply(lambda x: x.sample(n=min(n, len(x)))).reset_index(drop=True)
  return df.groupby('label').apply(lambda x: x.sample(n=min(n, len(x)))).reset_index(drop=True)
  return df.groupby('label').apply(lambda x: x.sample(n=min(n, len(x)))).reset_index(drop=True)
  return df.groupby('label').apply(lambda x: x.sample(n=min(n, len(x)))).reset_index(drop=True)
  return df.groupby('label').apply(lambda x: x.sample(n=min(n, len(x)))).reset_index(drop=True)
  return df.groupby('label').apply(lambda x: x.sample(n=min(n, len(x)))).reset_index(drop=True)
  return df.groupby('label').apply(lambda x: x.sample(n=min(n, len(x)))).reset_index(drop=True)
  return df.groupby('label').apply(lambda x: x.sample(n=min(n, len(x)))).reset_index(drop=True)
  return df.groupby('label').apply(lambda x: x.sample(n=min(n, len(x)))).reset_index(drop=True)
  return df.groupby('label').apply(lambd

## Push to Github

In [15]:
!git config --global user.email 'email'
!git config --global user.name 'username'

!git clone https://<USERNAME>:<TOKEN>@github.com/elephanti/NLPProject2024.git
%cd NLPProject2024
!cp -r /content/datasets /content/NLPProject2024/datasets
!cp -r /content/inputs /content/NLPProject2024/inputs

!git add datasets
!git add inputs
!git commit -m "Add generated datasets"
!git push

Cloning into 'NLPProject2024'...
remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 7 (delta 0), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (7/7), 152.86 KiB | 756.00 KiB/s, done.
/content/NLPProject2024
fatal: /content/datasets: '/content/datasets' is outside repository at '/content/NLPProject2024'
fatal: /content/inputs: '/content/inputs' is outside repository at '/content/NLPProject2024'
On branch main
Your branch is up to date with 'origin/main'.

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mdatasets/[m
	[31minputs/[m

nothing added to commit but untracked files present (use "git add" to track)
Everything up-to-date
