# Preprocessing

- This notebook collects the preprocessing steps from travel_bert.ipynb, LinearSVC.ipynb, and a tutorial from COLX_585.
- Inputs: the path to the data file and a domain name. Output: batches of vectorized comments and tags.

In [26]:
import torch
import torchtext
from torchtext.data import Field, LabelField
from torchtext.data import TabularDataset
from torchtext.data import Iterator, BucketIterator
import spacy
import en_core_web_sm
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
## Fix the random seed and pick the device to work on
manual_seed = 77
torch.manual_seed(manual_seed)

# Use the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Seed the CUDA generator as well whenever at least one GPU is visible.
n_gpu = torch.cuda.device_count()
if n_gpu >= 1:
    torch.cuda.manual_seed(manual_seed)

## 1. Read data

In [3]:
def read_data(path, checked=False, min_tag_count=2):
    """
    Read a csv file and filter out incomplete rows and infrequent tags.

    Parameters:
    ------------
        path: data path
        checked: boolean (True-only use checked data, False-use all data)
        min_tag_count: minimum number of rows a tag must have to be kept.
            The default of 2 removes tags that occur only once.
    Return:
    ------------
        df: filtered data
    """
    df = pd.read_csv(path)
    # dropna() without arguments discards a row when ANY of its columns is missing.
    df = df.dropna()
    if checked:
        # Keep only the rows whose tag has been manually confirmed.
        df = df[df['Tags confirmed'] == 'checked']
    # Tag frequencies are counted AFTER the filters above, so a tag can become
    # rare (and be removed) because of them.
    tag_counts = df['Tags'].value_counts()
    rare_tags = tag_counts[tag_counts < min_tag_count].index
    return df[~df['Tags'].isin(rare_tags)]

In [22]:
def split_and_write(df, test_size=0.2, domain=None, out_dir="../data"):
    """
    This function splits the data into train and test dataframes and writes
    them to csv files.

    Parameters:
    ------------
        df: data
        test_size: proportion of test
        domain: domain name (vaccine, travel, etc). When omitted, the
            notebook-level variable `domain` is looked up at call time.
        out_dir: directory the two csv files are written to

    Raises:
    ------------
        ValueError: if `domain` is neither passed nor defined at notebook level
    """
    if domain is None:
        # A default of `domain=domain` would be evaluated when this cell is run,
        # which fails on a fresh kernel because `domain` is only assigned in a
        # later cell. Resolve it when the function is called instead.
        domain = globals().get("domain")
        if domain is None:
            raise ValueError(
                "domain must be passed explicitly or defined as a notebook-level variable"
            )
    # random_state is fixed so the same rows land in each split on every run.
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=42)
    # Only the text and its label are written; the index is left out.
    columns = ["Comment", "Tags"]
    train_df.to_csv(f"{out_dir}/{domain}_train.csv", columns=columns, index=False)
    test_df.to_csv(f"{out_dir}/{domain}_test.csv", columns=columns, index=False)

Unnamed: 0,Unique ID,Domain,Comment,Tags
0,6070f5c0800d871e0c75d919,Vaccine,I got my jab on March 29. Your literature says...,Vaccine effectiveness / delayed dosage
1,606ac6aa8d190c273ca7ebe3,Vaccine,How reliable the shipment is ?? \r\nSpending o...,Data and tracking vaccines
2,601c05426c4b8d189822fcec,Vaccine,Critical missing info:\r\nFed Govt needs to ma...,Data and tracking vaccines
3,604e366623caed19c087f936,Travel,When coming from Portugal and the itinerary ...,Hotels
4,604498689a91901f24b82c39,Travel,Pre-entry test requirements:\nYou must show pr...,Testing


In [2]:
#pip install contextualSpellCheck

In [None]:
# Source csv and the domain whose train/test files are produced.
path = "../data/Sample comments.csv"
domain = "travel" # vac, travel

# Filter the raw data, then write the split csv files for this domain.
df = read_data(path)
split_and_write(df, domain=domain, test_size=0.2)

## 2. Torch Text

#### Define tokenizer

In [8]:
# Load the spaCy English pipeline once; tokenize_en reuses it on every call.
spacy_en = en_core_web_sm.load()


def tokenize_en(text):
    """
    Split an English string into a list of token strings using the
    tokenizer of the loaded spaCy pipeline.
    """
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens



#### Define fields

In [10]:
# Comments: tokenized with tokenize_en and lower-cased.
TEXT = Field(
    tokenize=tokenize_en,
    lower=True,
    sequential=True,
)
# Tags: one non-sequential value per example, with no unknown token.
LABEL = Field(unk_token=None, sequential=False)

#### Load dataset using TabularDataset

In [28]:
# Load the per-domain train/test csv files as torchtext datasets.
train, test = TabularDataset.splits(
    fields=[('Comment', TEXT), ('Tags', LABEL)],  # column name -> field
    skip_header=True,  # the first row is the header, not data
    format='csv',
    path="../data/",
    train=f'{domain}_train.csv',
    test=f"{domain}_test.csv",
)

#### Build vocabulary to map words and labels to integers.

In [29]:
# Both vocabularies are built from the training split only.
LABEL.build_vocab(train)
# min_freq=2 is passed for the text vocabulary only.
TEXT.build_vocab(train, min_freq=2)

In [30]:
# Report how many entries each vocabulary's string-to-index mapping holds.
print("Vocabulary size of TEXT:", len(TEXT.vocab.stoi))
print("Vocabulary size of LABEL:", len(LABEL.vocab.stoi))

Vocabulary size of TEXT: 42
Vocabulary size of LABEL: 4


#### Construct the iterators for the train and test splits. BucketIterator is used to initialize both of them.

In [36]:
# Build one iterator per split. `batch_sizes` is matched to the datasets by
# position, so it needs exactly one entry per dataset: 64 for train, 256 for test.
# NOTE(review): no `device` argument is passed here, so the `device` selected at
# the top of the notebook is not used for the batches — confirm that is intended.
train_iter, test_iter = BucketIterator.splits(
    (train, test),
    batch_sizes=(64, 256),
    sort_key=lambda x: len(x.Comment),  # order examples by the length of their comment
    sort=True,
    sort_within_batch=True,
)

In [39]:
# Peek at the first training batch only.
# Comment has shape [seq, batch size]; Tags has shape [batch size].
for batch in train_iter:
    comments, tags = batch.Comment, batch.Tags
    break  # one batch is enough as an example

print('Comments:', comments.shape)
print('Tags:', tags.shape)

Comments: torch.Size([66, 4])
Tags: torch.Size([4])
