# Short text predictor

In this notebook, we will do the following 
1. Use a simple data set (ATIS) https://www.kaggle.com/hassanamin/atis-airlinetravelinformationsystem 
2. Create three types of splits. Every split will create three types of data - training data, testing data and production data
    - Random split
    - Length based split 
    - Confidence based split

Length and confidence based split are used to introduce some drift

3. Train a classifier on the train data (the cells below use scikit-learn's `SVC`; an `MLPClassifier` helper is also defined). Verify on test data and also look at the performance of the model on the production data
4. Fit a short text predictor using the trained model, train/test data
5. Calculate the predictions as returned by the predictor on the production data


In [1]:
import os
import sys

# Root of the local UQ360 checkout. The default is the original author's path;
# set the UQ360_ROOT environment variable to point at your own clone instead
# of editing this cell.
UQ360_ROOT = os.environ.get("UQ360_ROOT", "/Users/anupamamurthi/Documents/GitHub/UQ360/")

# Only append once so that re-running the cell does not grow sys.path.
if UQ360_ROOT not in sys.path:
    sys.path.append(UQ360_ROOT)


# Performance Predictor on ATIS dataset


In [2]:
# Load ATIS raw data. A pointer to the data can also be found here: https://www.kaggle.com/hassanamin/atis-airlinetravelinformationsystem/version/1
import pandas as pd

import os
li_data = []
li_labels = []
li_len = []

# Resolve the CSV relative to the UQ360 checkout. The default root is the
# original author's path; set the UQ360_ROOT environment variable to use
# another clone without editing the notebook.
uq360_root = os.environ.get("UQ360_ROOT", "/Users/anupamamurthi/Documents/GitHub/UQ360/")
atis_train_csv = os.path.join(uq360_root, "data", "text", "atis", "atis.train.w-intent.iob.csv")

df = pd.read_csv(atis_train_csv, index_col=None, header=0)

# Last expression of the cell: show the first rows as a quick sanity check.
df.head()

Unnamed: 0.1,Unnamed: 0,example,intent,iob
0,0,i want to fly from boston at 838 am and arrive...,atis_flight,O O O O O O B-fromloc.city_name O B-depart_tim...
1,1,what flights are available from pittsburgh to ...,atis_flight,O O O O O O B-fromloc.city_name O B-toloc.city...
2,2,what is the arrival time in san francisco for ...,atis_flight_time,O O O O B-flight_time I-flight_time O B-fromlo...
3,3,cheapest airfare from tacoma to orlando,atis_airfare,O B-cost_relative O O B-fromloc.city_name O B-...
4,4,round trip fares from pittsburgh to philadelph...,atis_airfare,O B-round_trip I-round_trip O O B-fromloc.city...


In [3]:
# create training data and labels
# The lists are rebuilt from scratch (rather than appended to) so that
# re-running this cell does not duplicate every example in npdata.
li_data = [df['example']]
li_labels = [df['intent']]

# Raw utterances as a 1-D numpy array of strings.
frame = pd.concat(li_data, axis=0, ignore_index=True)
npdata = frame.to_numpy()

# Intent label for each utterance, aligned row-by-row with npdata.
frame_labels = pd.concat(li_labels, axis=0, ignore_index=True)
npdata_labels = frame_labels.to_numpy()

# Every utterance must have exactly one label.
assert npdata.shape == npdata_labels.shape

print("Data")
print(npdata[:10])

print("Labels data")
print(npdata_labels[:10])

Data
['i want to fly from boston at 838 am and arrive in denver at 1110 in the morning'
 'what flights are available from pittsburgh to baltimore on thursday morning'
 'what is the arrival time in san francisco for the 755 am flight leaving washington'
 'cheapest airfare from tacoma to orlando'
 'round trip fares from pittsburgh to philadelphia under 1000 dollars'
 'i need a flight tomorrow from columbus to minneapolis'
 'what kind of aircraft is used on a flight from cleveland to dallas'
 'show me the flights from pittsburgh to los angeles on thursday'
 'all flights from boston to washington'
 'what kind of ground transportation is available in denver']
Labels data
['atis_flight' 'atis_flight' 'atis_flight_time' 'atis_airfare'
 'atis_airfare' 'atis_flight' 'atis_aircraft' 'atis_flight' 'atis_flight'
 'atis_ground_service']


In [4]:
# Simple implementation to fit and transform text data

import numpy as np
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow_hub as hub


class UseTransformer(TransformerMixin):
    '''
    Wrapper to run the Universal Sentence Embeddings (USE) encoder.
    Organizes the USE into the fit, transform and fit_transform standard methods of TransformerMixin.

    The TF-Hub module is loaded lazily on the first transform() call and then
    reused, so encoding several splits with the same instance does not reload
    the model each time.
    '''
    # TF-Hub handle of the encoder used by transform().
    MODULE_URL = 'https://tfhub.dev/google/universal-sentence-encoder/4'

    # Cached encoder; stays None until transform() is first called on an instance.
    _encoder = None

    def fit(self, X, y=None, **fit_params):
        '''
        No-op: nothing is learned from X. Returns self so calls can be chained.
        '''
        return self

    def transform(self, X):
        '''
        Encode the texts in X and return the embeddings as a numpy array.

        X is flattened to 1-D before encoding; any array-like of strings is
        accepted (numpy array, list, ...).
        '''
        if self._encoder is None:
            self._encoder = hub.load(self.MODULE_URL)
        # np.asarray(...).ravel() equals X.ravel() for ndarrays and also
        # supports inputs that have no ravel() method, such as plain lists.
        return self._encoder(np.asarray(X).ravel()).numpy()

In [5]:
def train_model(x, y):
    """
    Fit a scikit-learn ``MLPClassifier`` with default hyper-parameters.

    x: training features, y: training labels (both passed straight to ``fit``).
    returns the fitted model object
    """
    from sklearn.neural_network import MLPClassifier

    classifier = MLPClassifier()
    classifier.fit(x, y)
    return classifier

In [6]:
def train_model_svm(x, y):
    """
    Fit a support-vector classifier with probability estimates enabled.

    x: training features, y: training labels (both passed straight to ``fit``).
    returns the fitted model object
    """
    from sklearn.svm import SVC

    svm_classifier = SVC(probability=True)
    svm_classifier.fit(x, y)
    return svm_classifier

# Use Case: Random Split


In [7]:
# Create train test prod data

def create_train_test_prod_split(x, y, test_size=0.25, prod_size=0.25, random_state=42):
    """
    Randomly split (x, y) into train, test and prod subsets.

    The data is split twice: first a hold-out of ``test_size`` is separated
    from the training data, then ``prod_size`` of that hold-out becomes the
    prod set and the remainder the test set. With the defaults this gives
    roughly 75% train, 18.75% test and 6.25% prod.

    x, y: numpy arrays of equal length (their ``.shape`` is printed).
    test_size: fraction of the data held out from training.
    prod_size: fraction of the hold-out that is used as prod data.
    random_state: seed used for both splits.

    returns x_train, y_train, x_test, y_test, x_prod, y_prod
    """
    from sklearn.model_selection import train_test_split

    # First cut: training data vs. everything held out.
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        x, y, test_size=test_size, random_state=random_state)

    # Second cut: divide the hold-out into test and prod.
    x_test, x_prod, y_test, y_prod = train_test_split(
        x_holdout, y_holdout, test_size=prod_size, random_state=random_state)

    print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, x_prod.shape, y_prod.shape)

    print("Training data size:", x_train.shape)
    print("Test data size:", x_test.shape)
    print("Prod data size:", x_prod.shape)

    return x_train, y_train, x_test, y_test, x_prod, y_prod

In [8]:
# Report the corpus size, then split it at random into train / test / prod subsets.
print("Total data points", npdata.shape)
x_train, y_train, x_test, y_test, x_prod, y_prod = create_train_test_prod_split(npdata, npdata_labels)

Total data points (4952,)
(3714,) (3714,) (928,) (928,) (310,) (310,)
Training data size: (3714,)
Test data size: (928,)
Prod data size: (310,)


In [11]:
# Fit a basic SVM classifier after encoding

import logging
import tensorflow as tf
tf.get_logger().setLevel(logging.ERROR)

# UseTransformer.fit() is a no-op, so transform() is called directly on the raw text.
obj = UseTransformer()
print("Training data before encoding", x_train.shape)
x_train_encoded = obj.transform(X=x_train)
print("Training data after encoding", x_train_encoded.shape)
# Base classifier for the random-split use case, trained on the embeddings.
model = train_model_svm(x_train_encoded, y_train)

Training data before encoding (3714,)


2021-10-03 22:59:57.545092: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-10-03 22:59:59.511597: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Training data after encoding (3714, 512)


In [12]:
# Encode the held-out test and prod text with the same encoder object used for the training text.
x_test_encoded = obj.transform(X=x_test)
print("Test data after encoding", x_test_encoded.shape)

x_prod_encoded = obj.transform(X=x_prod)
print("Prod data after encoding", x_prod_encoded.shape)

# acc on test data
acc = model.score(x_test_encoded, y_test)
print("acc on test", acc)


# acc on prod data
# This uses the prod labels; the predictor fitted later in the notebook is given only x_prod.
score = model.score(x_prod_encoded, y_prod)
print("acc on prod", score)

Test data after encoding (928, 512)
Prod data after encoding (310, 512)
acc on test 0.9633620689655172
acc on prod 0.9290322580645162


In [13]:
model

SVC(probability=True)

In [14]:
# It is possible to train the predictor using encoded data or using raw text. 

# In the below example, we are using raw text to train the predictor but x_train can be swapped with x_train_encoded

from uq360.algorithms.blackbox_metamodel.short_text_classification import ShortTextClassificationWrapper
import numpy as np
import pickle

# Wrap the trained base classifier in the short-text performance predictor.
p = ShortTextClassificationWrapper(base_model=model)

# Fit on the raw-text train and test splits together with their labels.
p.fit(x_train, y_train, x_test, y_test)

Batch features : None
Pointwise features : ['confidence_top', 'confidence_delta', 'confidence_entropy', 'gbm', 'class_frequency', 'mlp', 'svc']
Blackbox features : None
Predictor type : text_ensemble
Features extracted for : odict_keys(['confidence_top', 'confidence_delta', 'confidence_entropy', 'gbm_1', 'gbm_2', 'class_frequency', 'mlp_1', 'mlp_2', 'svc_1', 'svc_2'])


In [15]:
# Check the predictions on prod data

# x_prod is raw text. Predicting using raw text. Encoding happens inside the Predictor

out, y_pred, y_score = p.predict(x_prod)

Incoming data contains raw text.
Using an encoder.... %s <uq360.utils.utils.UseTransformer object at 0x19fb71c40>
Shapes before encoding %s (310,)
Shapes after encoding %s (310, 512)
Features extracted for : odict_keys(['confidence_top', 'confidence_delta', 'confidence_entropy', 'gbm_1', 'gbm_2', 'class_frequency', 'mlp_1', 'mlp_2', 'svc_1', 'svc_2'])


In [16]:
out

92.146829810901

# Use Case: Length based split


In [17]:
# Re-assemble the full corpus (raw text and labels) from the three random-split subsets.
# Both arrays are concatenated in the same (train, test, prod) order so rows stay aligned.
x_1 = np.concatenate((x_train, x_test, x_prod), axis=0)
y_1 = np.concatenate((y_train, y_test, y_prod), axis =0)

# Keep a second reference to the classifier trained in the random-split use case.
base_model = model

In [18]:

# calculate the length (number of characters) of every example utterance in x_1
len_of_x_1 = np.asarray([len(i) for i in x_1])

In [19]:
# find the median utterance length; the next cell uses it as the threshold between the two buckets
median = np.median(len_of_x_1)

In [20]:
# create two buckets - one holds the indices of utterances shorter than the median length,
# the other holds the indices of utterances whose length is >= the median length.
# np.where returns a tuple, so the index array itself is element [0] of each bucket.
len_less_than_median = np.where(len_of_x_1 < median)
len_greater_than_median = np.where(len_of_x_1 >= median)

In [21]:
# Sizes for the drifted split: half of all samples form the train/test pool and the other half is prod.
# No sampling happens in this cell; the values computed here are only printed.

total_samples = x_1.shape[0]
train_test_samples = x_1.shape[0] * 0.5
prod_samples = train_test_samples

print("Total samples", total_samples)
print("Train test samples", train_test_samples)
print("Prod samples", prod_samples)

Total samples 4952
Train test samples 2476.0
Prod samples 2476.0


In [22]:
# Fit a basic classifier after encoding

import logging
import tensorflow as tf
tf.get_logger().setLevel(logging.ERROR)

# Encode the whole corpus (x_1) at once; despite the "Training data" label in the
# printed messages, this covers the train, test and prod text together.
obj = UseTransformer()
print("Training data before encoding", x_1.shape)
x_1_train_encoded = obj.transform(X=x_1)
print("Training data after encoding", x_1_train_encoded.shape)

Training data before encoding (4952,)
Training data after encoding (4952, 512)


In [23]:
def split(x, y, bucket_1_indices, bucket_2_indices, split_ratio=0.3, test_size = 0.25,
          sampling_seed=None, train_test_fraction=0.5):
    """
    Build a deliberately skewed train/test/prod split from two index buckets.

    A train/test pool of ``train_test_fraction * len(x)`` samples is drawn
    without replacement: ``split_ratio`` of it from bucket 1 and
    ``1 - split_ratio`` of it from bucket 2. Every bucket index that was not
    drawn goes to the prod set. The pool is then divided into train and test
    with a fixed-seed ``train_test_split``.

    x, y: numpy arrays indexed by the bucket indices.
    bucket_1_indices, bucket_2_indices: index buckets, either the tuple
        returned by ``np.where`` on a 1-D array or a plain 1-D index array.
    split_ratio: share of the train/test pool taken from bucket 1.
    test_size: fraction of the train/test pool used as test data.
    sampling_seed: optional seed for the bucket sampling. ``None`` (default)
        uses numpy's global random state, as before.
    train_test_fraction: share of all samples that forms the train/test pool.

    raises ValueError if a bucket holds fewer indices than must be drawn from it.

    returns: x_train, y_train, x_test, y_test, x_prod, y_prod
    """
    from sklearn.model_selection import train_test_split

    # np.where returns a 1-tuple for 1-D input; unwrap it once so sampling and
    # the set difference below operate on the same index array.
    bucket_1 = bucket_1_indices[0] if isinstance(bucket_1_indices, tuple) else np.asarray(bucket_1_indices)
    bucket_2 = bucket_2_indices[0] if isinstance(bucket_2_indices, tuple) else np.asarray(bucket_2_indices)

    train_test_samples = x.shape[0] * train_test_fraction
    n_from_bucket_1 = int(train_test_samples * split_ratio)
    n_from_bucket_2 = int(train_test_samples * (1 - split_ratio))

    for bucket_name, bucket, wanted in (("bucket 1", bucket_1, n_from_bucket_1),
                                        ("bucket 2", bucket_2, n_from_bucket_2)):
        if wanted > len(bucket):
            raise ValueError(
                "Cannot draw %d samples from %s, which only holds %d indices"
                % (wanted, bucket_name, len(bucket)))

    # A dedicated RandomState makes the draw reproducible when a seed is given.
    rng = np.random if sampling_seed is None else np.random.RandomState(sampling_seed)
    training_test_data_from_bucket_1 = rng.choice(bucket_1, n_from_bucket_1, replace=False)
    training_test_data_from_bucket_2 = rng.choice(bucket_2, n_from_bucket_2, replace=False)

    # Whatever was not drawn for train/test becomes prod data.
    prod_data_from_bucket_1 = np.setdiff1d(bucket_1, training_test_data_from_bucket_1)
    prod_data_from_bucket_2 = np.setdiff1d(bucket_2, training_test_data_from_bucket_2)

    training_test_data_indices = np.concatenate((training_test_data_from_bucket_1, training_test_data_from_bucket_2), axis=0)
    prod_indices = np.concatenate((prod_data_from_bucket_1, prod_data_from_bucket_2), axis=0)

    training_test_data = x[training_test_data_indices]
    training_test_label = y[training_test_data_indices]

    prod_test_data = x[prod_indices]
    prod_test_label = y[prod_indices]

    x_train_new, x_test_new, y_train_new, y_test_new = train_test_split(training_test_data, training_test_label,
                                                                    test_size=test_size,
                                                                random_state=42)

    print("Training data size:", x_train_new.shape)
    print("Test data size:", x_test_new.shape)
    print("Prod data size:", prod_test_data.shape)

    return x_train_new, y_train_new, x_test_new, y_test_new, prod_test_data, prod_test_label

In [24]:
def train_model_svm(x, y):
    """
    Train an ``SVC`` (with ``probability=True``) on the given features and labels.

    Same helper as the ``train_model_svm`` defined earlier in this notebook.

    returns the fitted model object
    """
    from sklearn.svm import SVC

    fitted_svc = SVC(probability=True)
    fitted_svc.fit(x, y)
    return fitted_svc

In [25]:
# split
# split_ratio=0.2: 20% of the train/test pool is drawn from the shorter-than-median bucket
# and 80% from the other bucket; every index that is not drawn ends up in prod.
x_1_train, y_1_train, x_1_test, y_1_test, x_1_prod, y_1_prod = split(x_1_train_encoded, y_1,len_less_than_median, len_greater_than_median, 0.2)

Training data size: (1856, 512)
Test data size: (619, 512)
Prod data size: (2477, 512)


In [26]:
# train a model on the length-skewed training set (inputs are embeddings, not raw text)
model_train_on_length_based_split = train_model_svm(x_1_train, y_1_train)

# check accuracy on test and prod set
print("accuracy on test", model_train_on_length_based_split.score(x_1_test, y_1_test))
print("accuracy on prod", model_train_on_length_based_split.score(x_1_prod, y_1_prod))

accuracy on test 0.9192245557350566
accuracy on prod 0.9236980218005651


In [27]:
# fit a predictor

# from performance_predictors import Predictor
import numpy as np
import pickle

from uq360.algorithms.blackbox_metamodel.short_text_classification import ShortTextClassificationWrapper
import numpy as np
import pickle

# Performance predictor for the model trained on the length-based split.
p2 = ShortTextClassificationWrapper(base_model=model_train_on_length_based_split)

# All inputs here are embeddings rather than raw text.
p2.fit(x_1_train, y_1_train, x_1_test, y_1_test)

# predict() is given only the prod features; the first returned value is printed below.
pred, y_pred, y_score = p2.predict(x_1_prod)
print(pred)

Batch features : None
Pointwise features : ['confidence_top', 'confidence_delta', 'confidence_entropy', 'gbm', 'class_frequency', 'mlp', 'svc']
Blackbox features : None
Predictor type : text_ensemble
Features extracted for : odict_keys(['confidence_top', 'confidence_delta', 'confidence_entropy', 'gbm_1', 'gbm_2', 'class_frequency', 'mlp_1', 'mlp_2', 'svc_1', 'svc_2'])
Incoming data is already encoded
Features extracted for : odict_keys(['confidence_top', 'confidence_delta', 'confidence_entropy', 'gbm_1', 'gbm_2', 'class_frequency', 'mlp_1', 'mlp_2', 'svc_1', 'svc_2'])
87.47826237948264


In [35]:
print("predicted accuracy", pred)

predicted accuracy 87.47826237948264


# Use Case: Median Confidence based split


In [28]:
# Grab x and y by concatenating train, test, prod data
# The embeddings are stacked in (train, test, prod) order.
x = np.concatenate((x_train_encoded, x_test_encoded, x_prod_encoded), axis=0)

In [29]:

y = np.concatenate((y_train, y_test, y_prod), axis =0)

In [30]:
base_model = model

# use the base model and grab the top confidence for every data point that we have

# np.sort sorts along the last axis in ascending order, so the last column of
# confs_sorted holds the largest value of each row of x_proba.
x_proba = base_model.predict_proba(x)
confs_sorted = np.sort(x_proba) 
top_confs = confs_sorted[:,-1]

In [31]:
# find the median top confidence; the next cell uses it as the threshold between the two buckets
median  = np.median(top_confs)

In [32]:
# create two buckets: indices whose top confidence is below the median, and indices at or above it
# np.where returns a tuple, so the index array itself is element [0] of each bucket.
less_than_median = np.where(top_confs < median)
greater_than_median = np.where(top_confs >= median)

In [33]:
# Given the two buckets, shuffle the data into train, test, prod sets
# split_ratio=0.3: 30% of the train/test pool comes from the below-median bucket and 70% from the other one.

x_train_new, y_train_new, x_test_new, y_test_new, prod_test_data, prod_test_label = split(x, y,less_than_median,greater_than_median, 0.3)

Training data size: (1856, 512)
Test data size: (619, 512)
Prod data size: (2477, 512)


In [34]:
# train a new model using the training data created in the previous step
model_trained_on_conf_based_split = train_model_svm(x_train_new, y_train_new)
# acc on test data
acc = model_trained_on_conf_based_split.score(x_test_new, y_test_new)
print("acc on test", acc)

# acc on prod data
# NOTE: `acc` is reused here, so the test accuracy above is overwritten after it is printed.
acc = model_trained_on_conf_based_split.score(prod_test_data, prod_test_label)
print("acc on prod", acc)

acc on test 0.9499192245557351
acc on prod 0.846992329430763


In [38]:
# train a performance predictor

from uq360.algorithms.blackbox_metamodel.short_text_classification import ShortTextClassificationWrapper
import numpy as np
import pickle

# Performance predictor for the model trained on the confidence-based split.
p1 = ShortTextClassificationWrapper(base_model=model_trained_on_conf_based_split)

# All inputs here are embeddings rather than raw text.
p1.fit(x_train_new, y_train_new, x_test_new, y_test_new)

# predict() is given only the prod features; this rebinds pred, y_pred and y_score from the earlier use case.
pred, y_pred, y_score = p1.predict(prod_test_data)


Batch features : None
Pointwise features : ['confidence_top', 'confidence_delta', 'confidence_entropy', 'gbm', 'class_frequency', 'mlp', 'svc']
Blackbox features : None
Predictor type : text_ensemble
Features extracted for : odict_keys(['confidence_top', 'confidence_delta', 'confidence_entropy', 'gbm_1', 'gbm_2', 'class_frequency', 'mlp_1', 'mlp_2', 'svc_1', 'svc_2'])
Incoming data is already encoded
Features extracted for : odict_keys(['confidence_top', 'confidence_delta', 'confidence_entropy', 'gbm_1', 'gbm_2', 'class_frequency', 'mlp_1', 'mlp_2', 'svc_1', 'svc_2'])


In [39]:
print("Predicted accuracy" , pred)

Predicted accuracy 79.11417297198746
