## How to aggregate results across different systems

We are going to load classifiers that we saved before, use them to make predictions on some texts, and aggregate the results in a pandas data frame to get an overview across systems.

In [8]:
import pickle
import pandas as pd
import lab3_util as util

In [42]:
# Sample utterances, each stored next to its gold emotion label so that the
# two parallel lists built below always line up.
labelled_chat = [
    ('That is sweet of you', 'joy'),
    ('You are so funny', 'joy'),
    ('Are you a man or a woman?', 'neutral'),
    ('Chatbots make me sad and feel lonely.', 'sadness'),
    ('Your are stupid and boring.', 'anger'),
    ('Two thumbs up', 'joy'),
    ('I fell asleep halfway through this conversation', 'anger'),
    ('Wow, I am really amazed.', 'surprise'),
    ('You are amazing.', 'joy'),
    ('I feel so low being in isolation', 'sadness'),
    ('People dumping waste are horrible', 'disgust'),
    ('Its awful that you cannot stop smoking', 'disgust'),
    ('Dogs scare me', 'fear'),
    ('I am afraid I will get sick at work', 'fear'),
    ('I run away when I see a dog', 'fear'),
    ('When do you start your job?', 'neutral'),
]

# the texts that are fed to the classifiers
some_chat = [utterance for utterance, gold in labelled_chat]
# the expected label for each text, in the same order
some_chat_gold_labels = [gold for utterance, gold in labelled_chat]

### Loading a BoW model

In [29]:
# the countvectorizer
filename_vectorizer = './models/utterance_vec.sav'
# the tfidf transformer
filename_transformer = './models/utterance_transf.sav'
# the label encoder
filename_encoder = './models/label_encoder.sav'
# the classifier
filename_classifier = './models/svm_linear_clf_bow.sav'

# NOTE: pickle.load can execute arbitrary code while unpickling -- only load
# model files that you created yourself or otherwise trust.
# Every file is opened in a `with` block so its handle is closed as soon as the
# object has been unpickled, instead of staying open for the kernel's lifetime.
with open(filename_classifier, 'rb') as model_file:
    loaded_bow_classifier = pickle.load(model_file)
with open(filename_vectorizer, 'rb') as model_file:
    loaded_vectorizer = pickle.load(model_file)
with open(filename_transformer, 'rb') as model_file:
    loaded_transformer = pickle.load(model_file)
with open(filename_encoder, 'rb') as model_file:
    loaded_label_encoder = pickle.load(model_file)

### Converting the test data to get the vector representations

In [30]:
# Only transform() is called here: the loaded vectorizer and transformer are
# applied as they were saved, nothing is re-fitted on the chat utterances.
# Step 1: run the utterances through the loaded count vectorizer.
counts_from_loaded_model = loaded_vectorizer.transform(some_chat)
# Step 2: pass those counts through the loaded tf-idf transformer; the result
# is what the BoW classifier receives as input.
some_chat_tfidf = loaded_transformer.transform(counts_from_loaded_model)

### Getting the predictions

In [33]:
# Predict one encoded class index per utterance and show the raw indices.
pred_from_bow_classifier = loaded_bow_classifier.predict(some_chat_tfidf)
print(pred_from_bow_classifier)

# Look every index up in the label encoder's classes_ to get the label name.
some_chat_bow_labels = [
    loaded_label_encoder.classes_[class_index]
    for class_index in pred_from_bow_classifier
]
print(some_chat_bow_labels)

[3 3 4 5 4 4 4 6 3 4 4 4 4 4 4 4]
['joy', 'joy', 'neutral', 'sadness', 'neutral', 'neutral', 'neutral', 'surprise', 'joy', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral']


## Embedding model

### Loading an embedding model

In [35]:
# the classifier
filename_classifier = './models/svm_linear_clf_embeddings.sav'
# the frequent keywords
filename_freq_keywords = './models/frequent_keywords.sav'

# NOTE: pickle.load can execute arbitrary code while unpickling -- only load
# files that you created yourself or otherwise trust.
# The `with` blocks close each file handle right after unpickling.
with open(filename_classifier, 'rb') as model_file:
    loaded_embedding_classifier = pickle.load(model_file)
with open(filename_freq_keywords, 'rb') as model_file:
    loaded_frequent_keywords = pickle.load(model_file)

### Representing the test data

In [36]:
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api

# name of the pretrained embedding model to fetch through gensim's downloader
wordembeddings = "glove-twitter-25"

word_embedding_model = api.load(wordembeddings)

# Read the number of features from the loaded model instead of hard-coding it,
# so the value stays correct if `wordembeddings` is switched to another model
# (this model has 25 dimensions).
num_features = word_embedding_model.vector_size
print(num_features)

25


In [37]:
def tokenize_data(text):
    """Split every utterance in `text` into word tokens.

    `text` is an iterable of utterance strings. The result is a list with one
    token list per utterance, in the same order as the input. Tokenisation is
    done by `nltk.tokenize.word_tokenize`, so `nltk` must have been imported
    before the first call.
    """
    return [nltk.tokenize.word_tokenize(utterance) for utterance in text]

In [38]:
import nltk
from nltk.corpus import stopwords
# English stop words; handed to the feature extractor below
stop_words = set(stopwords.words('english'))

# Vocabulary of the embedding model, as a set.
# api.load() already returns the keyed vectors, so the old `.wv` hop is not
# needed (it only triggered a DeprecationWarning). gensim >= 4 names the
# vocabulary list `index_to_key`; older releases call it `index2word`.
vocabulary = getattr(word_embedding_model, 'index_to_key', None)
if vocabulary is None:
    vocabulary = word_embedding_model.index2word
index2word_set = set(vocabulary)

# Tokenise the utterances and turn them into feature vectors with the lab
# helper (see lab3_util.getAvgFeatureVecs for how the arguments are used).
some_chat_tokens = tokenize_data(some_chat)
some_chat_embedding_vectors = util.getAvgFeatureVecs(some_chat_tokens, loaded_frequent_keywords, stop_words, word_embedding_model, index2word_set, num_features)

Shape of our matrix is: (16, 25)
Review 0 of 16


  after removing the cwd from sys.path.


### Making the predictions

In [39]:
# Only element [0] of the helper's result is passed to the classifier
# (presumably the feature matrix -- confirm in lab3_util.getAvgFeatureVecs).
pred_from_embedding_classifier = loaded_embedding_classifier.predict(some_chat_embedding_vectors[0])
print(pred_from_embedding_classifier)

# Map the encoded class indices back to label names; this reuses the label
# encoder that was loaded together with the BoW model.
some_chat_embedding_labels = [
    loaded_label_encoder.classes_[class_index]
    for class_index in pred_from_embedding_classifier
]
print(some_chat_embedding_labels)

[3 6 4 4 4 4 4 4 3 4 4 4 4 4 4 4]
['joy', 'surprise', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'joy', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral']


## Putting the results together in a pandas frame

In [44]:
# One column each for the utterance, its gold label and the prediction of each
# system, so the systems can be compared row by row.
result_frame = pd.DataFrame({
    'Chat': some_chat,
    'Gold': some_chat_gold_labels,
    'Bow Prediction': some_chat_bow_labels,
    'Embedding Prediction': some_chat_embedding_labels,
})

result_frame

Unnamed: 0,Chat,Gold,Bow Prediction,Embedding Prediction
0,That is sweet of you,joy,joy,joy
1,You are so funny,joy,joy,surprise
2,Are you a man or a woman?,neutral,neutral,neutral
3,Chatbots make me sad and feel lonely.,sadness,sadness,neutral
4,Your are stupid and boring.,anger,neutral,neutral
5,Two thumbs up,joy,neutral,neutral
6,I fell asleep halfway through this conversation,anger,neutral,neutral
7,"Wow, I am really amazed.",surprise,surprise,neutral
8,You are amazing.,joy,joy,joy
9,I feel so low being in isolation,sadness,neutral,neutral


## End of notebook