<h1>Export Predictions on Evaluation Dataset</h1>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datasets import Dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

<br><h2>1. Loading Dataset</h2><br>

In [2]:
# Load the evaluation set from CSV; the bare name on the last line renders the frame.
test_df = pd.read_csv(
    "data/evaluation_data.csv",
)
test_df

Unnamed: 0.1,Unnamed: 0,sid,transcript,partial_transcripts
0,0,CF8fd2e7ac0e4ff2316bb18a9ffe5e9e68,Your call has been forwarded to an automated v...,(0): You're?/(0.065): Your call./(0.422): Your...
1,1,CF2a9819f31261b93230e2ad68888bb479,Lamancha. This is Carrie. Can I help you?,(0): Ramon./(0.119): Clermont./(0.365): Lamanc...
2,2,CF94166971f53d5b09ac2e411755ead266,"Yes, so let me says hello, you've reached this...",(0): Yes./(0.129): Who was it just for?/(0.255...
3,3,CF13315d4973c7c6333ed31aac7b406f46,Brian toner.,(0): Ryan./(0.245): Ryan tone./(0.265): Brian ...
4,4,CFd61f7d8a06b913c4b2170bebe99b3331,"Hello, darling. Static is not available. Pleas...","(0): Hello./(0.51): Hello./(0.549): Hello, ba..."
...,...,...,...,...
725,725,CFa21859e752416d642f9d93aa1c290f66,718 is not available to take your call. Please...,(0): Seven./(0.311): 337./(0.328): 371./(0.73)...
726,726,CF5d58b2164c51a9ec5295e48c16beca5c,Zack Hess is currently unavailable.,(0): Zach./(0.448): Zack Hess./(0.449): Zack./...
727,727,CFf631ae0c49dc94fd43ae77da07c233e5,"Who you've reached, Jessica, Russell, Gilliam,...",(0): Who./(0.626): Who./(0.791): Who you've r...
728,728,CFc52805d7b70ebf85c0a924d3f2ef6749,"Hi, you've reached brooks'. Schaefer. I'm not ...","(0): Hi./(0.011): hi, you've/(0.393): Hi, you'..."


In [3]:
# Count missing values per column before any cleaning.
test_df.isna().sum()

Unnamed: 0              0
sid                     0
transcript             13
partial_transcripts    13
dtype: int64

In [4]:
# Replace missing transcripts with empty strings (presumably so that the tokenizer
# only ever receives str values -- confirm). `partial_transcripts` is left as-is.
test_df["transcript"] = test_df["transcript"].fillna(value="")

In [5]:
# Re-check missing values per column after filling `transcript`.
test_df.isna().sum()

Unnamed: 0              0
sid                     0
transcript              0
partial_transcripts    13
dtype: int64

In [6]:
# Show rows that repeat an earlier row across every column; an empty frame means none.
test_df.loc[test_df.duplicated()]

Unnamed: 0.1,Unnamed: 0,sid,transcript,partial_transcripts


In [7]:
# Wrap only the `transcript` column in a Dataset; the other columns stay in test_df.
test_ds = Dataset.from_pandas(
    test_df.loc[:, ["transcript"]]
)
test_ds

Dataset({
    features: ['transcript'],
    num_rows: 730
})

<br><h2>2. Loading Model</h2><br>

In [8]:
# Load the tokenizer from the same directory the model is loaded from below.
tokenizer_test = AutoTokenizer.from_pretrained(
    "results/best_model"
)

In [9]:
def preprocess_function(examples):
    """Tokenize the ``transcript`` field of a batch with truncation enabled.

    Args:
        examples: Mapping with a ``"transcript"`` entry holding the text(s)
            to encode (a list of strings when called via ``map(batched=True)``).

    Returns:
        Whatever ``tokenizer_test`` returns for those texts.
    """
    transcripts = examples["transcript"]
    return tokenizer_test(transcripts, truncation=True)
# Tokenize the whole dataset, passing examples to the function in batches.
tokenized_test = test_ds.map(
    preprocess_function,
    batched=True,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [10]:
# Load the saved classifier with a two-class head.
model_test = AutoModelForSequenceClassification.from_pretrained(
    "results/best_model",
    num_labels=2,
)

In [11]:
# Trainer used here only for prediction: no train/eval datasets are supplied.
# The collator is built from the same tokenizer that encoded the data.
trainer_test = Trainer(
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer_test),
    args=TrainingArguments(output_dir="./results"),
    model=model_test,
)

<br><h2>3. Export Predictions</h2><br>

In [12]:
# Run the model over the tokenized evaluation set; the result's `.predictions`
# attribute is consumed in the next step.
test_predictions = trainer_test.predict(test_dataset=tokenized_test)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: transcript. If transcript are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 730
  Batch size = 8


In [13]:
# Take the index of the highest score along the last axis as the predicted class id
# (assumes `predictions` is a single (n_rows, n_classes) array -- TODO confirm).
test_df["label"] = test_predictions.predictions.argmax(axis=-1)

In [14]:
# Convert predicted class ids to readable names.
# The ids are re-derived from `test_predictions` rather than read back from
# test_df["label"]: mapping the column onto itself is not idempotent, because
# re-running the cell would look up the already-converted strings in the dict
# and silently turn every label into NaN.
# NOTE(review): assumes the model was trained with 0 = voicemail, 1 = human --
# confirm against the training notebook.
test_df["label"] = pd.Series(
    test_predictions.predictions.argmax(-1),
    index=test_df.index,
).map({0: "voicemail", 1: "human"})

In [15]:
# Rows the model labelled as voicemail.
test_df.loc[test_df["label"].eq("voicemail")]

Unnamed: 0.1,Unnamed: 0,sid,transcript,partial_transcripts,label
0,0,CF8fd2e7ac0e4ff2316bb18a9ffe5e9e68,Your call has been forwarded to an automated v...,(0): You're?/(0.065): Your call./(0.422): Your...,voicemail
2,2,CF94166971f53d5b09ac2e411755ead266,"Yes, so let me says hello, you've reached this...",(0): Yes./(0.129): Who was it just for?/(0.255...,voicemail
4,4,CFd61f7d8a06b913c4b2170bebe99b3331,"Hello, darling. Static is not available. Pleas...","(0): Hello./(0.51): Hello./(0.549): Hello, ba...",voicemail
5,5,CF8af87dca72e65ccca8f9262cf49ad1ef,"Hi, you've reached the voicemail box of been k...","(0): How you?/(0.029): hi, you've/(0.392): Hi,...",voicemail
6,6,CF2a26689474c087b9807565a878880b6b,You have reached the voicemail of Michael Burd...,(0): You./(0.005): You have./(0.022): You have...,voicemail
...,...,...,...,...,...
725,725,CFa21859e752416d642f9d93aa1c290f66,718 is not available to take your call. Please...,(0): Seven./(0.311): 337./(0.328): 371./(0.73)...,voicemail
726,726,CF5d58b2164c51a9ec5295e48c16beca5c,Zack Hess is currently unavailable.,(0): Zach./(0.448): Zack Hess./(0.449): Zack./...,voicemail
727,727,CFf631ae0c49dc94fd43ae77da07c233e5,"Who you've reached, Jessica, Russell, Gilliam,...",(0): Who./(0.626): Who./(0.791): Who you've r...,voicemail
728,728,CFc52805d7b70ebf85c0a924d3f2ef6749,"Hi, you've reached brooks'. Schaefer. I'm not ...","(0): Hi./(0.011): hi, you've/(0.393): Hi, you'...",voicemail


In [16]:
# Rows the model labelled as human.
test_df.loc[test_df["label"].eq("human")]

Unnamed: 0.1,Unnamed: 0,sid,transcript,partial_transcripts,label
1,1,CF2a9819f31261b93230e2ad68888bb479,Lamancha. This is Carrie. Can I help you?,(0): Ramon./(0.119): Clermont./(0.365): Lamanc...,human
3,3,CF13315d4973c7c6333ed31aac7b406f46,Brian toner.,(0): Ryan./(0.245): Ryan tone./(0.265): Brian ...,human
9,9,CF2b59ce06f4671d0c7a7b5756f9d2ed7f,Hello.,(0): Hello./(0.808): Hello./(0.816): Hello.,human
10,10,CF8863ca3c1a886f4ad8e82d5d0148bf7a,Hello.,(0): Hello./(0.814): Hello./(0.827): Hello.,human
11,11,CFb2e307590ee4ee076bb2fa0c4366c40f,Hi. This is selling HR.,"(0): Why is the?/(0.237): Hi, is this?/(0.404)...",human
...,...,...,...,...,...
708,708,CF8e1bf207c057304127edc206c0c6e15e,Good afternoon. Thank you for calling Moda Hea...,(0): Good./(0.013): good after/(0.382): Good a...,human
713,713,CFa0cf2bcc85738645f269814516818aee,"Hello. Yes, speaking.",(0): Hello./(0.419): Hello./(2.024): Hello./(2...,human
720,720,CFd23228f5116af40add8c134d5df0a5cb,Hello.,(0): Hello./(0.331): Hello./(2.283): Hello./(6...,human
721,721,CFd8dcb910764cc9c8ead8de25e457e3ef,Hello.,(0): Hello./(0.797): Hello./(0.808): Hello./(3...,human


In [17]:
# Export the labelled evaluation set.
# index=False keeps the positional RangeIndex out of the file. Without it, each
# save/load round-trip adds another "Unnamed: N" column, which is how the input
# already came to carry "Unnamed: 0.1" and "Unnamed: 0".
test_df.to_csv("results/evaluation_data_results.csv", index=False)