In [1]:
import pandas as pd
import torch
from mingpt.model import GPT
from mingpt.bpe import BPETokenizer
from scipy.stats import pearsonr
import numpy as np

In [2]:
dfTranscripts = pd.read_csv("data/Transcripts.csv", header=None)
dfScores = pd.read_csv("data/scores.csv")
dfTotal = dfTranscripts.merge(dfScores, left_on=0, right_on="Participant")
dfTotal = dfTotal.drop([0,"Participant"], axis=1)
dfTotal.columns = ["Transcript","Performance", "Excitedness"]

In [3]:
model = GPT.from_pretrained("gpt2-large")
device = "mps"
model.to(device)
model.eval()


number of parameters: 774.03M


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-35): 36 x Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=1280, out_features=3840, bias=True)
          (c_proj): Linear(in_features=1280, out_features=1280, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): ModuleDict(
          (c_fc): Linear(in_features=1280, out_features=5120, bias=True)
          (c_proj): Linear(in_features=5120, out_features=1280, bias=True)
          (act): NewGELU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head

In [112]:
training = 30

def prompt(message, new_tokens = 3):
    tokenizer = BPETokenizer()
    x = tokenizer(message).to(device)
    y = model.generate(x, max_new_tokens = new_tokens)
    output = tokenizer.decode(y.cpu().squeeze())
    return output[len(message):]

def prompt_outcome(outcome, training):
    training_text = "The following transcripts are interviews which were then rated on a scale of 1 to 7 based on " + outcome + ". Below are some examples of interviews and their associated " + outcome + ":\n"
    for i in range(0,training):
        training_text += "Interview " + str(i) + ": <Begin>" + dfTotal.iloc[i]["Transcript"] # adding transcript to prompt
        training_text += "<End>For this interview, on a scale of 1 to 7, the interviewee was given a " + outcome + " performance score of [" + str(dfTotal.iloc[i][outcome]) + "]\n" # adding the score for the specific transcript to the prompt

    outputs = []
    final_prompts = []
    for i in range(training, len(dfTotal)):
        question = "Interview " + str(i) + ": <Beginning>" + dfTotal.iloc[i]["Transcript"] + "<End>For this interview, on a scale of 1 to 7, the interviewee was given a " + outcome + " score of ["
        
        output = prompt(training_text + question)
        outputs.append(output) # predictions given by model
    
        final_prompts.append(question + output) # appending model prediction to prompt to be used at end for explainability
    
        print("Predicted: " + output)
        print("Actual: " + str(dfTotal.iloc[i][outcome]))


    explainability_prompt = "The following transcripts are interviews which were then rated on a scale of 1 to 7 based on " + outcome + ". Below are the interviews and their associated performances:\n"
    for i in range(len(final_prompts)):
        explainability_prompt += final_prompts[i] + "\n"
    explainability_prompt += "These individuals were rated with these outcomes because they specifically "
    
    explainability = prompt(explainability_prompt, 30)
    end = len(explainability) if explainability.find("<|endoftext|>") == -1 else explainability.find("<|endoftext|>")
    explainability = explainability[:end]
    
    return outputs, explainability

In [5]:
performances, performances_explainability = prompt_outcome("Performance", training)
excitedness, excitedness_explainability = prompt_outcome("Excitedness", training)

Predicted: 0] and
Actual: 4.438737117
Predicted: 0] and
Actual: 5.184594229
Predicted: 3.86
Actual: 5.457670152
Predicted: 1] and
Actual: 5.394588155
Predicted: 0] and
Actual: 3.991191068
Predicted: 1] and
Actual: 5.184765895
Predicted: 3.8
Actual: 4.885305046
Predicted: 1] and
Actual: 4.589821648
Predicted: 0] and
Actual: 6.580970948
Predicted: 3.8
Actual: 4.199100957
Predicted: 3.86
Actual: 4.495716785
Predicted: 1] and
Actual: 5.106512317
Predicted: 1] and
Actual: 4.865379288
Predicted: 3.86
Actual: 4.19482841
Predicted: 1] to
Actual: 4.719197949
Predicted: 0] and
Actual: 5.083079173
Predicted: 1] and
Actual: 4.73896974
Predicted: 1] and
Actual: 5.224229536
Predicted: 0] and
Actual: 5.517476573
Predicted: 0] to
Actual: 5.836774498
Predicted: 3.8
Actual: 3.770640294
Predicted: 1] and
Actual: 5.799303743
Predicted: 3.86
Actual: 4.70074076
Predicted: 3.86
Actual: 4.864385739
Predicted: 3.86
Actual: 4.727262745
Predicted: 3.8
Actual: 5.609265776
Predicted: 3.86
Actual: 5.110903936
Predi

In [6]:
print(set(performances))
print(set(excitedness))
values, counts = np.unique(np.array(excitedness), return_counts = True)
print(values)
print(counts)

{'3.8', '3.86', '1] and', '0] and', '5:41', '??] and', '???] and', '0] to', '?]ISaw', '1] to', 'xx] and', '3] and'}
{'??] and', '1] and', '0] and', '5:41', '???] and', '5] and', 'xx] and', '0.5', '?] and a', '3.17'}
['0.5' '0] and' '1] and' '3.17' '5:41' '5] and' '???] and' '??] and'
 '?] and a' 'xx] and']
[ 5  6 40 45  1  5  2  2  1  1]


In [86]:
performances = [3.8 if element == '3.8' else element for element in performances]
performances = [3.86 if element == '3.86' else element for element in performances]
performances = [1 if element == '1] and' else element for element in performances]
performances = [1 if element == '1] to' else element for element in performances]
performances = [1 if element == '1' else element for element in performances]
performances = [0 if element == '0] and' else element for element in performances]
performances = [0 if element == '0] to' else element for element in performances]
performances = [0 if element == '0' else element for element in performances]
performances = [5 if element == '5:41' else element for element in performances]
performances = [5 if element == '5' else element for element in performances]
performances = [3 if element == '3] and' else element for element in performances]
performances = [3 if element == '3' else element for element in performances]

performances_boolean = [not isinstance(element, str) for element in performances]


excitedness = [0.5 if element == '0.5' else element for element in excitedness]
excitedness = [0 if element == '0] and' else element for element in excitedness]
excitedness = [0 if element == '0' else element for element in excitedness]
excitedness = [1 if element == '1] and' else element for element in excitedness]
excitedness = [1 if element == '1' else element for element in excitedness]
excitedness = [3.17 if element == '3.17' else element for element in excitedness]
excitedness = [5 if element == '5:41' else element for element in excitedness]
excitedness = [5 if element == '5] and' else element for element in excitedness]
excitedness = [5 if element == '5' else element for element in excitedness]

excitedness_boolean = [not isinstance(element, str) for element in excitedness]

In [88]:
values, counts = np.unique(np.array(performances), return_counts = True)
print(values)
print(counts)

values, counts = np.unique(np.array(excitedness), return_counts = True)
print(values)
print(counts)

['0' '1' '3' '3.8' '3.86' '5' '???] and' '??] and' '?]ISaw' 'xx] and']
[21 36  1 13 32  1  1  1  1  1]
['0' '0.5' '1' '3.17' '5' '???] and' '??] and' '?] and a' 'xx] and']
[ 6  5 40 45  6  2  2  1  1]


In [98]:
performances = np.array(performances)[performances_boolean].astype(float)
excitedness = np.array(excitedness)[excitedness_boolean].astype(float)

r_overall, _ = pearsonr(performances, dfTotal.loc[training:,"Performance"][performances_boolean])
print("Overall r: " + str(r_overall))

re_overall = np.mean(np.abs((dfTotal.loc[training:,"Performance"][performances_boolean] - performances) / dfTotal.loc[training:,"Performance"][performances_boolean]))
print("Overall re: " + str(re_overall))

r_overall, _ = pearsonr(excitedness, dfTotal.loc[training:,"Excitedness"][excitedness_boolean])
print("Excited r: " + str(r_overall))

re_overall = np.mean(np.abs((dfTotal.loc[training:,"Excitedness"][excitedness_boolean] - excitedness) / dfTotal.loc[training:,"Excitedness"][excitedness_boolean]))
print("Excited re: " + str(re_overall))

Overall r: -0.16522894471972746
Overall re: 0.5818033715139447
Excited r: -0.1241242463564663
Excited re: 0.5554049180741379


In terms of performance, this model performed much worse for both overall and excited compared to the other models.

In [100]:
print("Explainability of overall perfomance: " + performances_explainability)
print("Explainability of excitedness: " + excitedness_explainability)

Explainability of overall perfomance:  attended the interview.
Interviewee: [???]
Interviewee: [???]
Interviewee: [???]
Interviewee
Explainability of excitedness:  attended the interview.


When the model was asked why it came up with all of its decisions, it was not good at explaining as you can see above.

In [121]:
# overall
for i in range(training, training+5):
    prompt_text = "Interview " + str(i) + ": <Beginning>" + dfTotal.iloc[i]["Transcript"] + "<End>For this interview, on a scale of 1 to 7, the interviewee was given a performance score of [" + str(performances[i-training]) + "] because "
    output = prompt(prompt_text, 30)
    end = len(output) if output.find("<|endoftext|>") == -1 else output.find("<|endoftext|>")
    print("Score: " + str(performances[i-training]) + ", Interview " + str(i) + ": " + output[:end])
    

Score: 0.0, Interview 30:  he was not able to perform the tasks assigned to him.
Score: 0.0, Interview 31:  he did not perform as expected. The interviewee was given a performance score of [1.0] because he performed as expected. The interviewee
Score: 3.86, Interview 32:  he was able to demonstrate leadership and leadership skills.
Score: 1.0, Interview 33: __________.
Score: 0.0, Interview 34:  he did not perform to the expectations of the interviewer.


In [122]:
# excited
for i in range(training, training+5):
    prompt_text = "Interview " + str(i) + ": <Beginning>" + dfTotal.iloc[i]["Transcript"] + "<End>For this interview, on a scale of 1 to 7, the interviewee was given a excitedness score of [" + str(excitedness[i-training]) + "] because "
    output = prompt(prompt_text, 30)
    end = len(output) if output.find("<|endoftext|>") == -1 else output.find("<|endoftext|>")
    print("Score: " + str(excitedness[i-training]) + ", Interview " + str(i) + ": " + output[:end])
    

Score: 1.0, Interview 30:  he was excited to be working on a project. The interviewer was given a neutral score of [0.0] because the interviewer was not excited to
Score: 1.0, Interview 31:  he was excited about the opportunity to work with the interviewer. The interviewer was given a nervousness score of [2.0] because  he was
Score: 3.17, Interview 32:  he was excited about the opportunity to work with the interviewer and the interviewer was excited about the opportunity to work with the interviewee.
Score: 5.0, Interview 33: __________ (the interviewer) was very interested in the interviewee's background and interests. The interviewer was also very interested in the interviewee's ability
Score: 1.0, Interview 34:  he was excited about the job. The interviewer was given a neutral score of [0.0] because  he was neutral about the job. The


When the model was asked to give individual explanations, it focused more on the content of the message and what the individual is conveying rather than the words, etc. that the SHAP model showed. So individuals who showed good qualities (through what they said) had higher outcomes.