In [38]:
import pandas as pd
import numpy as np
import statistics as stat
import seaborn as sns
import matplotlib.pyplot as plt
import random
import re

#Loading the dataset and parsing the required information from the dataset. 

In [39]:
# Reading in data
prosodic_file = "data/prosodic_features.csv"
scores_file = "data/scores.csv"
transcript_file = "data/transcripts.csv"

# Read csvs into a file
prosodic_data = pd.read_csv(prosodic_file)
scores_data = pd.read_csv(scores_file)
transcript_data = pd.read_csv(transcript_file)

# Add a column that corresponds to participant for easy splitting for prosodic data
prosodic_data['Participant'] = prosodic_data['participant&question'].str.extract(r'^(PP?\d+)')
prosodic_data['Participant'] = prosodic_data['Participant'].str.lower()
# Make the entire transcript lower case
transcript_data['transcript'] = transcript_data['transcript'].str.lower()

In [40]:
print(prosodic_data.head)

<bound method NDFrame.head of     participant&question    duration    energy  min_pitch   max_pitch  \
0                   P1Q1   51.952125  0.015331  75.232657  396.635613   
1                   P1Q2   38.677312  0.015185  75.165527  397.613041   
2                   P1Q3   43.593896  0.014680  71.034761  395.930688   
3                   P1Q4   23.435813  0.008920  74.938673  248.733738   
4                   P1Q5   13.274833  0.003432  93.949854  263.669188   
..                   ...         ...       ...        ...         ...   
685               PP89Q1   76.333333  0.018540  74.344414  382.978887   
686               PP89Q2   60.628396  0.017723  72.955935  396.654629   
687               PP89Q3  131.118042  0.028155  72.806624  293.105036   
688               PP89Q4   85.464125  0.027269  65.836502  387.213912   
689               PP89Q5   49.306229  0.007684  77.520809  375.522678   

     mean_pitch   pitch_sd   pitch_abs  pitch_quant  pitchUvsVRatio  ...  \
0    127.989222  

In [41]:
# Splitting the participants into 5 folds
num_folds = 5

# Grab participant numbers from the scores csv file
interviews = scores_data['Participant'].unique()
participants = list(set([re.sub(r'^pp?|q\d+', '', item) for item in interviews]))
random.shuffle(participants)
participant_folds = [participants[i::num_folds] for i in range(num_folds)]

for i, fold in enumerate(participant_folds):
  print(f"Fold {i + 1}: {fold}")
  print("\tLength: ", len(fold))

# Grab all the correct interview names associated with each participant
interview_folds = []
for fold in participant_folds:
  interview_folds.append([item for num in fold for item in (f"p{num}", f"pp{num}")])

Fold 1: ['72', '56', '33', '27', '64', '6', '85', '57', '3', '14', '1', '63', '42', '25']
	Length:  14
Fold 2: ['30', '49', '34', '24', '29', '55', '16', '17', '61', '45', '74', '89', '37', '69']
	Length:  14
Fold 3: ['66', '83', '7', '77', '20', '35', '52', '84', '11', '47', '13', '73', '8', '15']
	Length:  14
Fold 4: ['22', '86', '81', '32', '43', '12', '78', '58', '71', '31', '59', '5', '65', '67']
	Length:  14
Fold 5: ['21', '62', '4', '50', '70', '79', '53', '44', '80', '60', '10', '76', '48']
	Length:  13


In [42]:
def get_data_splits(data, fold_number):
    """
    Split data into training, validation, and testing sets based on a specified fold.
    
    Parameters:
        data (DataFrame): The complete dataset.
        fold_number (int): The fold to use for testing (0-based index).
        
    Returns:
        tuple: (training_set, validation_set, testing_set)
    """
    # Quick check on fold number
    assert 0 <= fold_number < len(interview_folds), "Fold_number must be between 0 and len(folds) - 1"

    # Split the data
    test_set = data[data['Participant'].isin( interview_folds[fold_number] )]
    val_set = data[data['Participant'].isin( interview_folds[(fold_number + 1) % len(interview_folds)] )]
    train_set_parts = [item for i, fold in enumerate(interview_folds) if i not in [fold_number, (fold_number + 1) % len(interview_folds)] for item in fold]
    train_set = data[data['Participant'].isin(train_set_parts)]
    
    return train_set, val_set, test_set

train_set, val_set, test_set = get_data_splits(prosodic_data, 0)
print("Prosodic Data: ")
print(train_set.shape, " ", val_set.shape, " ", test_set.shape)
train_set, val_set, test_set = get_data_splits(scores_data, 0)
print("Scores Data: ")
print(train_set.shape, " ", val_set.shape, " ", test_set.shape)
# print(test_set.head())
train_set, val_set, test_set = get_data_splits(transcript_data, 0)
print("Transcript Data: ")
print(train_set.shape, " ", val_set.shape, " ", test_set.shape)
# print(test_set.head())

Prosodic Data: 
(410, 37)   (140, 37)   (140, 37)
Scores Data: 
(82, 3)   (28, 3)   (28, 3)
Transcript Data: 
(82, 4)   (28, 4)   (28, 4)


Extracting language features

In [None]:
# !pip install vaderSentiment


In [54]:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import torch
from transformers import BertTokenizer, BertModel

nltk.download('punkt')  # Tokenizer
nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')  # POS Tagger
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon') # Vader

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gyani\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\gyani\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\gyani\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gyani\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

Syntactic Vectorization with CountVectorizer


In [55]:

# Syntactic vectorizer: CountVectorizer

# Remove common stop words in english and ignore words that appear fewer than 2 times
vectorizer = CountVectorizer(stop_words='english', min_df=2) 
X = vectorizer.fit_transform(transcript_data['transcript'])

# Convert from sparse X matrix to a denser one for easy use
X_dense = X.toarray()
feature_names_count = vectorizer.get_feature_names_out()

word_count = pd.DataFrame(X_dense, columns=feature_names_count)

word_count.head()

Unnamed: 0,000,10,100,13,14,15,16,18,20,200,...,yep,yes,yo,yoga,york,young,younger,youngest,yup,zone
0,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Syntactic Vectorization with TFIDF Vectorizer


In [56]:

# Syntactic vectorizer: TFIDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english', min_df=2)
tfidf_matrix = tfidf.fit_transform(transcript_data['transcript'])
feature_names_tf = tfidf.get_feature_names_out()
tfidf_count = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names_tf)

tfidf_count.head()

Unnamed: 0,000,10,100,13,14,15,16,18,20,200,...,yep,yes,yo,yoga,york,young,younger,youngest,yup,zone
0,0.068513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.031465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Statistical Features (word count and average word length)


In [57]:

# Statistical Features (word count and average word length)

# Word Count for the entire interview
features = pd.DataFrame()
features['word count'] = transcript_data['transcript'].apply(lambda x: len(x.split()))

# Average word length for the interview
features['avg word length'] = transcript_data['transcript'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()))

features.head()

Unnamed: 0,word count,avg word length
0,613,4.438825
1,1118,4.510733
2,751,4.528628
3,717,4.281729
4,645,4.688372


Part of speech tagging 

In [59]:
features['pos tagging'] = transcript_data['transcript'].apply(lambda x: pos_tag(word_tokenize(x)))
features['pos tagging'].head()
# NN: Noun singular
# VB: Verb base form
# JJ: Adjective
# RB: Adverb
# DT: Determiner
# IN: Preposition or subordination conjunction

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\gyani\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


0    [(interviewer, NN), (:, :), (so, RB), (how, WR...
1    [(interviewer, NN), (:, :), (so, RB), (how, WR...
2    [(interviewer, NN), (:, :), (so, RB), (tell, V...
3    [(interviewer, NN), (:, :), (so, RB), (how, WR...
4    [(interviewer, NN), (:, :), (how, WRB), (are, ...
Name: pos tagging, dtype: object

Sentiment Analysis with Vader


In [61]:
# Sentiment Analysis with Vader
analyzer = SentimentIntensityAnalyzer()
features['sentiment'] = transcript_data['transcript'].apply(lambda x: analyzer.polarity_scores(x))

features['sentiment'].head()

0    {'neg': 0.013, 'neu': 0.859, 'pos': 0.128, 'co...
1    {'neg': 0.026, 'neu': 0.852, 'pos': 0.122, 'co...
2    {'neg': 0.02, 'neu': 0.874, 'pos': 0.105, 'com...
3    {'neg': 0.027, 'neu': 0.842, 'pos': 0.131, 'co...
4    {'neg': 0.038, 'neu': 0.838, 'pos': 0.124, 'co...
Name: sentiment, dtype: object

Word Embeddings with BERT


In [62]:

# Word embeddings with BERT (Hugging Face)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [64]:
# Separate interveiw into sentences
def process_transcript(text):
  sentences = re.split(r'(?<=\|)', text)
  cleaned_sentences = [
    re.sub(r'^(interviewer:|interviewee:)\s*', '', s.strip().replace('|', '')) for s in sentences if s.strip()
  ]
  return cleaned_sentences

transcript_data['Processed Transcript'] = transcript_data['transcript'].apply(process_transcript)

# Tokenize the text
def get_bert_embeddings(sentences):
  embeddings = []
  for sentence in sentences:
    # Tokenize and get input Id and attention mask
    inputs = tokenizer(sentence, return_tensors="pt", padding="max_length", truncation=True, max_length=20)

    # Use BERT
    with torch.no_grad():
      outputs = model(**inputs)

    # Pool output
    cls_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    embeddings.append(cls_embedding)

  # Return an array that is the same size
  return np.array(embeddings).flatten()[:6912]

features['word embeddings'] = transcript_data['Processed Transcript'].apply(get_bert_embeddings)

features['word embeddings'].head()

0    [0.20702128, -0.34244847, 0.10633583, 0.048458...
1    [0.20338869, -0.42223406, -0.0042748363, -0.00...
2    [0.3116174, -0.17067035, -0.094398685, 0.18208...
3    [0.21430025, -0.47013944, -0.07054973, 0.12508...
4    [0.17230605, -0.37611702, -0.10042618, -0.0261...
Name: word embeddings, dtype: object

Language Features Extracted:
- Syntactic Vectorization with CountVectorizer
- Syntactic Vectorization with TFIDF Vectorizer
- Word Count
- Average Word Length
- Part of Speech Tagging
- Sentiment Analysis with Vader
- Word Embedding with BERT
  
A total of 7 distinctive features.

The human readable features are: CountVectorizer because it is a count of the words throughout the interview, the Statistical Features, Part of Speech Tagging (with a little extra interpretation), and Sentiment Analysis with Vader.

Language feature selection part


In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Define the features and the target outcomes
X = features[['word count', 'avg word length', 'pos tagging', 'sentiment', 'word embeddings']]
y = outcomes  # Assuming 'outcomes' is a predefined variable containing the target outcomes

# Apply SelectKBest to extract the top k features
k = 5  # Number of top features to select
selector = SelectKBest(score_func=f_classif, k=k)
X_new = selector.fit_transform(X, y)

# Get the selected feature names
selected_features = X.columns[selector.get_support(indices=True)]

# Display the selected features and their scores
feature_scores = selector.scores_[selector.get_support()]
for feature, score in zip(selected_features, feature_scores):
    print(f"Feature: {feature}, Score: {score}")

# Discuss findings
# The selected features are the most relevant to the considered outcomes based on the ANOVA F-value.
# These features can provide actionable insights to the user by highlighting the most important aspects
# of the language used in the interviews. For example, if 'word count' is positively associated with
# successful outcomes, users can focus on increasing the length of their responses. Similarly, if
# 'sentiment' is negatively associated with successful outcomes, users can work on improving the
# positivity of their language.
