# Notebook for free speech feature extraction 

In [None]:
import pandas as pd
from textblob import TextBlob
from statistics import mean, stdev
import numpy as np
import nltk
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize, word_tokenize
from scipy import spatial
from scipy import stats
import string
import math
from pycorenlp import StanfordCoreNLP
from scipy.stats import linregress
import torch
from transformers import BertTokenizer, BertModel

# only if it has not been downloaded, uncomment:
# nltk.download('averaged_perceptron_tagger')

In [None]:
# Read the input data from a spreadsheet called "free_speech.xlsx".

# COLUMNS:
# - FILE (a unique designator),
# - TRANSCRIPT (human-transcribed transcript)
# - GROUP (1 = control, 2 = aMCI, 3 = AD)

free_speech = pd.read_excel('free_speech.xlsx')

In [None]:
# Token count and number of distinct tokens per transcript. Tokens come from
# plain whitespace splitting, so punctuation stays attached and case is kept.
free_speech['participant_wc'] = free_speech['TRANSCRIPT'].apply(lambda text: len(text.split()))
free_speech['participant_types'] = free_speech['TRANSCRIPT'].apply(lambda text: len(set(text.split())))


In [None]:
# Type-token ratio: distinct tokens divided by total tokens, per transcript.
type_token_ratio = [
    n_types / n_tokens
    for n_types, n_tokens in zip(free_speech['participant_types'], free_speech['participant_wc'])
]

free_speech['participant_type_token_ratio'] = type_token_ratio


In [None]:
# Stored value is the natural log of wc ** (types ** -0.165), computed as
# log(wc ** (types ** -0.165)) = (types ** -0.165) * log(wc)
brunets_index = [
    n_types ** (-0.165) * math.log(n_tokens)
    for n_types, n_tokens in zip(free_speech['participant_types'], free_speech['participant_wc'])
]

free_speech['participant_brunets_index'] = brunets_index


In [None]:
# Sentiment polarity of every sentence (as split by TextBlob), summarised per
# participant as mean / max / min / sample standard deviation.

participant_mean_sentiment = []
participant_max_sentiment = []
participant_min_sentiment = []
participant_stdv_sentiment = []

for transcript in free_speech.TRANSCRIPT:
    all_sentiments = [sentence.sentiment.polarity for sentence in TextBlob(transcript).sentences]

    if all_sentiments:
        participant_mean_sentiment.append(mean(all_sentiments))
        participant_max_sentiment.append(max(all_sentiments))
        participant_min_sentiment.append(min(all_sentiments))
    else:
        # no sentences at all: nothing to summarise, record missing values
        participant_mean_sentiment.append(float('nan'))
        participant_max_sentiment.append(float('nan'))
        participant_min_sentiment.append(float('nan'))

    # statistics.stdev needs at least two data points and raises
    # StatisticsError otherwise, so a one-sentence transcript gets NaN.
    if len(all_sentiments) > 1:
        participant_stdv_sentiment.append(stdev(all_sentiments))
    else:
        participant_stdv_sentiment.append(float('nan'))

free_speech['participant_mean_sentiment'] = participant_mean_sentiment
free_speech['participant_max_sentiment'] = participant_max_sentiment
free_speech['participant_min_sentiment'] = participant_min_sentiment
free_speech['participant_stdv_sentiment'] = participant_stdv_sentiment


In [None]:
# parts of speech frequencies per participant
#
# Every feature is "number of tagged tokens in the group / whitespace word
# count". The table maps each output column to the Penn Treebank tags that
# count towards it; its insertion order is the order the columns are added.

noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')                   # singular, plural, proper singular, proper plural
verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')      # every verb form
adverb_tags = ('RB', 'RBR', 'RBS')                         # normal, comparative, superlative
adjective_tags = ('JJ', 'JJR', 'JJS')                      # normal, comparative, superlative

pos_columns = {
    'participant_noun_freq': noun_tags,
    'participant_determiner_freq': ('DT',),
    'participant_preposition_freq': ('IN',),
    'participant_base_verb_freq': ('VB',),
    'participant_pasttense_verb_freq': ('VBD',),
    'participant_gerund_presentparticiple_verb_freq': ('VBG',),
    'participant_pastparticiple_verb_freq': ('VBN',),
    'participant_non3rdpersonsingularpresent_verb_freq': ('VBP',),
    'participant_3rdpersonsingularpresent_verb_freq': ('VBZ',),
    'participant_TOTAL_verb_freq': verb_tags,
    'participant_to_freq': ('TO',),
    'participant_adverb_freq': adverb_tags,
    'participant_adjective_freq': adjective_tags,
    'participant_modal_freq': ('MD',),
    'participant_coordinating_conjunctions_freq': ('CC',),
    'participant_cardinals_freq': ('CD',),
    'participant_particle_freq': ('RP',),
    'participant_personal_pronoun_freq': ('PRP',),
    'participant_wh_adverbs_freq': ('WRB',),                # when
    'participant_possessive_pronoun_freq': ('PRP$',),
    'participant_wh_determiner_freq': ('WDT',),             # that
    'participant_predeterminer_freq': ('PDT',),
    'participant_interjection_freq': ('UH',),
    'participant_existential_there_freq': ('EX',),
    'participant_wh_pronoun_freq': ('WP', 'WP$'),           # who, what, whose
    # content words = verbs + nouns + adjectives + adverbs
    'participant_content_density': verb_tags + noun_tags + adjective_tags + adverb_tags,
}

pos_values = {column: [] for column in pos_columns}

for transcript, wc in zip(free_speech.TRANSCRIPT, free_speech.participant_wc):

    # how often each tag occurs in this transcript
    tag_totals = {}
    for _, tag in TextBlob(transcript).tags:
        tag_totals[tag] = tag_totals.get(tag, 0) + 1

    for column, tags in pos_columns.items():
        group_total = sum(tag_totals.get(tag, 0) for tag in tags)
        pos_values[column].append(group_total / wc)


for column, values in pos_values.items():
    free_speech[column] = values

In [None]:
# COHERENCE FUNCTIONS

# load embeddings (can be swapped for various types) - These are the "don't count, predict!" embeddings: https://zenodo.org/record/2635544#.YqpI7BPMJQI
# Each line is parsed as: word <TAB> value <TAB> value ...
embeddings = {}

# `with` guarantees the file is closed even if a line fails to parse
with open('EN-wform.w.5.cbow.neg10.400.subsmpl.txt', 'r') as input_file:
    for line in input_file:
        line = line.rstrip('\r\n')
        if not line:
            # skip blank lines instead of storing an empty-string "word"
            continue
        word, *values = line.split('\t')
        # keep every component of the vector, including the last one on the line
        embeddings[word] = [float(value) for value in values]


# GOOGLE NEWS W2V: 
# w2v = models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

def vector_sum(vectors):
    '''
    Element-wise sum of a non-empty list of equal-length vectors
    (e.g. the word embeddings of one sentence), returned as a list.
    '''
    width = len(vectors[0])

    # start from an all-zero vector of the embedding width and accumulate
    zero_vector = np.array([0] * width)
    total = sum((np.array(vec) for vec in vectors), zero_vector)

    return list(total)

# translation table that deletes every ASCII punctuation character (built once)
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def _lookup_embeddings(text):
    '''
    Return the embeddings of the whitespace-separated words of `text` that are
    present in `embeddings`, after stripping punctuation and lower-casing.
    Words without an embedding are reported on stdout and skipped.
    '''
    found = []
    for word in text.split():
        lemma = str(word).translate(_PUNCTUATION_TABLE).lower()
        if lemma in embeddings:
            found.append(embeddings[lemma])  # w2v[lemma] if using the google news vectors
        else:
            print('this word has no embedding: ', lemma)
            print(text)
    return found


def get_cosine(text1, text2):
    '''
    Cosine similarity between two texts, each represented by the sum of the
    embeddings of its words.

    Returns NaN when either text contains no word with an embedding, because
    there is then no vector to compare.
    '''
    text1_embeddings = _lookup_embeddings(text1)
    text2_embeddings = _lookup_embeddings(text2)

    # vector_sum cannot handle an empty list, so report "undefined" instead
    if not text1_embeddings or not text2_embeddings:
        return float('nan')

    text1_sum = vector_sum(text1_embeddings)
    text2_sum = vector_sum(text2_embeddings)

    # scipy returns the cosine *distance*; similarity is its complement
    return 1 - spatial.distance.cosine(text1_sum, text2_sum)

def get_ngrams_len(text, n):
    '''
    Number of n-grams of size `n` over the word_tokenize tokens of `text`.
    '''
    return sum(1 for _ in ngrams(word_tokenize(text), n))


def get_ngrams(text, n):
    '''
    Tokenize `text` with word_tokenize and return its n-grams of size `n`,
    each joined into a single space-separated string.
    '''
    tokens = word_tokenize(text)
    joined = []
    for gram in ngrams(tokens, n):
        joined.append(' '.join(gram))
    return joined


def get_slope(nums):
    '''
    Slope of the least-squares line fitted to `nums` against their
    positions 0, 1, 2, ...
    '''
    positions = range(len(nums))
    # the slope is the first field of the regression result
    return linregress(positions, nums)[0]

In [None]:
# Local coherence: cosine similarity between each 4-gram and the next one,
# summarised per participant.
# repeat with different ngram lengths -> a range of [3 - 8]

participant_overall_mean_coherence_4 = []
participant_overall_std_coherence_4 = []
participant_overall_min_coherence_4 = []
participant_overall_max_coherence_4 = []
participant_overall_slope_coherence_4 = []

for transcript in free_speech.TRANSCRIPT:

    ## CAN OPTIONALLY REMOVE STOP WORDS FIRST ##

    # n-grams of the transcript currently being processed
    fourgrams = get_ngrams(transcript, 4)

    # similarity of every 4-gram to its successor
    cosines = [get_cosine(current, following) for current, following in zip(fourgrams, fourgrams[1:])]

    participant_overall_mean_coherence_4.append(np.array(cosines).mean())
    participant_overall_std_coherence_4.append(np.array(cosines).std())
    participant_overall_min_coherence_4.append(min(cosines))
    # the maximum goes into its own list so every column gets one value per transcript
    participant_overall_max_coherence_4.append(max(cosines))
    participant_overall_slope_coherence_4.append(get_slope(cosines))

free_speech['participant_DCP_mean_coherence_4'] = participant_overall_mean_coherence_4
free_speech['participant_DCP_std_coherence_4'] = participant_overall_std_coherence_4
free_speech['participant_DCP_min_coherence_4'] = participant_overall_min_coherence_4
free_speech['participant_DCP_max_coherence_4'] = participant_overall_max_coherence_4
free_speech['participant_DCP_slope_coherence_4'] = participant_overall_slope_coherence_4


In [None]:
# coherence between interviewer MAIN question and interviewee responses

main_question = 'Can you think of a fun or particularly memorable event from when you were a child that you can tell me about?'

slopes = []
avgs = []
stds = []

for transcript in free_speech.TRANSCRIPT:
    
    cosines = []
    fourgrams = get_ngrams(transcript, 4)
    for i in range(len(fourgrams)):
        cosines.append(get_cosine(fourgrams[i], main_question))
    
    avgs.append(np.array(cosines).mean())
    stds.append(np.array(cosines).std())
    
    slope, _, _, _, std_err = stats.linregress(range(len(cosines)),cosines)
    
    slopes.append(slope)
    
free_speech['participant_coherence_to_Q_slope'] = slopes
free_speech['participant_coherence_to_Q_avg'] = avgs
free_speech['participant_coherence_to_Q_std'] = stds


In [None]:
# BERT

# Load the pretrained 'bert-base-uncased' tokenizer and model by name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def _bert_cls_vector(text):
    '''
    Encode `text` with the module-level BERT tokenizer/model and return the
    last-layer hidden state of its first token ([CLS]) as a numpy vector.
    '''
    # token ids as a tensor with a leading batch dimension of 1
    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)

    # inference only: skip autograd bookkeeping to save time and memory
    with torch.no_grad():
        # first output is the last hidden state: (batch_size, sequence_length, hidden_size)
        last_hidden_state = model(input_ids)[0]

    # single batch entry, first token
    return last_hidden_state[0][0].numpy()


def get_cosine_BERT(text1, text2):
    '''
    Cosine similarity between the BERT [CLS] vectors of two texts.
    '''
    cls_vector1 = _bert_cls_vector(text1)
    cls_vector2 = _bert_cls_vector(text2)

    # scipy returns the cosine *distance*; similarity is its complement
    cos = 1 - spatial.distance.cosine(cls_vector1, cls_vector2)

    return cos

# Local coherence with BERT: cosine similarity between each 4-gram and the
# next one, summarised per participant.
# again, redo this with different ngram sizes in [3-8]

participant_BERT_mean_coherence_4 = []
participant_BERT_std_coherence_4 = []
participant_BERT_min_coherence_4 = []
participant_BERT_max_coherence_4 = []
participant_BERT_slope_coherence_4 = []

for transcript in free_speech.TRANSCRIPT:
    fourgrams = get_ngrams(transcript, 4)

    # similarity of every 4-gram to its successor
    cosines = [get_cosine_BERT(current, following) for current, following in zip(fourgrams, fourgrams[1:])]

    participant_BERT_mean_coherence_4.append(np.array(cosines).mean())
    participant_BERT_std_coherence_4.append(np.array(cosines).std())
    participant_BERT_min_coherence_4.append(min(cosines))
    participant_BERT_max_coherence_4.append(max(cosines))
    participant_BERT_slope_coherence_4.append(get_slope(cosines))

free_speech['participant_BERT_mean_coherence_4'] = participant_BERT_mean_coherence_4
free_speech['participant_BERT_std_coherence_4'] = participant_BERT_std_coherence_4
free_speech['participant_BERT_min_coherence_4'] = participant_BERT_min_coherence_4
free_speech['participant_BERT_max_coherence_4'] = participant_BERT_max_coherence_4
free_speech['participant_BERT_slope_coherence_4'] = participant_BERT_slope_coherence_4


In [None]:
# now we want to parse each sentence and get the max parse depth of each

# first download stanford corenlp https://stanfordnlp.github.io/CoreNLP/download.html
# then run the server inside the folder:

# export CLASSPATH="`find . -name '*.jar'`"
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer [port?]

nlp = StanfordCoreNLP('http://localhost:9000')

# marker stored in place of a depth when a sentence could not be parsed
PARSE_FAILED = 'sentence too long'

all_max_depths = []
# for each utterance
for utterance in free_speech.TRANSCRIPT:

    # one entry per sentence of this utterance: a depth or PARSE_FAILED
    max_depths = []
    for text in sent_tokenize(utterance):

        # get the parse
        output = nlp.annotate(text, properties={
            'annotators': 'parse',
            'outputFormat': 'json'
        })

        try:
            parse = output['sentences'][0]['parse'].split('\n')
        except (TypeError, KeyError, IndexError):
            # The result is not the expected JSON structure (presumably a
            # plain-text error from the server, e.g. a timeout - verify).
            # Record the failure inside this utterance's own list so the
            # outer list keeps exactly one entry per transcript row.
            max_depths.append(PARSE_FAILED)
            continue

        # depth heuristic: the largest number of closing brackets found on a
        # single line of the pretty-printed parse, minus one
        max_depths.append(max(line.count(')') for line in parse) - 1)

    # record max depths of all sentences in an utterance
    all_max_depths.append(max_depths)

# list of maximum depth per sentence in utterance (not used as a feature)
free_speech['participant_parse_depth_per_sentence'] = all_max_depths

participant_parse_max_depth = []
participant_parse_min_depth = []
participant_parse_mean_depth = []
participant_parse_std_depth = []

for listofdepths in all_max_depths:
    # keep only real depths; every failure marker is ignored for the statistics
    depths = [depth for depth in listofdepths if depth != PARSE_FAILED]

    if depths:
        participant_parse_max_depth.append(max(depths))
        participant_parse_min_depth.append(min(depths))
        participant_parse_mean_depth.append(np.array(depths).mean())
        participant_parse_std_depth.append(np.array(depths).std())
    else:
        # no sentence of this utterance could be parsed: record missing values
        participant_parse_max_depth.append(float('nan'))
        participant_parse_min_depth.append(float('nan'))
        participant_parse_mean_depth.append(float('nan'))
        participant_parse_std_depth.append(float('nan'))

free_speech['participant_parse_max_depth'] = participant_parse_max_depth
free_speech['participant_parse_min_depth'] = participant_parse_min_depth
free_speech['participant_parse_mean_depth'] = participant_parse_mean_depth
free_speech['participant_parse_std_depth'] = participant_parse_std_depth


In [None]:
# Filled pauses ("um", "ah", "uh"): raw count and count relative to word count.

participant_ums_or_ahs = []
participant_ums_or_ahs_freq = []

filler_words = {'um', 'ah', 'uh'}

for utterance, wc in zip(free_speech.TRANSCRIPT, free_speech.participant_wc):
    # Strip punctuation from both ends of each token before comparing, so that
    # fillers written as "um," or "uh." are counted as well.
    total_ums_ahs = sum(
        1
        for word in utterance.split()
        if word.strip(string.punctuation).lower() in filler_words
    )

    participant_ums_or_ahs.append(total_ums_ahs)
    participant_ums_or_ahs_freq.append(total_ums_ahs/wc)

free_speech['participant_ums_or_ahs_count'] = participant_ums_or_ahs
free_speech['participant_ums_or_ahs_freq'] = participant_ums_or_ahs_freq


In [None]:
# Write the frame, including the feature columns added above, to a new spreadsheet.
free_speech.to_excel('free_speech_features.xlsx')

# Computing the f statistics for individual features

In [None]:
# One-way ANOVA (F statistic, p value) of every feature column across groups.

# columns that are not numeric features and are therefore skipped
non_feature_columns = ('FILE', 'TRANSCRIPT', 'GROUP', 'participant_parse_depth_per_sentence')

# the group subsets do not depend on the column, so build them once
group_1 = free_speech[free_speech.GROUP == 1]
group_2 = free_speech[free_speech.GROUP == 2]
group_3 = free_speech[free_speech.GROUP == 3]
group_2_or_3 = free_speech[(free_speech.GROUP == 2) | (free_speech.GROUP == 3)]

for column in free_speech.columns:
    if column in non_feature_columns:
        continue

    # missing values are dropped per group before testing
    values_1 = group_1[column].dropna()
    values_2 = group_2[column].dropna()
    values_3 = group_3[column].dropna()
    values_2_or_3 = group_2_or_3[column].dropna()

    overall = tuple(stats.f_oneway(values_1, values_2, values_3))
    one_vs_two = tuple(stats.f_oneway(values_1, values_2))
    one_vs_three = tuple(stats.f_oneway(values_1, values_3))
    two_vs_three = tuple(stats.f_oneway(values_2, values_3))
    one_vs_rest = tuple(stats.f_oneway(values_1, values_2_or_3))

    print(column, 'overall', overall, '1vs2', one_vs_two, '1vs3', one_vs_three, '2vs3', two_vs_three, '1vs23', one_vs_rest)