# Notebook for free speech feature extraction 

In [1]:
import json
import math
import nltk
import torch
import string
import requests
import itertools
import numpy as np
import pandas as pd
import networkx as nx
from scipy import stats
import tensorflow as tf
from scipy import spatial
from nltk.util import ngrams
import tensorflow_hub as hub
from textblob import TextBlob
import matplotlib.pyplot as plt
from scipy.stats import linregress
from statistics import mean, stdev
from pycorenlp import StanfordCoreNLP
from gensim.models import KeyedVectors
from transformers import BertTokenizer, BertModel
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from sentence_transformers import SentenceTransformer
# only if it has not been downloaded, uncomment:
# nltk.download('averaged_perceptron_tagger')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the transcript workbooks.
# Expected columns according to the original notebook notes (not verifiable
# from this cell alone):
#   - FILE        a unique designator
#   - TRANSCRIPT  human-transcribed transcript (called 'Text' in the workbooks)
#   - GROUP       1 = control, 2 = aMCI, 3 = AD
# Earlier versions read a single workbook: pd.read_excel('Free_Speech.xlsx')


def _load_transcripts(path):
    """Read one workbook, rename 'Text' to 'TRANSCRIPT' and cast every column to str."""
    frame = pd.read_excel(path)
    frame = frame.rename(columns={'Text': 'TRANSCRIPT'})
    return frame.astype(str)


makecode = _load_transcripts('/Users/emilydoherty/Library/CloudStorage/OneDrive-UCB-O365/Emily_Papers/iSAT_discoursepaper2023/Cleaned Transcripts/Makecode/Clean/makecode_master.xlsx')
weights = _load_transcripts('/Users/emilydoherty/Library/CloudStorage/OneDrive-UCB-O365/Emily_Papers/iSAT_discoursepaper2023/Cleaned Transcripts/Weights/Clean/weights_master.xlsx')

# Pick the task to analyse: makecode or weights.
DATA = makecode
# DATA = weights
# NOTE(review): because of astype(str), missing cells become the literal
# string 'nan', so a later dropna on TRANSCRIPT would not remove them.
# DATA = DATA.dropna(subset=['TRANSCRIPT'])

### token and type count

In [None]:
# Whitespace tokenisation: str.split() with no argument never yields empty
# strings, so every item it returns counts as a token.
# participant_wc    = number of tokens in the transcript
# participant_types = number of distinct tokens (case- and punctuation-sensitive)
DATA['participant_wc'] = [len(text.split()) for text in DATA['TRANSCRIPT']]
DATA['participant_types'] = [len(set(text.split())) for text in DATA['TRANSCRIPT']]

### type token ratio

In [None]:
# Type-token ratio = distinct tokens / total tokens, one value per row.
# Computed column-wise so that a transcript with zero tokens yields NaN
# (0 / 0 in pandas) instead of raising ZeroDivisionError inside a row loop.
type_token_ratio = (DATA['participant_types'] / DATA['participant_wc']).tolist()

DATA['participant_type_token_ratio'] = type_token_ratio

### brunet's index

In [None]:
# Brunet's index is W = N ** (V ** -0.165), with N = token count and
# V = type count. The value stored here is its natural logarithm:
#     log(W) = (V ** -0.165) * log(N)
brunets_index = []

for types, wc in zip(DATA['participant_types'], DATA['participant_wc']):
    if wc > 0:
        brunets_index.append(types ** (-0.165) * math.log(wc))
    else:
        # An empty transcript has no tokens: log(0) and 0 ** -0.165 are
        # undefined, so record a missing value rather than crashing.
        brunets_index.append(math.nan)

DATA['participant_brunets_index'] = brunets_index

### count of ums ahs etc. 

In [None]:
# Filled pauses to count; extend this set to include more.
_FILLED_PAUSES = {'um', 'ah', 'uh'}

participant_ums_or_ahs = []
participant_ums_or_ahs_freq = []

for utterance, wc in zip(DATA.TRANSCRIPT, DATA.participant_wc):
    # Strip leading/trailing punctuation so that "um," or "Uh." are counted
    # as well as the bare words; internal punctuation ("uh-huh") is untouched.
    total_ums_ahs = sum(
        1
        for word in utterance.split()
        if word.strip(string.punctuation).lower() in _FILLED_PAUSES
    )

    participant_ums_or_ahs.append(total_ums_ahs)
    # Frequency is relative to the whitespace token count; an empty
    # transcript gets an explicit NaN instead of a divide-by-zero warning.
    participant_ums_or_ahs_freq.append(total_ums_ahs / wc if wc else math.nan)

DATA['participant_ums_or_ahs_count'] = participant_ums_or_ahs
DATA['participant_ums_or_ahs_freq'] = participant_ums_or_ahs_freq


### sentiment of all sentences per participant

In [None]:
import nltk
# TextBlob's sentence splitting relies on the NLTK 'punkt' tokenizer data.
nltk.download('punkt')

participant_mean_sentiment = []
participant_max_sentiment = []
participant_min_sentiment = []
participant_stdv_sentiment = []

for transcript in DATA.TRANSCRIPT:
    # Polarity of every sentence TextBlob finds in the transcript.
    all_sentiments = [
        sentence.sentiment.polarity for sentence in TextBlob(transcript).sentences
    ]

    if not all_sentiments:
        # No sentence detected (e.g. blank transcript): mean/max/min are
        # undefined, so store missing values instead of raising.
        participant_mean_sentiment.append(math.nan)
        participant_max_sentiment.append(math.nan)
        participant_min_sentiment.append(math.nan)
        participant_stdv_sentiment.append(math.nan)
        continue

    participant_mean_sentiment.append(mean(all_sentiments))
    participant_max_sentiment.append(max(all_sentiments))
    participant_min_sentiment.append(min(all_sentiments))
    # statistics.stdev needs at least two data points; a single sentence is
    # recorded as zero spread (same value the original fallback produced).
    if len(all_sentiments) > 1:
        participant_stdv_sentiment.append(stdev(all_sentiments))
    else:
        participant_stdv_sentiment.append(0)

DATA['participant_mean_sentiment'] = participant_mean_sentiment
DATA['participant_max_sentiment'] = participant_max_sentiment
DATA['participant_min_sentiment'] = participant_min_sentiment
DATA['participant_stdv_sentiment'] = participant_stdv_sentiment

### parts of speech frequencies per participant + content density

In [None]:
nltk.download('averaged_perceptron_tagger')

# Part-of-speech tag -> feature name. Tags that are not listed are ignored.
# Each feature becomes the column 'participant_<feature>_freq'.
_POS_TAG_TO_FEATURE = {
    # all nouns grouped together: singular, plural, proper singular, proper plural
    'NN': 'noun', 'NNS': 'noun', 'NNP': 'noun', 'NNPS': 'noun',
    'DT': 'determiner',
    'IN': 'preposition',
    'VB': 'base_verb',
    'VBD': 'pasttense_verb',
    'VBG': 'gerund_presentparticiple_verb',
    'VBN': 'pastparticiple_verb',
    'VBP': 'non3rdpersonsingularpresent_verb',
    'VBZ': '3rdpersonsingularpresent_verb',
    'TO': 'to',
    # all adverbs grouped together: normal, comparative, superlative
    'RB': 'adverb', 'RBR': 'adverb', 'RBS': 'adverb',
    # all adjectives grouped together: normal, comparative, superlative
    'JJ': 'adjective', 'JJR': 'adjective', 'JJS': 'adjective',
    'MD': 'modal',
    'CC': 'coordinating_conjunctions',
    'RP': 'particle',
    'CD': 'cardinals',
    'PRP': 'personal_pronoun',
    'WRB': 'wh_adverbs',            # when
    'PRP$': 'possessive_pronoun',
    'WDT': 'wh_determiner',         # that
    'PDT': 'predeterminer',
    'UH': 'interjection',
    'EX': 'existential_there',
    'WP': 'wh_pronoun', 'WP$': 'wh_pronoun',  # who, what, whose
}

# Verb sub-classes that are summed into the TOTAL_verb feature.
_VERB_FEATURES = (
    'base_verb',
    'pasttense_verb',
    'gerund_presentparticiple_verb',
    'pastparticiple_verb',
    'non3rdpersonsingularpresent_verb',
    '3rdpersonsingularpresent_verb',
)

# Order in which the frequency columns are added to DATA.
_POS_FEATURE_ORDER = (
    'noun',
    'determiner',
    'preposition',
    *_VERB_FEATURES,
    'TOTAL_verb',
    'to',
    'adverb',
    'adjective',
    'modal',
    'coordinating_conjunctions',
    'cardinals',
    'particle',
    'personal_pronoun',
    'wh_adverbs',
    'possessive_pronoun',
    'wh_determiner',
    'predeterminer',
    'interjection',
    'existential_there',
    'wh_pronoun',
)

_pos_freq_rows = []

for transcript, wc in zip(DATA.TRANSCRIPT, DATA.participant_wc):
    counts = dict.fromkeys(_POS_FEATURE_ORDER, 0)
    for _word, tag in TextBlob(transcript).tags:
        feature = _POS_TAG_TO_FEATURE.get(tag)
        if feature is not None:
            counts[feature] += 1

    counts['TOTAL_verb'] = sum(counts[name] for name in _VERB_FEATURES)
    # Content density numerator: verbs + nouns + adjectives + adverbs.
    counts['content_density'] = (
        counts['TOTAL_verb'] + counts['noun'] + counts['adjective'] + counts['adverb']
    )

    # Every count is normalised by the whitespace token count of the
    # transcript; an empty transcript gets an explicit NaN.
    _pos_freq_rows.append(
        {name: (count / wc if wc else math.nan) for name, count in counts.items()}
    )

for feature in _POS_FEATURE_ORDER:
    DATA[f'participant_{feature}_freq'] = [row[feature] for row in _pos_freq_rows]
DATA['participant_content_density'] = [row['content_density'] for row in _pos_freq_rows]

### coherence features

#### First, load the embeddings you want to compute (comment out the ones you are not currently computing for memory purposes)

In [4]:
## Comment out anything you don't want to load for memory purposes! ##
# This cell binds the module-level names used by embed() below:
# use_model, bert_tokenizer and bert_model.

# USE model (Universal Sentence Encoder, fetched from TF Hub)
use_module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" 
use_model = hub.load(use_module_url)

# BERT model
# output_hidden_states=True makes the model also return the hidden states of
# every layer, which the 'BERT_intra' branch of embed() reads.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
# eval() switches the model to inference mode (e.g. disables dropout).
bert_model.eval()

# Alternative (disabled): sentence-transformers model instead of raw BERT.
# bert_model = SentenceTransformer('all-mpnet-base-v2')
# def get_embedding_bert(text):
#     text = text.replace("\n", " ")
#     embedding = bert_model.encode(text)
#     return list(embedding)

















































































BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

Again, for memory purposes, you can delete the models you downloaded AFTER you have used them!

In [None]:
#del dcp_embeddings

### functions for coherence computations

In [5]:
def _use_encode(sentences):
    '''
    run the Universal Sentence Encoder on a list of strings and return the
    batch of embeddings; works whether the loaded module is directly callable
    or only exposes its 'default' signature
    '''
    if callable(use_model):
        return use_model(sentences)
    # `signatures` is a mapping of named functions, not a callable itself
    return use_model.signatures['default'](tf.constant(sentences))['default']


def _static_word_vectors(text, vectors, vocabulary=None):
    '''
    strip punctuation, lower-case and whitespace-split `text`, then return the
    vector of every word found in `vocabulary` (defaults to `vectors` itself)
    '''
    if vocabulary is None:
        vocabulary = vectors
    words = str(text).translate(str.maketrans("", "", string.punctuation)).lower()
    return [vectors[word] for word in words.split() if word in vocabulary]


def embed(text, embedding_type):
    '''
    returns a single embedding (inter) or a list of embeddings (intra)
    for a specified string and embedding_type

    embedding_type is '<model>_inter' or '<model>_intra' with <model> one of
    USE, ELMo, BERT, DCP, W2V, GloVe. The models/embedding tables
    (use_model, elmo_model, bert_tokenizer, bert_model, dcp_embeddings,
    w2v_embeddings, glove_embeddings) are module-level names that must have
    been loaded beforehand; only the one needed by the requested type is
    touched.

    returns None (after printing a message) for an unknown embedding_type,
    and None for DCP/W2V/GloVe when no word of `text` is in the vocabulary
    '''

    # universal sentence encoder - one vector for the whole string
    if embedding_type == 'USE_inter':
        return _use_encode([text])[0]

    # universal sentence encoder - one vector per whitespace token
    if embedding_type == 'USE_intra':
        return [_use_encode([word])[0] for word in text.split()]

    # embeddings from language models - interwindow
    if embedding_type == 'ELMo_inter':
        return elmo_model(tf.constant([text]))["default"].numpy()[0]

    # embeddings from language models - intrawindow
    if embedding_type == 'ELMo_intra':
        embeddings_tensor = elmo_model(tf.constant([text]))
        word_embeddings = embeddings_tensor['word_emb'][0]
        return [x.numpy() for x in tf.unstack(word_embeddings)]

    # BERT interwindow
    if embedding_type == 'BERT_inter':
        # encode() returns the token ids; unsqueeze adds the batch dimension
        input_ids = torch.tensor(bert_tokenizer.encode(text)).unsqueeze(0)
        # inference only, so no autograd graph is needed
        with torch.no_grad():
            outputs = bert_model(input_ids)
        # outputs[0] is the last hidden state; take the single batch entry,
        # then the vector of the first token (the CLS position)
        return outputs[0][0][0].detach().numpy()

    # BERT intrawindow
    if embedding_type == 'BERT_intra':
        marked_text = "[CLS] " + text + " [SEP]"
        tokenized_text = bert_tokenizer.tokenize(marked_text)
        indexed_tokens = bert_tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([indexed_tokens])
        with torch.no_grad():
            outputs = bert_model(tokens_tensor)
            # because the model was loaded with output_hidden_states=True,
            # the third item holds the hidden states from all layers
            # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
            hidden_states = outputs[2]
            token_embeddings = torch.stack(hidden_states, dim=0)
            # drop the batch dimension, then put the token axis first so we
            # can iterate token by token
            token_embeddings = torch.squeeze(token_embeddings, dim=1)
            token_embeddings = token_embeddings.permute(1, 0, 2)
            # skip the first and last positions ([CLS] / [SEP]) and represent
            # each remaining token by the sum of its last four layers
            token_vecs_sum = [
                torch.sum(token[-4:], dim=0) for token in token_embeddings[1:-1]
            ]
        return token_vecs_sum

    # static (non-contextual) embeddings: look up each word, then either sum
    # the word vectors (inter) or return them individually (intra)
    if embedding_type in ('DCP_inter', 'DCP_intra'):
        # word2vec: don't count, predict!
        vecs = _static_word_vectors(text, dcp_embeddings)
    elif embedding_type in ('W2V_inter', 'W2V_intra'):
        # word2vec: google news
        vecs = _static_word_vectors(text, w2v_embeddings, w2v_embeddings.key_to_index)
    elif embedding_type in ('GloVe_inter', 'GloVe_intra'):
        # GloVe
        vecs = _static_word_vectors(text, glove_embeddings)
    else:
        print('Incorrect embedding type')
        return None

    if len(vecs) == 0:
        return None
    if embedding_type.endswith('_inter'):
        return vector_sum(vecs)
    return vecs
        
        
def vector_sum(vectors):
    '''
    element-wise sum of a non-empty list of equal-length vectors, returned as
    a plain list; used to build one embedding for a whole window from the
    word vectors of non-contextualized embedding types
    '''
    width = len(vectors[0])

    # integer zero accumulator with one slot per embedding dimension
    running_total = np.zeros(width, dtype=int)

    # fold every word vector into the accumulator
    for word_vector in vectors:
        running_total = running_total + np.array(word_vector)

    return list(running_total)


def get_intra_window_cosines(text, embedding_type):
    '''
    cosine similarity between every pair of per-word embeddings in a window

    embeddings come from embed(text, embedding_type + '_intra'); returns a
    list with one similarity per unordered pair, or None when fewer than two
    embeddings are available
    '''
    window_embeddings = embed(text, embedding_type + '_intra')

    # nothing to compare: no embeddings at all, or just a single one
    if not window_embeddings or len(window_embeddings) < 2:
        return None

    return [
        1 - spatial.distance.cosine(left, right)
        for left, right in itertools.combinations(window_embeddings, 2)
    ]


def get_inter_window_cosine(text1, text2, embedding_type):
    '''
    cosine similarity between the whole-window embeddings of two texts

    embeddings come from embed(..., embedding_type + '_inter'). For USE, ELMo
    and BERT the similarity is always computed; for the other types None is
    returned when either embedding is missing or empty
    '''
    first = embed(text1, embedding_type + '_inter')
    second = embed(text2, embedding_type + '_inter')

    always_available = embedding_type in ('USE', 'ELMo', 'BERT')
    # short-circuit keeps the truthiness test away from the array-valued
    # embeddings of the three types above
    if always_available or (first and second):
        return 1 - spatial.distance.cosine(first, second)

    return None
    
def get_ngrams(text, n):
    '''
    tokenize text with word_tokenize and return its n-grams,
    each one joined back into a single space-separated string
    '''
    tokens = word_tokenize(text)
    joined = []
    for gram in ngrams(tokens, n):
        joined.append(' '.join(gram))
    return joined


def get_slope(nums):
    '''
    slope of the least-squares line fitted to nums,
    using the positions 0, 1, 2, ... as x values
    '''
    positions = range(len(nums))
    fit = linregress(positions, nums)
    return fit.slope

### between window coherence (interwindow) 

In [None]:
use_module_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
use_model = hub.load(use_module_url)


def get_embedding_USE(text):
    """Embed text with the Universal Sentence Encoder.

    ``text`` may be a single string or an iterable of strings. A single
    string returns its embedding vector; an iterable returns the batch of
    embeddings, one row per input string.
    """
    single = isinstance(text, str)
    sentences = [text] if single else list(text)

    if callable(use_model):
        vectors = use_model(sentences)
    else:
        # `signatures` is a mapping of named functions and cannot be called
        # itself; the encoder is reached through its 'default' entry.
        vectors = use_model.signatures['default'](tf.constant(sentences))['default']

    return vectors[0] if single else vectors

In [23]:
# Coherence between each utterance and the one that follows it.
embedding = 'BERT'
num_utterances = len(DATA.TRANSCRIPT)
cosines = []
mean_coherence = []
std_coherence = []
min_coherence = []
max_coherence = []
slope_coherence = []

# Pair utterances by position, so the result does not depend on the index
# labels of DATA.
_transcripts = DATA.TRANSCRIPT.tolist()
for _previous, _current in zip(_transcripts, _transcripts[1:]):
    res = get_inter_window_cosine(_previous, _current, embedding)
    # Always append exactly one value per pair: a missing similarity becomes
    # NaN and a similarity of exactly 0 is kept, so the list stays aligned
    # with the rows of DATA (num_utterances - 1 entries).
    cosines.append(np.nan if res is None else res)


In [24]:

# The first utterance has no predecessor, so a placeholder is put in front to
# give one value per row. Guarded so that re-running this cell does not add a
# second placeholder.
# NOTE(review): the placeholder 0 reads as "completely unrelated"; consider
# NaN if downstream statistics should ignore it.
if len(cosines) == len(DATA) - 1:
    cosines.insert(0, 0)


In [25]:
# Store the per-utterance similarity to the preceding utterance; `cosines`
# must contain exactly one value per row of DATA or pandas raises ValueError.
DATA[f'coherence_{embedding}_interwindow'] = cosines


In [None]:
# NOTE(review): mean_coherence, std_coherence, min_coherence and max_coherence
# are created as empty lists in the cell that computes `cosines` and nothing
# in the visible notebook appends to them, so these assignments raise a
# length-mismatch ValueError unless DATA is empty. Populate the lists (one
# value per row) before running this cell — TODO confirm the intended
# aggregation.
DATA[f'mean_coherence_{embedding}_interwindow'] = mean_coherence
DATA[f'std_coherence_{embedding}_interwindow'] = std_coherence
DATA[f'min_coherence_{embedding}_interwindow'] = min_coherence
DATA[f'max_coherence_{embedding}_interwindow'] = max_coherence


ValueError: Length of values (5275) does not match length of index (5276)

### sentence entropy

In [None]:
# BERT 
## loading this here because they have the same name as the 
## previous (different) BERT tokenizer 
# NOTE: this rebinds the module-level names bert_model and bert_tokenizer.
# After this cell runs, embed(..., 'BERT_inter' / 'BERT_intra') would use the
# masked-LM model loaded here instead of the 'bert-base-uncased' BertModel,
# so re-run the embedding-loading cell before computing coherence again.

from transformers import BertTokenizer, BertForMaskedLM

# Load pre-trained model (weights)
with torch.no_grad():
    bert_model = BertForMaskedLM.from_pretrained('bert-large-cased')
    # inference mode (e.g. disables dropout)
    bert_model.eval()
    # Load pre-trained model tokenizer (vocabulary)
    bert_tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
    
def get_BERT_score(sentence):
    """Return exp(loss) of the BERT masked-LM for ``sentence``.

    The sentence is wrapped in [CLS] ... [SEP], converted to ids, and fed to
    ``bert_model`` with the same ids as labels; the first item of the model
    output (the loss) is exponentiated and returned as a NumPy value.
    """
    wordpieces = ["[CLS]", *bert_tokenizer.tokenize(sentence), "[SEP]"]
    token_ids = bert_tokenizer.convert_tokens_to_ids(wordpieces)
    # leading list adds the batch dimension
    batch = torch.tensor([token_ids])
    # inference only: no gradients needed
    with torch.no_grad():
        outputs = bert_model(batch, labels=batch)
    loss_value = outputs[0].detach().numpy()
    return np.exp(loss_value)

# GPT 2
# Binds the module-level names gpt2_model and gpt2_tokenizer used by
# get_GPT_score.
with torch.no_grad():
    gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')
    # inference mode (e.g. disables dropout)
    gpt2_model.eval()
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
 
def get_GPT_score(sentence):
    """Return exp(loss) of the GPT-2 language model for ``sentence``.

    The sentence is encoded with ``gpt2_tokenizer`` and fed to ``gpt2_model``
    with the same ids as labels; the first item of the model output (the
    loss) is exponentiated and returned as a NumPy value.
    """
    tokenize_input = gpt2_tokenizer.encode(sentence)
    # leading list adds the batch dimension
    tensor_input = torch.tensor([tokenize_input])
    # Inference only: skip building the autograd graph, as get_BERT_score does.
    with torch.no_grad():
        loss = gpt2_model(tensor_input, labels=tensor_input)[0]
    return np.exp(loss.detach().numpy())

In [None]:
embedding = 'BERT' # swap out for GPT2

# Pick the scoring function once, and fail immediately with a clear error for
# an unknown name instead of stopping mid-loop with partially filled lists.
if embedding == 'BERT':
    _score_sentence = get_BERT_score
elif embedding == 'GPT2':
    _score_sentence = get_GPT_score
else:
    raise ValueError(f"incorrect embedding type: {embedding!r} (expected 'BERT' or 'GPT2')")

avg_sentence_probabilities = []
min_sentence_probabilities = []
max_sentence_probabilities = []
stdv_sentence_probabilities = []
firstquartile_sentence_probabilities = []
median_sentence_probabilities = []
thirdquartile_sentence_probabilities = []

for utterance in DATA.TRANSCRIPT:

    # NOTE(review): the text is lower-cased before scoring, although the
    # masked-LM loaded in this notebook is 'bert-large-cased' — confirm that
    # this is intended.
    sentences = sent_tokenize(utterance.lower())

    # Score only sentences with more than two whitespace-separated words.
    # The values are exp(loss) from the scorer, one per sentence.
    sentence_probs = [
        _score_sentence(text) for text in sentences if len(text.split()) > 2
    ]

    if len(sentence_probs) > 0:
        scores = np.array(sentence_probs)
        firstquartile_sentence_probabilities.append(np.percentile(sentence_probs, 25))
        median_sentence_probabilities.append(np.percentile(sentence_probs, 50))
        thirdquartile_sentence_probabilities.append(np.percentile(sentence_probs, 75))
        avg_sentence_probabilities.append(scores.mean())
        min_sentence_probabilities.append(min(sentence_probs))
        max_sentence_probabilities.append(max(sentence_probs))
        stdv_sentence_probabilities.append(scores.std())
    else:
        # there were no sentences > 2 words: every summary value is missing
        firstquartile_sentence_probabilities.append(np.nan)
        median_sentence_probabilities.append(np.nan)
        thirdquartile_sentence_probabilities.append(np.nan)
        avg_sentence_probabilities.append(np.nan)
        min_sentence_probabilities.append(np.nan)
        max_sentence_probabilities.append(np.nan)
        stdv_sentence_probabilities.append(np.nan)

DATA[f'mean_sentence_probability_{embedding}'] = avg_sentence_probabilities
DATA[f'min_sentence_probability_{embedding}'] = min_sentence_probabilities
DATA[f'max_sentence_probability_{embedding}'] = max_sentence_probabilities
DATA[f'stdv_sentence_probability_{embedding}'] = stdv_sentence_probabilities
DATA[f'firstquartile_sentence_probability_{embedding}'] = firstquartile_sentence_probabilities
DATA[f'median_sentence_probability_{embedding}'] = median_sentence_probabilities
DATA[f'thirdquartile_sentence_probability_{embedding}'] = thirdquartile_sentence_probabilities


In [None]:
# Write the transcripts together with every feature column computed so far
# to an Excel workbook in the current working directory.
DATA.to_excel('Free_Speech_Features_Final.xlsx')

# Computing the f statistics for individual features

In [None]:
# One-way ANOVA (F statistic, p value) of every feature column across the
# diagnostic groups, printed as: overall, 1vs2, 1vs3, 2vs3, 1vs23
# where 1 = 'HC', 2 = 'aMCI', 3 = 'AD'.

# Columns that are identifiers/labels rather than features.
_NON_FEATURE_COLUMNS = {'FILE', 'TRANSCRIPT', 'GROUP', 'participant_parse_depth_per_sentence'}

# The group subsets do not depend on the column, so build them once.
ones = DATA[DATA.GROUP=='HC']
twos = DATA[DATA.GROUP=='aMCI']
threes = DATA[DATA.GROUP=='AD']
twosthrees = DATA[(DATA.GROUP=='aMCI')|(DATA.GROUP=='AD')]

for column in DATA.columns:

    if column in _NON_FEATURE_COLUMNS:
        continue

    # An F statistic is only defined for numeric data; columns that came from
    # the workbook are stored as strings and are skipped.
    if not pd.api.types.is_numeric_dtype(DATA[column]):
        continue

    group1 = ones[column].dropna()
    group2 = twos[column].dropna()
    group3 = threes[column].dropna()
    group23 = twosthrees[column].dropna()

    f1, p1 = stats.f_oneway(group1, group2, group3)
    f2, p2 = stats.f_oneway(group1, group2)
    f3, p3 = stats.f_oneway(group1, group3)
    f4, p4 = stats.f_oneway(group2, group3)
    f5, p5 = stats.f_oneway(group1, group23)

    print(column, 'overall', (f1,p1), '1vs2', (f2,p2),'1vs3', (f3,p3),'2vs3', (f4,p4), '1vs23', (f5,p5))
        