### STEP 1: PREPROCESSING -> Get and read data from DementiaBank .CHA files

In [142]:
# python lib to read CHA files from DementiaBank
import pylangacq as pla
import os
import warnings
import nltk
from nltk import word_tokenize

# Suppress all warnings
warnings.filterwarnings("ignore", category=FutureWarning)

#downloaded CHA files from PittCorpus, Baycrest, Kempler, Deleware, Holland, Lu, WLS

cha_files_main_dir = r"C:\Users\Gianna\OneDrive\Desktop\CHA Files"
dementia_cha_path = r"C:\Users\Gianna\OneDrive\Desktop\CHA Files\Dementia"
mci_cha_path = r"C:\Users\Gianna\OneDrive\Desktop\CHA Files\MCI"
control_cha_path = r"C:\Users\Gianna\OneDrive\Desktop\CHA Files\Control"
uncategorized_cha = []
uncategorized_cha = [os.path.join(cha_files_main_dir, file) for file in os.listdir(cha_files_main_dir) if os.path.isfile(os.path.join(cha_files_main_dir, file))]

# dementia patients data
dementia_files = [file for file in os.listdir(dementia_cha_path) if os.path.isfile(os.path.join(dementia_cha_path, file))]
dementia_folders = [folder for folder in os.listdir(dementia_cha_path) if os.path.isdir(os.path.join(dementia_cha_path, folder))]
dementia_files_all = [os.path.join(dementia_cha_path, folder, file) for folder in dementia_folders for file in os.listdir(os.path.join(dementia_cha_path, folder))]

# mci data
mci_files = [os.path.join(mci_cha_path, file) for file in os.listdir(mci_cha_path)]
mci_files += uncategorized_cha

#control
control_files = [file for file in os.listdir(control_cha_path) if os.path.isfile(os.path.join(control_cha_path, file))]
control_folders = [folder for folder in os.listdir(control_cha_path) if os.path.isdir(os.path.join(control_cha_path, folder))]
control_files_all = [os.path.join(control_cha_path, folder, file) for folder in control_folders for file in os.listdir(os.path.join(control_cha_path, folder))]


In [2]:
# read CHAT files for all categories
AD_chats = pla.Reader.from_files(dementia_files_all)
MCI_chats = pla.Reader.from_files(mci_files)
Control_chats = pla.Reader.from_files(control_files_all)

In [3]:
# function to read chats, use header to filter the participant and condition of participant 
AD_headers = pla.Reader.headers(AD_chats)
MCI_headers = pla.Reader.headers(MCI_chats)
control_headers = pla.Reader.headers(Control_chats)

# create lists of participant info (dictionaries) for each group
AD_participants = []
MCI_participants = []
Control_participants = []

# all chats in correct category for each group
AD_par_chats = []
MCI_par_chats = []
Control_par_chats = []


In [None]:
# function to ensure that all chats are in the correct catgories
# read chats, use file header to filter the participant and condition of participant 
def categorize_header(headers, chats):
    # fix categorization of patients based on headers
    for num_par, header in enumerate(headers):
        if 'Participants' in header.keys():
            patient_dict = header['Participants']['PAR']
            patient_group = patient_dict['group'].lower()

            if 'ad' in patient_group or "alzheimer's" in patient_group:
                AD_participants.append(patient_dict)
                AD_par_chats.append(chats[num_par])

            elif 'mci' in patient_group:
                MCI_participants.append(patient_dict)
                MCI_par_chats.append(chats[num_par])

            elif 'control' in patient_group or patient_dict['corpus'] == 'WLS':
                Control_participants.append(patient_dict)
                Control_par_chats.append(chats[num_par])


categorize_header(AD_headers, AD_chats)
categorize_header(MCI_headers, MCI_chats)
categorize_header(control_headers, Control_chats)

In [116]:
# check how many participants/samples in each group
print(f'{len(AD_participants)} AD patients, {len(MCI_participants)} MCI patients, and {len(Control_participants)} Control subjects')

846 AD patients, 213 MCI patients, and 877 Control subjects


##### Get words from chats and perform preprocessing

In [174]:
# all sentences from all AD participants
AD_words = [AD_chat.words(participants='PAR', by_files=True) for AD_chat in AD_par_chats]

# all MCI files from participants 
MCI_words = [MCI_chat.words(participants='PAR', by_files=True) for MCI_chat in MCI_par_chats]

# all control sentences
control_words = [Control_chat.words(participants='PAR', by_files=True) for Control_chat in Control_par_chats]

AD_words[:2]

[[['mhm',
   '.',
   'alright',
   '.',
   "there's",
   'a',
   'young',
   'boy',
   "that's",
   'getting',
   'a',
   'cookie',
   'jar',
   '.',
   'and',
   "he's",
   'in',
   'bad',
   'shape',
   'because',
   'the',
   'thing',
   'is',
   'falling',
   'over',
   '.',
   'and',
   'in',
   'the',
   'picture',
   'the',
   'mother',
   'is',
   'washing',
   'dishes',
   'and',
   "doesn't",
   'see',
   'it',
   '.',
   'and',
   'so',
   'the',
   'water',
   'is',
   'overflowing',
   'in',
   'the',
   'sink',
   '.',
   'and',
   'the',
   'dishes',
   'might',
   'fall',
   'over',
   'there',
   'if',
   'you',
   "don't",
   'get',
   'it',
   '.',
   'and',
   "it's",
   'a',
   'picture',
   'of',
   'a',
   'kitchen',
   'window',
   '.',
   'and',
   'the',
   'curtains',
   'are',
   'very',
   'distinct',
   '.',
   'but',
   'the',
   'water',
   'is',
   'still',
   'flowing',
   '.']]]

In [175]:
# remove extra quotation marks around words 
AD_words = [' '.join(sentence).replace('"', '').replace("'", '').split(' ') for par_chat in AD_words for sentence in par_chat]
MCI_words = [' '.join(sentence).replace('"', '').replace("'", '').split(' ') for par_chat in MCI_words for sentence in par_chat]
control_words = [' '.join(sentence).replace('"', '').replace("'", '').split(' ') for par_chat in control_words for sentence in par_chat]
AD_words[0]

['mhm',
 '.',
 'alright',
 '.',
 'theres',
 'a',
 'young',
 'boy',
 'thats',
 'getting',
 'a',
 'cookie',
 'jar',
 '.',
 'and',
 'hes',
 'in',
 'bad',
 'shape',
 'because',
 'the',
 'thing',
 'is',
 'falling',
 'over',
 '.',
 'and',
 'in',
 'the',
 'picture',
 'the',
 'mother',
 'is',
 'washing',
 'dishes',
 'and',
 'doesnt',
 'see',
 'it',
 '.',
 'and',
 'so',
 'the',
 'water',
 'is',
 'overflowing',
 'in',
 'the',
 'sink',
 '.',
 'and',
 'the',
 'dishes',
 'might',
 'fall',
 'over',
 'there',
 'if',
 'you',
 'dont',
 'get',
 'it',
 '.',
 'and',
 'its',
 'a',
 'picture',
 'of',
 'a',
 'kitchen',
 'window',
 '.',
 'and',
 'the',
 'curtains',
 'are',
 'very',
 'distinct',
 '.',
 'but',
 'the',
 'water',
 'is',
 'still',
 'flowing',
 '.']

### STEP 2: FEATURE TESTING

Here, I test methods that have been used in past research to distinguish AD samples from non AD samples. Particularly, methods tried are found in a paper by Fraser found here: https://www.cs.toronto.edu/~kfraser/Fraser15-JAD.pdf 

In [178]:
import numpy as np
import random
from nltk.corpus import words
import itertools

eng_corpus = set(words.words())

#### 2.1. Comparing POS tags

One of the elements that has been observed by some in the semantics of dementia patients is a higher frequency of verbs, pronouns, and adjectives compared to nouns. In the following function, I compute the mean of the ratios between pronoun : noun, verb : noun, and adjective : noun for all chats. I then found the mean ratios of the means of all chats and compared between groups.  

Prediction: The greater the ratio returned, the more likely a set of samples will be in the AD/MCI group.

In [117]:
def get_pos_ratios(chats):
    # create an array that stores the average ratio of the ratios -> pronoun: noun, verb: noun, adj: noun
    # ratios closer or greater than 1 indicate higher AD likelihood 
    chat_ratios = []
    for chat in chats:
        num_noun = 0
        num_pronoun = 0
        num_verb = 0
        num_adj = 0
        tokens = chat.tokens(participants='PAR', by_utterances=True)
        for token in tokens:
            for utterance in token:
                # get pos from MOR grammar 
                mor_tier = utterance.to_mor_tier().split('|')
                pos = mor_tier[0]
                if 'n' in pos:
                    num_noun += 1
                if 'v' in pos:
                    num_verb += 1
                if 'adj' in pos:
                    num_adj += 1
                if 'pro' in pos:
                    num_pronoun += 1
        if num_noun != 0:
            # add the ratios from this chat to all ratios 
            chat_ratios.append(np.mean([num_pronoun/num_noun, num_verb/num_noun, num_adj/num_noun]))
        else:
            chat_ratios.append(0)
    return chat_ratios

In [121]:
noun_ratio_AD = np.mean(get_pos_ratios(AD_par_chats))
noun_ratio_MCI = np.mean(get_pos_ratios(MCI_par_chats))
noun_ratio_control = np.mean(get_pos_ratios(Control_par_chats))

print(f'Average noun-noun to noun ratios for each group ->\nAD: {noun_ratio_AD}\nMCI: {noun_ratio_MCI}\nControl: {noun_ratio_control}')

Average noun-noun to noun ratios for each group ->
AD: 0.39824405262825024
MCI: 0.33220975210115683
Control: 0.2935963541446591


It seems that the ratio of non-nouns : nouns is in fact higher in AD patients. To confirm, use a smaller amount of 150 random samples. 

In [129]:
ad_ratios = np.mean(get_pos_ratios(random.sample(AD_par_chats, 150)))
mci_ratios = np.mean(get_pos_ratios(random.sample(MCI_par_chats, 150)))
control_ratios = np.mean(get_pos_ratios(random.sample(Control_par_chats, 150)))

print(f'Average noun-noun to noun ratios for each group ->\nAD: {ad_ratios}\nMCI: {mci_ratios}\nControl: {control_ratios}')

Average noun-noun to noun ratios for each group ->
AD: 0.42044305618618194
MCI: 0.3247101179608221
Control: 0.30215984180620725


It seems that this test can be used as an appropriate measure.

Another element that can be used to test lexical diversity is repetetive phrases or words. Here, I use NLTK corpus to find non-words.

#### 2.2. Comparing non-word ratios

Prediction: A higher non-word frequency is correlated with AD.   

In [191]:
#ratio to find number of nonwords in sentences 
def ratio_nonwords(chat_words):
    nonwords = 0
    total_words = 0
    # collapse 3d chats to 1d list 
    chat_words_flattened = itertools.chain.from_iterable(itertools.chain.from_iterable(chat_words))
    for word in chat_words_flattened:
        # check if in nltk corpus
        if word not in eng_corpus:
            nonwords += 1
        total_words += 1
    return (nonwords / total_words)

In [192]:
ad_nonwords = ratio_nonwords(AD_words)
mci_nonwords = ratio_nonwords(MCI_words)
control_nonwords = ratio_nonwords(control_words)

print(f'Ratios of nonwords for each group ->\nAD: {ad_nonwords}\nMCI: {mci_nonwords}\nControl: {control_nonwords}')

Ratios of nonwords for each group ->
AD: 0.07079032942174467
MCI: 0.044237933194383905
Control: 0.03980657587495594


Confirm with 150 random samples

In [199]:
ad_nonwords = ratio_nonwords(random.sample(AD_words, 150))
mci_nonwords = ratio_nonwords(random.sample(MCI_words, 150))
control_nonwords = ratio_nonwords(random.sample(control_words, 150))

print(f'Ratios of nonwords for each group ->\nAD: {ad_nonwords}\nMCI: {mci_nonwords}\nControl: {control_nonwords}')

Ratios of nonwords for each group ->
AD: 0.063695006881913
MCI: 0.0451481999396781
Control: 0.03952134994273962


#### 2.3. Finding repetetives stem of unique words / total
Prediction: AD patients will use less unique words

#### 2.4. Measuring non-specific speech with word biases
Prediction: Those with AD have a bias to reusing words that are shorter in length and higher frequency words, especially verbs

#### 2.5. Measuring vocabulary richness -> honore statistic

#### 2.6. Syntactic Analysis: CFG

#### 2.7 frequency
unigrams and excluded binary unigrams

#### 2.8. Incomplete sentences

#### 2.9. Filler and question word ratios

#### 2.10. Word entropy

2.11. Cosine similarity

Comparing participant ages