In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import nltk 

In [None]:
# Root folder holding one "<id>_P" sub-folder per participant (Windows-style path).
DATA_DIR = r'E:\CS224S_final_Project\data\raw_data'

In [None]:
# The training-split label file sits one directory above DATA_DIR.
train_data = DATA_DIR + "\\..\\train_split_Depression_AVEC2017.csv"
train_df = pd.read_csv(train_data, header=0)

In [None]:
# The dev-split label file sits one directory above DATA_DIR.
dev_data = DATA_DIR + "\\..\\dev_split_Depression_AVEC2017.csv"
dev_df = pd.read_csv(dev_data, header=0)

In [None]:
# Number of depressed and not depressed

def num_for_classes(df):
    """Count the rows of ``df`` in each ``PHQ8_Binary`` class.

    Args:
        df: DataFrame with a ``PHQ8_Binary`` column holding 0 / 1 labels.

    Returns:
        Tuple ``(num_not_depressed, num_depressed)``: the number of rows whose
        label equals 0 and the number whose label equals 1, as plain ints.
    """
    labels = df['PHQ8_Binary']
    # Every row of the frame is a data row (a CSV header is consumed by
    # read_csv and never appears as a row), so no "- 1" correction is applied.
    num_not_depressed = int((labels == 0).sum())
    num_depressed = int((labels == 1).sum())
    return num_not_depressed, num_depressed

In [None]:
# Time length, number of words, and number of unique words spoken by participant

def info_per_session(df):
    """Collect per-participant speech statistics from the session transcripts.

    For each row of ``df`` the participant's tab-separated transcript file under
    ``DATA_DIR`` is read, every row whose ``speaker`` is "Ellie" is dropped, and
    the remaining rows are summarised.

    Args:
        df: DataFrame with a ``Participant_ID`` column; one session per row.

    Returns:
        Four parallel lists with one entry per row of ``df``:
        ``(num_words_per_session, num_unique_words_per_session,
        length_time_participant_per_session, participants)`` where

        * the word count is the number of ``nltk.word_tokenize`` tokens,
        * the unique-word count is the number of distinct tokens,
        * the length is the sum of ``stop_time - start_time`` over the kept rows,
        * the participant label is ``"Participant<id>"``.
    """
    num_words_per_session = []
    num_unique_words_per_session = []
    length_time_participant_per_session = []
    participants = []

    for _, df_row in df.iterrows():
        p_num = int(df_row['Participant_ID'])
        participants.append("Participant" + str(p_num))

        transcript_path = "{}\\{}_P\\{}_TRANSCRIPT.csv".format(DATA_DIR, p_num, p_num)
        transcript_df = pd.read_csv(transcript_path, delimiter='\t', header=0)
        # Keep only the rows that were not spoken by "Ellie".
        transcript_df = transcript_df[transcript_df.speaker != "Ellie"]

        durations = np.asarray(transcript_df['stop_time']) - np.asarray(transcript_df['start_time'])
        length_time_participant = np.sum(durations)

        num_words = 0
        unique_words = set()
        # dropna(): an empty cell is read as NaN (a float), which re.sub cannot process.
        for utterance in transcript_df['value'].dropna():
            # Strip each <...> tag individually. A greedy '<.*>' would also delete
            # the words between the first '<' and the last '>' of the utterance.
            transcript = re.sub('<[^>]*>', '', str(utterance))
            words = nltk.word_tokenize(transcript)
            num_words += len(words)
            unique_words.update(words)

        num_words_per_session.append(num_words)
        num_unique_words_per_session.append(len(unique_words))
        length_time_participant_per_session.append(length_time_participant)
        # No early exit here: every participant in df contributes one entry.

    return num_words_per_session, num_unique_words_per_session, length_time_participant_per_session, participants

In [None]:
# Number of Samples from each session

def num_samples_per_session(df):
    """Return, for every participant in ``df``, the size of their pickled sample set.

    For each row the file ``<Participant_ID>.pkl`` is loaded with
    ``pd.read_pickle`` from the current working directory.

    Args:
        df: DataFrame with a ``Participant_ID`` column; one session per row.

    Returns:
        List with one entry per row of ``df``: ``len(<unpickled object>) - 1``.
    """
    samples_per_session = []
    for _, df_row in df.iterrows():
        p_num = int(df_row['Participant_ID'])
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        # A separate name is used so the ``df`` argument is not rebound mid-loop.
        session_samples = pd.read_pickle(str(p_num) + '.pkl')
        # NOTE(review): the "- 1" is kept from the original; whether the pickled
        # object really carries one non-sample entry cannot be told from here - confirm.
        samples_per_session.append(len(session_samples) - 1)
        # No early exit here: every participant in df contributes one entry.
    return samples_per_session

In [None]:
# Folder that receives every figure saved by the plotting cells (Windows-style path).
IMAGES_DIR = r'E:\CS224S_final_Project\data\images'

In [None]:
# Training Dataset Info
# Gather the class counts and the per-session statistics for the training split.
training_num_not_depressed, training_num_depressed = num_for_classes(train_df)
(
    training_num_words_per_session,
    training_num_unique_words_per_session,
    training_length_time_participant_per_session,
    training_participants,
) = info_per_session(train_df)
training_num_samples_per_session = num_samples_per_session(train_df)

# Figure 1: class balance as a bar chart.
plt.figure(1)
plt.title('Classification in Training Data')
plt.ylabel('Number of Patients')
plt.bar(['Depressed', 'Not Depressed'], [training_num_depressed, training_num_not_depressed])
plt.savefig(IMAGES_DIR + '/classification_training.png')

# Figures 2-5: one box plot per per-session statistic.
# Each entry is (title, y-axis label, values, output file name).
training_boxplot_specs = [
    ('Total Number of Words per Participant in Training Data', 'Total Number of Words',
     training_num_words_per_session, '/num_words_training.png'),
    ('Number of Unique Words per Participant in Training Data', 'Number of Unique Words',
     training_num_unique_words_per_session, '/num_unique_words_training.png'),
    ('Length of Recording per Participant in Training Data', 'Seconds',
     training_length_time_participant_per_session, '/time_length_training.png'),
    ('Number of samples per Participant in Training Data', 'Number of Samples',
     training_num_samples_per_session, '/num_samples_training.png'),
]
for fig_num, (plot_title, plot_ylabel, plot_values, plot_filename) in enumerate(training_boxplot_specs, start=2):
    plt.figure(fig_num, figsize=(5, 5))
    plt.title(plot_title)
    plt.ylabel(plot_ylabel)
    plt.boxplot(plot_values)
    plt.savefig(IMAGES_DIR + plot_filename)

In [None]:
# Dev Dataset Info
# Gather the class counts and the per-session statistics for the dev (validation) split.
dev_num_not_depressed, dev_num_depressed = num_for_classes(dev_df)
(
    dev_num_words_per_session,
    dev_num_unique_words_per_session,
    dev_length_time_participant_per_session,
    dev_participants,
) = info_per_session(dev_df)
dev_num_samples_per_session = num_samples_per_session(dev_df)

# Figure 6: class balance as a bar chart.
plt.figure(6)
plt.title('Classification in Validation Data')
plt.ylabel('Number of Patients')
plt.bar(['Depressed', 'Not Depressed'], [dev_num_depressed, dev_num_not_depressed])
plt.savefig(IMAGES_DIR + '/classification_validation.png')

# Figures 7-10: one box plot per per-session statistic.
# Each entry is (title, y-axis label, values, output file name).
dev_boxplot_specs = [
    ('Total Number of Words per Participant in Validation Data', 'Total Number of Words',
     dev_num_words_per_session, '/num_words_validation.png'),
    ('Number of Unique Words per Participant in Validation Data', 'Number of Unique Words',
     dev_num_unique_words_per_session, '/num_unique_words_validation.png'),
    ('Length of Recording per Participant in Validation Data', 'Seconds',
     dev_length_time_participant_per_session, '/time_length_validation.png'),
    ('Number of samples per Participant in Validation Data', 'Number of Samples',
     dev_num_samples_per_session, '/num_samples_validation.png'),
]
for fig_num, (plot_title, plot_ylabel, plot_values, plot_filename) in enumerate(dev_boxplot_specs, start=7):
    plt.figure(fig_num, figsize=(5, 5))
    plt.title(plot_title)
    plt.ylabel(plot_ylabel)
    plt.boxplot(plot_values)
    plt.savefig(IMAGES_DIR + plot_filename)