In [None]:
import os
from collections import defaultdict
import numpy as np
import textract 
import pandas as pd

You will probably need to install the `textract` package (for example with `pip install textract`) in order to extract text from the Word documents.

# Cleaning up strings

In [None]:
PUNCTUATION = ['.', '!', ',', ';', '~', "’", "'", '\n', '\t']


def remove_punc(text):
    """Replace every character listed in PUNCTUATION with a single space.

    Args:
        text: the string to clean.

    Returns:
        A new string in which each PUNCTUATION character has been swapped
        for one space; all other characters are left untouched.
    """
    # Every PUNCTUATION entry is a single character, so a per-character
    # substitution gives the same result as successive str.replace calls.
    return "".join(" " if ch in PUNCTUATION else ch for ch in text)

In [None]:
def remove_whitespace(text):
    """Collapse each run of whitespace in `text` to one space and trim both ends.

    Args:
        text: the string to normalise.

    Returns:
        The whitespace-separated tokens of `text` joined by single spaces.
    """
    tokens = text.split()
    return ' '.join(tokens)

In [None]:
def clean_text(text):
    """Decode UTF-8 bytes and normalise the resulting string.

    Args:
        text: a bytes object containing UTF-8 encoded text.

    Returns:
        The decoded string after it has been passed through remove_punc
        and then remove_whitespace.
    """
    decoded = text.decode("utf-8")
    return remove_whitespace(remove_punc(decoded))

# VA

Change the paths in the code below to reflect your own directory organization. Mine is transcripts -> VA -> 1_Provider, 2_Providers.

We use a `defaultdict` so that every new patient record starts with all of its fields set to NaN. For VA, `Convo_2` and `Doctor_2` are never filled in and therefore stay NaN, which is what we want because patients in VA only have one conversation, with a urologist.

In [None]:
# One record per patient, keyed by the first four characters of the transcript
# filename. Every field starts as NaN; Convo_2 and Doctor_2 are never assigned
# in this cell, so they stay NaN for all VA patients.
patient_info_va = defaultdict(lambda: {'Convo_1': np.nan, 'Convo_2': np.nan, 'Dataset': np.nan, 'Doctor_1': np.nan, 'Doctor_2': np.nan})
paths = ['transcripts/VA/1_Provider','transcripts/VA/2_Providers']

for path in paths: # loop through the various VA directories
    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing order (and therefore the row order) reproducible.
    for filename in sorted(os.listdir(path)):
        # Compare the real extension (dot included, case-insensitive) so that
        # only Word documents are read and files such as .DS_Store are skipped.
        extension = os.path.splitext(filename)[1].lower()
        if extension not in ('.docx', '.doc'):
            continue

        full_path = os.path.join(path, filename)
        txt = textract.process(full_path)
        new_txt = clean_text(txt)

        patient_id = filename[:4] # the first four characters are used as the patient id
        record = patient_info_va[patient_id]
        record['Convo_1'] = new_txt
        record['Dataset'] = 'VA'
        record['Doctor_1'] = 'U'

In [None]:
# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Turn the patient dictionary into a DataFrame with one row per patient, then
# move the patient id out of the index into an ordinary 'patient_id' column
df_va = (
    pd.DataFrame.from_dict(patient_info_va, orient='index')
    .rename_axis('patient_id')
    .reset_index()
)
df_va

# DVD

Change the paths in the code below to reflect your own directory organization. Mine is transcripts -> DVD -> 1_Provider, 2_Providers, 3_Providers.

We use a `defaultdict` because it initializes every field of a new patient record to NaN, which is helpful when a patient only saw a single doctor: the fields for the second conversation simply stay NaN.

In [None]:
# One record per patient, keyed by the first four characters of the transcript
# filename. Every field starts as NaN and is filled in as transcripts are read.
patient_info_dvd = defaultdict(lambda: {'Convo_1': np.nan, 'Convo_2': np.nan, 'Dataset': np.nan, 'Doctor_1': np.nan, 'Doctor_2': np.nan})
paths = ['transcripts/DVD/1_Provider','transcripts/DVD/2_Providers', 'transcripts/DVD/3_Providers']
counter = 0 # counts every directory entry seen, including the ones that are skipped
for path in paths:
    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing order (and therefore the row order) reproducible.
    for filename in sorted(os.listdir(path)):
        counter += 1

        # Compare the real extension (dot included, case-insensitive) so that
        # only Word documents are read and files such as .DS_Store are skipped.
        extension = os.path.splitext(filename)[1].lower()
        if extension not in ('.docx', '.doc'):
            continue

        full_path = os.path.join(path, filename)
        txt = textract.process(full_path)
        new_txt = clean_text(txt)

        patient_id = filename[:4] # the first four characters are used as the patient id
        record = patient_info_dvd[patient_id]
        record['Dataset'] = 'DVD'
        doctor = filename[5] # indicates whether patient saw radiologist or urologist

        # Categorize as first or second conversation based on the filename.
        # NOTE(review): any marker other than '1' is stored as the second
        # conversation, so a later file for the same patient overwrites
        # Convo_2/Doctor_2 - confirm this is intended for the 3_Providers files.
        if filename[7] == '1':
            record['Convo_1'] = new_txt
            record['Doctor_1'] = doctor
        else:
            record['Convo_2'] = new_txt
            record['Doctor_2'] = doctor

In [None]:
# Turn the patient dictionary into a DataFrame with one row per patient, then
# move the patient id out of the index into an ordinary 'patient_id' column
df_dvd = (
    pd.DataFrame.from_dict(patient_info_dvd, orient='index')
    .rename_axis('patient_id')
    .reset_index()
)
df_dvd

# Concatenate the two transcript datasets


The dataset has the following columns: 'patient_id', 'Convo_1', 'Convo_2', 'Dataset', 'Doctor_1', and 'Doctor_2'. 'patient_id' is taken from the first four characters of the transcript filename. 'Convo_1' is the transcript of the patient's first conversation: since patients in VA only have a single conversation, their transcript is always stored in 'Convo_1', whereas for DVD the filename determines whether a transcript goes into 'Convo_1' or 'Convo_2'. 'Convo_2' is the transcript of the patient's second conversation, and is NaN if there is none. 'Dataset' is either DVD or VA. 'Doctor_1' indicates which doctor the patient saw first (U or R); since VA patients only see urologists, it is always U for them. Similarly, 'Doctor_2' indicates which doctor the patient saw second, and is NaN for VA patients.

In [None]:
# Stack the VA rows on top of the DVD rows. Both inputs carry their own
# 0-based index, so ignore_index=True is used to give the combined frame a
# fresh 0..n-1 index instead of duplicated row labels.
df_combined = pd.concat([df_va, df_dvd], axis=0, ignore_index=True)

In [None]:
# Preview the first five rows of the combined dataset
df_combined.head(n=5)

# Generate a CSV

In [None]:
# Write the combined dataset to a CSV file (relative path)
df_combined.to_csv(path_or_buf='all_transcripts.csv')