**Step 1**  

1. Import ``align`` and its dependencies.  
1. Import the ``tagfixer`` module from wherever you have saved it.

In [1]:
### Import dependencies
import align, os
import pandas as pd
import warnings
import nltk
# Fetch the NLTK data packages (presumably required by align/tagfixer for
# tokenising, tagging and lemmatising -- confirm against those packages).
# Packages that are already present locally are reported as up-to-date.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

### Import tagfixer module
import sys
# Edit this path so that it points at the folder holding your copy of tagfixer.
tagfixerpath = '/Users/ethan/Documents/GitHub/align-tag-fixer/src/'
# The folder must be on sys.path *before* the import below, otherwise
# `import tagfixer` cannot locate the module.
sys.path.append(tagfixerpath)
import tagfixer


[nltk_data] Downloading package punkt to /Users/ethan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ethan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/ethan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/ethan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


**Step 2**

Set up folder structure  


In [2]:
# --- Settings to edit for your own project ---------------------------------
project_name = 'my-project'
path_to_google_news_vectors = '/Users/ethan/Documents/ALIGN/archive/GoogleNews-vectors-negative300.bin'

# The project folder is created inside the directory the notebook runs from.
BASE_PATH = os.getcwd()
PROJECT_FOLDER = os.path.join(BASE_PATH, project_name)

# Sub-folders of the project, one per stage of the workflow.
TRANSCRIPTS = os.path.join(PROJECT_FOLDER, 'input/')
PREPPED_TRANSCRIPTS = os.path.join(PROJECT_FOLDER, 'prepped/')
ANALYSIS_READY = os.path.join(PROJECT_FOLDER, 'analysis/')
SURROGATE_TRANSCRIPTS = os.path.join(PROJECT_FOLDER, 'surrogate/')
OPTIONAL_PATHS = os.path.join(PROJECT_FOLDER, 'optional_directories/')

# Create every folder that does not exist yet. `exist_ok=True` makes the cell
# safe to re-run and avoids the check-then-create race of a separate
# `os.path.exists` test. If one of these paths exists but is a regular file,
# this now fails immediately instead of being silently skipped.
for _folder in (PROJECT_FOLDER,
                TRANSCRIPTS,
                PREPPED_TRANSCRIPTS,
                ANALYSIS_READY,
                SURROGATE_TRANSCRIPTS,
                OPTIONAL_PATHS):
    os.makedirs(_folder, exist_ok=True)

# NOTE: when `path_to_google_news_vectors` is an absolute path (as above),
# os.path.join discards OPTIONAL_PATHS and returns the absolute path as is;
# only a relative path is resolved inside OPTIONAL_PATHS.
PRETRAINED_INPUT_FILE = os.path.join(OPTIONAL_PATHS,
                                     path_to_google_news_vectors)
if not os.path.exists(PRETRAINED_INPUT_FILE):
    warnings.warn('Google News vector not found at the specified '
                  'location. Please update the file path with '
                  'the .bin file that can be accessed here: '
                  'https://code.google.com/archive/p/word2vec/ '
                  '- Alternatively, comment out the `PRETRAINED_INPUT_FILE` information')

**Step 3**

Run ``tagfixer.chat2penn``. Arguments are:

1. ``cut_file_folder``: a string with the path to a folder that contains all the .cut files with custom lexemes
2. ``output_directory``: a string with the path to a folder where you want the function to place a .csv file with the custom lexemes, the original chat POS tags, and the corresponding Penn POS tags
3. ``lookup_table``: a string with the path to a csv file which contains the translation from chat to penn tag formats. Must contain two columns: "chat_POS" and "penn_POS"

In [3]:
# Folder containing the .cut files with custom lexemes.
pathin = '/Users/ethan/Documents/GitHub/align-tag-fixer/samples/my-project/cut_files/'
# Folder that receives the generated csv (here the same folder as the input).
pathout = '/Users/ethan/Documents/GitHub/align-tag-fixer/samples/my-project/cut_files/'
# csv file translating chat POS tags to Penn POS tags.
lookup_table = '/Users/ethan/Documents/GitHub/align-tag-fixer/res/chat_POS_lookup-table.csv'

tagfixer.chat2penn(
    cut_file_folder=pathin,
    output_directory=pathout,
    lookup_table=lookup_table,
)

**Step 4**

Instead of the ALIGN ``prepare_transcripts`` function, use the tagfixer ``prepare_transcripts`` version.

- make sure to set ``run_spell_check`` to ``False``
- set ``custom_dictionary`` to ``True``. If it is set to ``False``, ``tagfixer.prepare_transcripts`` should work just like ``align.prepare_transcripts``
- set ``path_to_custom_dictionary`` to the path to the csv file created by ``tagfixer.chat2penn``

In [4]:


# csv file of custom lexemes created by `tagfixer.chat2penn`.
custom_dictionary = '/Users/ethan/Documents/GitHub/align-tag-fixer/samples/my-project/cut_files/_custom_chat_tags.csv'

model_store = tagfixer.prepare_transcripts(
    input_files=TRANSCRIPTS,
    input_as_directory=True,
    output_file_directory=PREPPED_TRANSCRIPTS,
    run_spell_check=False,
    minwords=2,
    use_filler_list=None,
    filler_regex_and_list=False,
    training_dictionary=None,
    add_stanford_tags=False,
    # To run the Stanford POS tagger, set `add_stanford_tags` to `True`
    # and uncomment the next two lines:
    # stanford_pos_path=STANFORD_POS_PATH,
    # stanford_language_path=STANFORD_LANGUAGE,
    save_concatenated_dataframe=True,
    # Switch on the custom-tag handling and point it at the csv above.
    custom_dictionary=True,
    path_to_custom_dictionary=custom_dictionary,
)
                        

Processing: /Users/ethan/Documents/GitHub/align-tag-fixer/samples/my-project/input/Dyad3-time1-condTD.txt
Processing: /Users/ethan/Documents/GitHub/align-tag-fixer/samples/my-project/input/Dyad2-time1-condTD.txt
Processing: /Users/ethan/Documents/GitHub/align-tag-fixer/samples/my-project/input/Dyad1-time1-condTD.txt
Processing: /Users/ethan/Documents/GitHub/align-tag-fixer/samples/my-project/input/Dyad4-time1-condTD.txt


**Step 5**

You can check to see what custom words were identified in the transcripts, and in which files. With a bit more work, this could be modified to show what Penn tag they have been assigned, but at least with this information you can search for some of the words in the prepped transcripts, and check that everything seems correct.  

- ``input_folder`` is a string with the path to the prepped transcript files
- ``path_to_custom_dictionary`` is a string with the path to the csv file created by ``tagfixer.chat2penn``

In [5]:
# Search the folder that `tagfixer.prepare_transcripts` wrote the prepped
# transcripts to, instead of a hard-coded copy of that path which goes stale
# as soon as `project_name` or the working directory changes.
input_folder = PREPPED_TRANSCRIPTS
# csv file of custom lexemes created by `tagfixer.chat2penn`.
custom_dictionary = '/Users/ethan/Documents/GitHub/align-tag-fixer/samples/my-project/cut_files/_custom_chat_tags.csv'

cus_words = tagfixer.find_custom_words(input_folder, path_to_custom_dictionary=custom_dictionary)
# Save the result to the current working directory for inspection.
cus_words.to_csv('cus_words.csv')

**Steps 6 and 7**

From here on, you're back to the original ALIGN workflow!

In [6]:
# set standards to be used for real and surrogate
# (these constants are passed to both `align.calculate_alignment` and
# `align.calculate_baseline_alignment` in the following cells)
INPUT_FILES = PREPPED_TRANSCRIPTS
MAXNGRAM = 2
USE_PRETRAINED_VECTORS = True
# NOTE(review): presumably the concatenated dataframe saved by the
# prepare_transcripts step (`save_concatenated_dataframe=True`) -- confirm
# that it is written to the project folder under this name.
SEMANTIC_MODEL_INPUT_FILE = os.path.join(PROJECT_FOLDER,
                                         'align_concatenated_dataframe.txt')
# NOTE(review): not referenced by the calls shown in this notebook, which
# pass PRETRAINED_INPUT_FILE directly.
PRETRAINED_FILE_DIRECTORY = PRETRAINED_INPUT_FILE
ADD_STANFORD_TAGS = False
IGNORE_DUPLICATES = True
HIGH_SD_CUTOFF = 3
LOW_N_CUTOFF = 1

In [7]:
# Alignment for the real transcripts, using the shared settings defined above.
turn_real, convo_real = align.calculate_alignment(
    input_files=INPUT_FILES,
    maxngram=MAXNGRAM,
    use_pretrained_vectors=USE_PRETRAINED_VECTORS,
    pretrained_input_file=PRETRAINED_INPUT_FILE,
    semantic_model_input_file=SEMANTIC_MODEL_INPUT_FILE,
    output_file_directory=ANALYSIS_READY,
    add_stanford_tags=ADD_STANFORD_TAGS,
    ignore_duplicates=IGNORE_DUPLICATES,
    high_sd_cutoff=HIGH_SD_CUTOFF,
    low_n_cutoff=LOW_N_CUTOFF,
)


Processing: /Users/ethan/Documents/GitHub/align-tag-fixer/samples/my-project/prepped/Dyad3-time1-condTD.txt
Processing: /Users/ethan/Documents/GitHub/align-tag-fixer/samples/my-project/prepped/Dyad2-time1-condTD.txt
Processing: /Users/ethan/Documents/GitHub/align-tag-fixer/samples/my-project/prepped/Dyad1-time1-condTD.txt
Processing: /Users/ethan/Documents/GitHub/align-tag-fixer/samples/my-project/prepped/Dyad4-time1-condTD.txt


In [8]:
# Baseline alignment on surrogate transcripts, using the same shared settings
# as the real-transcript run plus the surrogate-specific options at the end.
turn_surrogate, convo_surrogate = align.calculate_baseline_alignment(
    input_files=INPUT_FILES,
    maxngram=MAXNGRAM,
    use_pretrained_vectors=USE_PRETRAINED_VECTORS,
    pretrained_input_file=PRETRAINED_INPUT_FILE,
    semantic_model_input_file=SEMANTIC_MODEL_INPUT_FILE,
    output_file_directory=ANALYSIS_READY,
    add_stanford_tags=ADD_STANFORD_TAGS,
    ignore_duplicates=IGNORE_DUPLICATES,
    high_sd_cutoff=HIGH_SD_CUTOFF,
    low_n_cutoff=LOW_N_CUTOFF,
    surrogate_file_directory=SURROGATE_TRANSCRIPTS,
    all_surrogates=False,
    keep_original_turn_order=True,
    #id_separator='',
    dyad_label='time',
    condition_label='cond',
)

Processing: /Users/ethan/Documents/GitHub/align-tag-fixer/samples/my-project/surrogate/surrogate_run-1688125107.5451908/SurrogatePair-time1A-time1B-condTD.txt
