# `survey.ipynb`

Convert [SmartCat](https://smartcat.com/) `*.txt` files w/ two columns to `*-translated.csv` files w/ three columns as follows:
- Read `*-EN.csv` three-column file (last column blank) where the first column heading is `'PhraseID'`. (Remove leading `'\ufeff'` prefix character from `'PhraseID'` header.)
- Read `*.txt` tab-delimited [UTF-16](https://en.wikipedia.org/wiki/UTF-16) binary file(s) downloaded from [https://smartcat.com/](https://smartcat.com/) as follows:
  - Map `'PhraseID'` (equivalent) column value as the key and `'EN'` (equivalent) column value as the value. Column headings vary with each language.
  - Assume all `'PhraseID'` (equivalent) values exist in both files.
  - Replace all `'\u00a0'` characters (HTML entity `'&nbsp;'`) w/ `' '`.
- Combine `'PhraseID'`, `'EN'`, and [strip](https://docs.python.org/3.4/library/stdtypes.html#str.strip)ped translation columns matching `'PhraseID'` key or translation.
  - Keys *match* if the English key itself, or a translation of the English key, is in the translated file.
  - Leave blank any translations w/o text.
- Write out the new .CSV file.

## Assumptions and hardwired values

This [notebook](https://jupyter.org/) was set up for specific English, Portuguese, and Spanish data files. Many of the values have been hardwired into the code and must be updated if anything changes with the survey and its translations or if other translations are to be added.

- The `*-EN.csv` English source file and all the `*-translated.csv` result files default to [UTF-8](https://en.wikipedia.org/wiki/UTF-8) encoding in the code.
- The `*.txt` translated source files default to [UTF-16](https://en.wikipedia.org/wiki/UTF-16) encoding in the code.
- The `directory` and pathnames for `path_EN`, `path_PT`, and `path_ES` are in the code.
- The translated language column headings `'PT-BR'` and `'ES-ES'` are in the code.
- The `'PhraseID'` substitution values (and the equivalent headers) are in the code:
```python
    replacements = (
        ('Resposta', 'Answer', ),                                               # Português
        ('opción', 'Choice', ), ('Elección', 'Choice', ), ('Nivel', 'Level', ), # Español
    )
```
- The Boolean variables `check_for_extra_spaces` and `check_for_missing_keys` for debugging are in the code and initialized to `False`.

In [10]:
#!/usr/bin/env python3
#
# convert.py
#
# Convert SmartCat *.txt files w/ two columns to *-translated.csv files w/ three columns as follows:
# - Read *-EN.csv three-column file (last column blank) where the first column heading is 'PhraseID'.
#   (Remove leading '\ufeff' prefix character from 'PhraseID' header.)
# - Read *.txt tab-delimited UTF-16 binary file(s) downloaded from https://smartcat.com/ as follows:
#   - Map 'PhraseID' (equivalent) column value as the key and 'EN' (equivalent) column value as the
#     value. Column headings vary with each language.
#   - Assume all 'PhraseID' (equivalent) values exist in both files.
#   - Replace all '\u00a0' characters (HTML entity '&nbsp;') w/ ' '.
# - Combine 'PhraseID', 'EN', and stripped translation columns matching 'PhraseID' key or translation.
#   - Keys 'match' if the English key, or a translation of the English key, is in the translated file.
#   - Leave blank any translations w/o text.
# - Write out the new .CSV file.
#

import csv, os.path

# All input files live in one directory; each full path is that directory joined
# with the file name (English source, then Portuguese and Spanish translations).
directory = '.'
path_EN, path_PT, path_ES = (
    os.path.join(directory, filename)
    for filename in (
        'Cambridge PD Conjoint-EN.csv',
        'Cambridge PD Conjoint-EN-PT(pt-BR).txt',
        'Cambridge PD Conjoint-EN-ES-ES(es-US).txt',
    )
)

# Read all three inputs into memory so the files are closed before any processing.
# - encoding='utf-8-sig' decodes the English file as UTF-8 regardless of the platform
#   default and drops a leading '\ufeff' byte-order mark if one is present.
# - newline='' leaves line-ending handling to the csv module, as its documentation
#   requires for files passed to csv readers.
with open(path_EN, 'r', encoding='utf-8-sig', newline='') as f1, \
     open(path_PT, 'r', encoding='utf16', newline='') as f2, \
     open(path_ES, 'r', encoding='utf16', newline='') as f3:
    reader_EN = csv.reader(f1)
    reader_PT = csv.DictReader(f2, delimiter='\t')
    reader_ES = csv.DictReader(f3, delimiter='\t')
    # English rows are kept exactly as read (cells are *not* stripped).
    english = list(reader_EN)
    # Map each phrase ID to its translation. The column headings differ per language.
    # - DictReader fills cells missing from short rows with None; treat those as ''.
    # - Replace every '\u00a0' (no-break space, HTML '&nbsp;') with a plain space.
    portuguese = { row['ID da frase']: (row['IN'] or '').replace('\u00a0', ' ')
                   for row in reader_PT }
    spanish = { row['PhraseID']: (row['EN'] or '').replace('\u00a0', ' ')
                for row in reader_ES }

# Debugging aid: list every cell / translation that has leading or trailing whitespace,
# then show two specific Spanish entries wrapped in quotes.
check_for_extra_spaces = False
if check_for_extra_spaces:
    for label, texts in (
        ('EN', ( cell for row in english for cell in row )),
        ('PT', portuguese.values()),
        ('ES', spanish.values()),
    ):
        print(label, [ f"'{text}'" for text in texts if text != text.strip() ], '\n')
    for phrase_id in ('CBConjoint_Feature6_Level1', 'QID75_Choice7'):
        print(f"'{spanish.get(phrase_id, '?')}'")

# Debugging aid: list translated-file keys that are not first-column values of the
# English file.
check_for_missing_keys = False
if check_for_missing_keys:
    keys = { row[0] for row in english }
    for label, translations in (('PT', portuguese), ('ES', spanish)):
        print(label, [ key for key in translations if key not in keys ])

def value(translations, key):
    """Return the stripped translation for an English phrase ID, or '' if there is none.

    The lookup tries `key` itself first, then `key` with each English term swapped
    for its translated equivalent (e.g. 'Answer' -> 'Resposta'), in the order listed
    in `replacements`. The first candidate found in `translations` wins.

    Args:
        translations: dict mapping phrase IDs (possibly with translated terms in the
            ID) to translated text. A value of None is treated as an empty string.
        key: the English phrase ID to look up.

    Returns:
        The matching translation with surrounding whitespace stripped, or '' when no
        candidate key is present (a "No key matches" message is printed in that case).
    """
    replacements = (
        ('Resposta', 'Answer', ),                                               # Português
        ('opción', 'Choice', ), ('Elección', 'Choice', ), ('Nivel', 'Level', ), # Español
    )
    # The untranslated key takes priority over every translated variant.
    candidates = [ key ] + [ key.replace(eng, trans) for trans, eng in replacements ]
    for candidate in candidates:
        if candidate in translations:
            # csv.DictReader stores None for cells missing from short rows; leave
            # those translations blank rather than failing on None.strip().
            return (translations[candidate] or '').strip()
    print(f"No key matches '{key}'")
    return ''

# Combine English / Portuguese. The header row read from the English file is skipped
# (english[1:]) and replaced with a fresh one, which also discards any leading '\ufeff'
# attached to its 'PhraseID' heading.
language = 'PT-BR'
combined_EN_PT = [ [ 'PhraseID', 'EN', language, ] ] + \
    [ [ row[0], row[1], value(portuguese, row[0]), ] for row in english[1: ] ]

# Write out translated .CSV next to the Portuguese input file.
# - encoding='utf-8' makes the output UTF-8 regardless of the platform default.
# - newline='' stops the text layer from translating the csv writer's '\r\n' line
#   terminator a second time (which yields '\r\r\n' on Windows).
root, ext = os.path.splitext(path_PT)
path_out = f"{root}-translated.csv"
print(f"Writing {path_out}...")
with open(path_out, 'w', encoding='utf-8', newline='') as fout:
    writer = csv.writer(fout, quoting=csv.QUOTE_ALL)
    writer.writerows(combined_EN_PT)

# Combine English / Español. The header row read from the English file is skipped
# (english[1:]) and replaced with a fresh one, which also discards any leading '\ufeff'
# attached to its 'PhraseID' heading.
language = 'ES-ES'
combined_EN_ES = [ [ 'PhraseID', 'EN', language, ] ] + \
    [ [ row[0], row[1], value(spanish, row[0]), ] for row in english[1: ] ]

# Write out translated .CSV next to the Spanish input file.
# - encoding='utf-8' makes the output UTF-8 regardless of the platform default.
# - newline='' stops the text layer from translating the csv writer's '\r\n' line
#   terminator a second time (which yields '\r\r\n' on Windows).
root, ext = os.path.splitext(path_ES)
path_out = f"{root}-translated.csv"
print(f"Writing {path_out}...")
with open(path_out, 'w', encoding='utf-8', newline='') as fout:
    writer = csv.writer(fout, quoting=csv.QUOTE_ALL)
    writer.writerows(combined_EN_ES)

Writing ./Cambridge PD Conjoint-EN-PT(pt-BR)-translated.csv...
Writing ./Cambridge PD Conjoint-EN-ES-ES(es-US)-translated.csv...
