# Data loading

In [2]:
import json
import os
import re

In [3]:
# Directory (relative to the notebook) that holds the corpus files.
DATA_PATH = "data"

In [4]:
# Load the raw corpus JSON into `data`.
# The encoding is given explicitly: JSON files are UTF-8, while open() without
# `encoding` falls back to a platform/locale-dependent default, which can
# corrupt or fail on the non-ASCII (German) text in the corpus.
with open(os.path.join(DATA_PATH, 'corpus_data.json'), encoding='utf-8') as json_file:
    data = json.load(json_file)

In [5]:
# Keep only the 'records' payload of the loaded JSON document.
# The isinstance guard makes this cell safe to re-run: once `data` has been
# replaced by the records themselves, indexing it with 'records' again would
# raise, so the unwrapping is only done while `data` is still the top-level dict.
# NOTE(review): assumes the records container is not itself a dict — confirm.
if isinstance(data, dict):
    data = data['records']

We have 62305 words in 6723 records from STT transcripts.

In [6]:
# Entry-level transcripts, one item per record and in record order.
human_transcripts = list(map(lambda record: record['human_transcript'], data))
stt_transcripts = list(map(lambda record: record['stt_transcript'], data))

In [17]:
# Build three parallel, per-entry lists from the word-level alignment:
#   human_words[i][j]  - human-transcribed word/phrase with "@<char>" annotations removed
#   stt_words[i][j]    - the STT word/phrase aligned with it
#   word_classes[i][j] - 1 if the human word carried the "@g" (German) annotation, else 0
human_words = []
stt_words = []
word_classes = []

for entry in data:
    entry_stt_words = []
    entry_hum_words = []
    entry_classes = []

    for word in entry['words']:
        humw = word['human_word']
        sttw = word['stt_word']

        # Remove every "@<char>" annotation marker. Stripping afterwards drops the
        # whitespace that used to separate the word from its marker; without it the
        # cleaned words keep trailing blanks and a whitespace-only remainder would
        # slip past the empty-word check below.
        clean_humw = re.sub(r'@.', '', humw).strip()

        # if either stt or human transcript don't contain a word, skip it
        # TODO: we might want to split the phrases if we use this for features
        if clean_humw == '' or sttw == '':
            continue

        # Only the German annotation is turned into a label; other annotation
        # types are simply removed from the word and labelled 0.
        # TODO: we can also add different markings for each type of mistake for the learning algorithm later
        entry_hum_words.append(clean_humw)
        entry_classes.append(1 if '@g' in humw else 0)

        # stt words don't have to be cleaned
        entry_stt_words.append(sttw)

    human_words.append(entry_hum_words)
    stt_words.append(entry_stt_words)
    word_classes.append(entry_classes)

In [14]:
# Print every aligned pair in which both the human side and the STT side
# consist of more than one whitespace-separated token.
for hum_sentence, stt_sentence in zip(human_words, stt_words):
    for hum_word, stt_word in zip(hum_sentence, stt_sentence):
        if len(hum_word.split()) <= 1 or len(stt_word.split()) <= 1:
            continue
        print(hum_word, stt_word)

cool i  coo like
deine lustigste  an lusting
cold have  call that
grandmother there  grand motor where
book in  book in
there is there   thirst where
ter with  to rift in
einkaufszentrum there s   uncas centrum verse
thing but  thing but
i am from   ive rome
the strawberry cake   drapery kick
have pancakes  happen case
candy cates  can indicate
doesn t  does not
anything else  a saga
chocolate chewing  tugler tin
a nice  an it
one centimeter  once timet
or jonglieren  in what
got the  go to
sind getrennt  singe trent
yeah my  am i
she is  she is
reinfa one 4   ranand 0
but it  but it
come be to   compete is
a sun cream   some tree
cola or  coal for
it s cold   is called
it s blue   is clue
eh eine  a iies
we draw  which rage
w reiten wiff   writen with
your cold  you are could
my picture  a romantic there
vorha nge are   for hanging or
play this als   ladies as
i love  il of
eisba r  ice bear
eisba r  ice bear
eisba r  ice bear
eisba r  ice pear
eisba r  ice care
it is  i days
lollipop

In [9]:
# Print each STT entry that spans several whitespace-separated tokens and
# whose class label is truthy.
for stt_sentence, sentence_classes in zip(stt_words, word_classes):
    flagged_phrases = (
        stt_word
        for stt_word, word_class in zip(stt_sentence, sentence_classes)
        if len(stt_word.split()) > 1 and word_class
    )
    for stt_phrase in flagged_phrases:
        print(stt_phrase)

an lusting
kinder wagon
towber stop
uncap centrum
to rift in
uncas centrum verse
yonkhol sudden
iron coups pockets
cafe test
in what
ipl turn
if turn
singe trent
matematic ye
open a
least cushman this
a an
be note
home fall
talk in
i fur yes
ay and
fall in
writen with
disco coukl
for hanging or
ladies as
was her
ice bear
ice bear
ice bear
ice pear
ice care
short fit
on tenden
la krits
gale la
regan tirm
leaves to
vegas the
reference on the
such mute
up fag
back this
gift i
like see
i got
for shena
what is
i stood
bridge feel
over romped
gregson lire
be lighting in
princess in
free seer
of nahara
no i
him a at
we feel
toe shower
eyes were
ice pear
a sushi
p 0
on ossano
pull over
ash bilde
winfast north
fasbilta ronda
it is
duncle blow
10412 okay
for an
our otho
mettle altar
cottonge beele
coffer in dren
and yes
is alternate
bomb on
body an took
body hose
body howison
both hasten
bade an took
or ranch order
bad motor
do the as
so and
full hung
fore hung
launch we
stagei with
advise okay


A few concerns:
1. Should we ignore other @ annotations, or can they be useful features for network training?
2. Some word entries actually contain multiple words, should we split them? If so, how do we keep the correspondence to the human transcript?
3. Do we use the human transcript for anything other than annotation extraction?

In [15]:
# Flat list of every raw (still annotated) human word in the corpus.
word_list = [item['human_word'] for record in data for item in record['words']]

# Distinct characters that appear directly after an '@' in those words.
characters_after_at = {
    char
    for human_word in word_list
    for char in re.findall(r'@(.?)', human_word)
}

In [16]:
# Display the set of characters found directly after '@' in the human words.
characters_after_at

{'!', '?', 'g'}

Only the characters `!`, `?` and `g` occur after `@` in the annotations.

Let's try another way, with feature and word extraction:

In [27]:
    # Per-entry parallel lists with one item per whitespace-separated STT token.
    human_words = []  # raw human word (annotations kept), repeated for each aligned STT token
    stt_words = []  # individual STT tokens
    word_labels = []  # language labels: True where the human word contains "@g"
    word_sems = []  # semantical errors: True where the human word contains "@?"
    word_grams = []  # grammatical errors: True where the human word contains "@!"

    for entry in data:
        entry_stt_words, entry_hum_words = [], []
        entry_labels, entry_grams, entry_sems = [], [], []

        for word in entry["words"]:
            humw = word["human_word"]
            stt_tokens = word["stt_word"].split()
            repeat = len(stt_tokens)

            # Every STT token of this word inherits the same human word and flags,
            # so each per-word value is repeated once per token. A word whose STT
            # side has no tokens contributes nothing.
            entry_stt_words.extend(stt_tokens)
            entry_hum_words.extend([humw] * repeat)
            entry_labels.extend(["@g" in humw] * repeat)
            entry_grams.extend(["@!" in humw] * repeat)
            entry_sems.extend(["@?" in humw] * repeat)

        human_words.append(entry_hum_words)
        stt_words.append(entry_stt_words)
        word_labels.append(entry_labels)
        word_grams.append(entry_grams)
        word_sems.append(entry_sems)

I'm not sure if we want to use these features, since the classification task is concerned only with STT transcripts, not human-annotated data.