<a href="https://colab.research.google.com/github/easeto/amira-learning/blob/main/classify_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [65]:
import pandas as pd
import string
import re
import numpy as np

In [66]:
# get authentication from google drive
from google.colab import auth
auth.authenticate_user()

import gspread
from oauth2client.client import GoogleCredentials

gc = gspread.authorize(GoogleCredentials.get_application_default())

In [67]:
# open the google sheet
sheet = gc.open('classify-error').sheet1

# get_all_values gives a list of rows.
rows = sheet.get_all_values()
df = pd.DataFrame.from_records(rows)
df.columns = df.iloc[0]
df = df.drop(0)

In [69]:
# drop rows that haven't been classified
from numpy import nan
nan_value = float("NaN")
df.replace("", nan_value, inplace=True)
df.dropna(subset = ['error_label_1'], inplace=True)


In [70]:
"""
Compute the Damerau-Levenshtein distance between two given
strings (s1 and s2)
"""
def damerau_levenshtein_distance(s1, s2):
    d = {}
    lenstr1 = len(s1)
    lenstr2 = len(s2)
    for i in range(-1,lenstr1+1):
        d[(i,-1)] = i+1
    for j in range(-1,lenstr2+1):
        d[(-1,j)] = j+1

    for i in range(lenstr1):
        for j in range(lenstr2):
            if s1[i] == s2[j]:
                cost = 0
            else:
                cost = 1
            d[(i,j)] = min(
                           d[(i-1,j)] + 1, # deletion
                           d[(i,j-1)] + 1, # insertion
                           d[(i-1,j-1)] + cost, # substitution
                          )
            if i and j and s1[i]==s2[j-1] and s1[i-1] == s2[j]:
                d[(i,j)] = min (d[(i,j)], d[i-2,j-2] + cost) # transposition

    return d[lenstr1-1,lenstr2-1]/((lenstr1+lenstr2)/2)

In [71]:
"""
Check if the two words are similar at the 0.4 threshold
"""
def is_similar(s1, s2):
  if damerau_levenshtein_distance(s1,s2) > 0.4:
    return True
  return False

In [72]:
"""
Remove punctuation 
"""
import re
def remove_punc(strlist):
  result = []
  for word in strlist:
    res = ''.join(e for e in word if e.isalnum())
    result.append(res)
  return result

Data Processing

In [73]:
# transform string data type to list
df['story_text'] = df.apply(lambda x: list(x.story_text.split(",")), axis = 1)
df['transcript'] = df.apply(lambda x: list(x.transcript.split(",")), axis = 1)
df['aligned_text'] = df.apply(lambda x: list(x.aligned_text.split(",")), axis = 1)
df['aligned_transcript'] = df.apply(lambda x: list(x.aligned_transcript.split(",")), axis = 1)
df['error_index'] = df.apply(lambda x: list(x.error_index.split(",")), axis = 1)


In [74]:
# remove punctuations in list
df['story_text'] = df.apply(lambda x: remove_punc(x.story_text), axis = 1)
df['transcript'] = df.apply(lambda x: remove_punc(x.transcript), axis = 1)
df['aligned_text'] = df.apply(lambda x: remove_punc(x.aligned_text), axis = 1)
df['aligned_transcript'] = df.apply(lambda x: remove_punc(x.aligned_transcript), axis = 1)
df['error_index'] = df.apply(lambda x: remove_punc(x.error_index), axis = 1)

In [75]:
def get_error_word(aligned_text, aligned_transcript, error_index):
  """
  function to get the words that are incorrect in the transcript based on the index. 
  :param aligned_text: the text that was showed to students aligned to the transcript of what the students said
  :param aligned_transcript: the transcript of what the student said aligned to the text that was showed
  :param error_inex: the index of the error in aligned text/transcript (aligned text and transcript have the same length)
  :return: index, the word that is correct, the word that is incorrectly said
  """
  result = []
  for index in error_index:
    comp = []
    comp.append(index)
    comp.append(aligned_text[index])
    comp.append(aligned_transcript[index])
    result.append(comp)
  return result

In [76]:
def to_int(index):
  result = []
  for i in index:
    i = int(i)
    result.append(i)
  return result


In [77]:
df['error_index'] =  df.apply(lambda x: to_int(x.error_index), axis = 1)
df['error_words'] = df.apply(lambda x: get_error_word(x.aligned_text, x.aligned_transcript, x.error_index), axis = 1)

Classification Algorithm


In [97]:
def classify(error_words, story_text, transcript):
    """
    A function for classifying the error types
    :param error_word (a list of error index, correct word, incorrect word)
    :param story_text (the sentence that is showed to students)
    :param transcript (the sentence that the students said)
    :return result: index of the error in the story_text and the error type
    """
    result = []
    for error in error_words:
      text = error[1] 
      trans = error[2]
      res = []

      # if the text is empty
      if text == '':
        if trans in story_text and trans in transcript: # student already said the correct word in transcript
          e = 'correct - repetition'
          id = story_text.index(trans)
        elif trans in story_text and trans not in transcript: # student made a close attempt before this word and got this word correctly
          e = 'correct - self-correction'
          id = story_text.index(trans)
        else: # student said something unrelated and cannot be found in the text
          e = 'miscue - unrelated chatter'
          id = error[0]
      
      # if the transcript is empty -> student did not make an attempt at saying the word -> skip
      elif trans == '':
        e = 'miscue - skip'
        id = story_text.index(text)
      
      # if the text and transcript words are aligned
      else:
        if trans in story_text: # if the word is already said -> self-repetition
          e = 'correct - repetition'
          id = story_text.index(trans)
        else:
          if is_similar(text, trans): # if the text and transcript sound/look similar enough
            if text in transcript: # and if the correct word is also said in the transcript -> self-correction
              e = 'correct - self-correction'
              id = story_text.index(text)
            else: # if the correct word was not said -> student made an attempt at saying something similar -> substition
              e = 'miscue - substitution'
              id = error[0]
          else: # if the aligned text and  aligned transcript do not look/sound similar -> no attempt was made -> unrelated chatter
            e = 'miscue - unrelated chatter'
            id = error[0]

      
      res.append(id)
      res.append(trans)
      res.append(e)
      result.append(res)
      
    return result




In [79]:
df['classify'] = df.apply(lambda x: classify(x.error_words, x.story_text, x.transcript), axis = 1)

In [80]:
df[['story_text', 'transcript', 'classify']]

Unnamed: 0,story_text,transcript,classify
1,"[no, it, wasnt, there]","[no, it, was, there]","[[2, miscue - substitution]]"
2,"[baby, bunny, is, a, boy]","[baby, bunny, a, boy]","[[2, miscue - skip]]"
3,"[i, love, bugs]","[i, i]","[[1, miscue - skip], [0, correct - repetition]]"
4,"[the, mouse, ran, down]","[the, mouse, down]","[[2, miscue - skip]]"
5,"[i, need, dad]","[dad, dad]","[[0, miscue - skip], [2, correct - repetition]]"
...,...,...,...
563,"[have, this, wish, i, wish, tonight]","[have, that, have, this, wish, i, wish, tonight]","[[0, correct - repetition], [1, miscue - unrel..."
564,[boot],"[boot, boot]","[[0, correct - repetition]]"
565,"[do, you, know, the, muffin, man]","[do, you, know, the, muffin, the, muffin]","[[3, correct - repetition], [4, correct - repe..."
566,[bird],"[bird, bird]","[[0, correct - repetition]]"


Test Model Accuracy


In [81]:
label = df.columns[df.columns.str.startswith('error_label_')]
#df.drop(unwanted, axis=1, inplace=True)

In [82]:
split_df = pd.DataFrame(df['classify'].tolist(), columns=['e1', 'e2', 'e3', 'e4', 'e5', 'e6', 'e7'])

In [83]:
split_df

Unnamed: 0,e1,e2,e3,e4,e5,e6,e7
0,"[2, miscue - substitution]",,,,,,
1,"[2, miscue - skip]",,,,,,
2,"[1, miscue - skip]","[0, correct - repetition]",,,,,
3,"[2, miscue - skip]",,,,,,
4,"[0, miscue - skip]","[2, correct - repetition]",,,,,
...,...,...,...,...,...,...,...
515,"[0, correct - repetition]","[1, miscue - unrelated chatter]",,,,,
516,"[0, correct - repetition]",,,,,,
517,"[3, correct - repetition]","[4, correct - repetition]",,,,,
518,"[0, correct - repetition]",,,,,,


In [84]:
df_label = df[label]

In [85]:
df_label

Unnamed: 0,error_label_1,error_label_2,error_label_3,error_label_4,error_label_5,error_label_6,error_label_7
1,miscue - substitution,,,,,,
2,correct - repetition,,,,,,
3,miscue - substitution,,,,,,
4,miscue - skip,,,,,,
5,miscue - skip,miscue - substitution,,,,,
...,...,...,...,...,...,...,...
563,miscue - substitution,,,,,,
564,correct - repetition,,,,,,
565,miscue - substitution,correct - repetition,,,,,
566,correct - repetition,,,,,,


Back-end Functions

In [86]:
# we can use alignment for checking correct sentences. If classification = correct but not aligned, that means repetition or self-correct?
def match_score(alpha, beta, gap_penalty):
    """
    A function for determining the score between any two bases in alignment; helper function for needleman_wunsch
    :param alpha
    :param beta
    :param gap_penalty
    :return: mismatch_penalty (int)
    """
    match_award = 1
    mismatch_penalty = -1  # - scipy.spatial.distance.cosine(embeddings_map[alpha], embeddings_map[beta])
    if alpha == beta:
        return match_award
    elif alpha == '-' or beta == '-':
        return gap_penalty
    else:
        return mismatch_penalty

def needleman_wunsch(seq1, seq2):
    """
    :param seq1: First word sequence in list form (str[])
    :param seq2: Second word sequence in list form (str[])
    :return: Two lists corresponding to seq1 and seq2, respectively, that have placed the optimally aligned words in
    matching indices and padded using '-'.
    Example:
    Input
    seq1 = ['every', 'day', 'during', 'recess', 'mary', 'jason', 'and', 'their', 'classmates', 'played', 'together']
    seq2 = ['every', 'day', 'during', 'recess', 'mary', 'jason', 'and', 'the', 'their', 'classmates', 'played',
    'together', 'adam', 'day', 'played']
    Output
    align1 = ['every', 'day', 'during', 'recess', 'mary', 'jason', 'and', '-', 'their', 'classmates', 'played',
    'together', '-', '-', '-']
    align2 = ['every', 'day', 'during', 'recess', 'mary', 'jason', 'and', 'the', 'their', 'classmates', 'played',
    'together', 'adam', 'day', 'played']
    """
    # standard values
    gap_penalty = -1

    # store length of two sequences
    n = len(seq1)
    m = len(seq2)

    # generate matrix of zeros to store scores
    score = np.zeros((m + 1, n + 1))

    # calculate score table

    for i in range(0, m + 1):  # fill out first column
        score[i][0] = gap_penalty * i

    for j in range(0, n + 1):  # fill out first row
        score[0][j] = gap_penalty * j

    for i in range(1, m + 1):  # fill out all other values in the score matrix
        for j in range(1, n + 1):
            # calculate the score by checking the top, left, and diagonal cells
            match = score[i - 1][j - 1] + match_score(seq1[j - 1], seq2[i - 1], gap_penalty)
            delete = score[i - 1][j] + gap_penalty
            insert = score[i][j - 1] + gap_penalty

            # Record the maximum score from the three possible scores calculated above
            score[i][j] = max(match, delete, insert)

    # traceback and compute the alignment

    # create variables to store alignment
    align1 = []
    align2 = []

    # start from the bottom right cell in matrix
    i = m
    j = n

    # use i and j to keep track of where we are in the matrix, just like above
    while i > 0 and j > 0:  # end touching the top or the left edge
        score_current = score[i][j]
        score_diagonal = score[i - 1][j - 1]
        score_up = score[i][j - 1]
        score_left = score[i - 1][j]

        # figure out which cell the current score was calculated from, update i and j to correspond to that cell.
        if score_current == score_diagonal + match_score(seq1[j - 1], seq2[i - 1], gap_penalty):
            align1.append(seq1[j - 1])
            align2.append(seq2[i - 1])
            i -= 1
            j -= 1
        elif score_current == score_up + gap_penalty:
            align1.append(seq1[j - 1])
            align2.append('-')
            j -= 1
        elif score_current == score_left + gap_penalty:
            align1.append('-')
            align2.append(seq2[i - 1])
            i -= 1

    # finish tracing up to the top left cell
    while j > 0:
        align1.append(seq1[j - 1])
        align2.append('-')
        j -= 1
    while i > 0:
        align1.append('-')
        align2.append(seq2[i - 1])
        i -= 1

    # since we traversed the score matrix from the bottom right, our two sequences will be reversed.
    # these two lines reverse the order of the characters in each sequence.
    align1 = align1[::-1]
    align2 = align2[::-1]

    return align1, align2

def error(text, trans):
    error = []
    for i in range(len(text)):
        if text[i] != trans[i]:
            error.append(i)
    if not error:
        return "correct"
    return error

In [99]:
def classify_sent(text, trans):
  # convert string to list
  ltext = text.split(" ")
  ltrans = trans.split(" ")

  # align text and transcript
  aligned_text, aligned_trans = needleman_wunsch(ltext, ltrans)

  # find the index of errors
  error_index = error(aligned_text, aligned_trans)

  # if there is no error, return correct
  if (error_index == 'correct'):
    return error_index
  else:
    # find the words that have errors
    error_words = get_error_word(aligned_text, aligned_trans, error_index)
    # classify the errors
    return classify(error_words, ltext, ltrans)


Classify Front-End Data 

In [94]:
#front end data
with open('story_text.txt') as f:
    lines = f.readlines()

with open('transcript.txt') as f:
    transcript = f.readlines()

story_text = lines[0].split(". ")

for i in range(len(story_text)):
  story_text[i] = story_text[i].lower().translate(str.maketrans('', '', string.punctuation))

for i in range(len(transcript)):
  transcript[i] = transcript[i].lower().translate(str.maketrans('', '', string.punctuation)).replace("\n","").replace("  ", "")

data = {'story_text':story_text, 'transcript':transcript}
fdf = pd.DataFrame(data)

In [102]:
fdf['classify'] = fdf.apply(lambda x: classify_sent(x.story_text, x.transcript), axis = 1)
fdf

Unnamed: 0,story_text,transcript,classify
0,for spring break adam and his family were goin...,for spring break amanda and his family were go...,"[[3, amanda, miscue - substitution], [12, gran..."
1,just two days before the vacation he fell off ...,just two days before the vacation he feel off ...,"[[7, feel, miscue - unrelated chatter], [11, c..."
2,crack adam had never felt such a pain,bike crack amanda had never felt such a pain,"[[0, bike, miscue - substitution], [2, amanda,..."
3,adams mother took him to the hospital,felt such a pain edams mother took him to the ...,"[[0, felt, miscue - substitution], [1, such, m..."
4,he had to have x rays and get a shot,the hospital he had to have x rays and get a shot,"[[0, the, miscue - substitution], [1, hospital..."
5,after the x rays the doctor told adam his leg ...,after the x rays the doctor told edam his leg ...,"[[7, edam, miscue - unrelated chatter], [12, e..."
6,adam would have to spend a day in the hospital,edam would have to spend a day in the hospital,"[[0, edam, miscue - unrelated chatter]]"
7,the doctor put a cast on adams leg and said ad...,the doctor put a cast on edams leg and said ed...,"[[6, edams, miscue - unrelated chatter], [10, ..."
8,adam would also have to learn to walk with cru...,edam would also have to learn to walk without ...,"[[0, edam, miscue - unrelated chatter], [8, wi..."
9,the crutches were easy to use and adams friend...,crutch the crutch were easy to use and edams f...,"[[0, crutch, miscue - substitution], [2, crutc..."


In [118]:
# get a table about error
error = fdf['classify'].tolist()
sent_index = []
error_index = []
error_word = []
error_type = []
for s_index in range(len(error)):
  for es in error[s_index]:
    sent_index.append(s_index)
    error_index.append(es[0])
    error_word.append(es[1])
    error_type.append(es[2])

In [121]:
error_data = {'sent_index':sent_index, 'error_index':error_index, 'error_word':error_word,'error_type':error_type}
error_df = pd.DataFrame(error_data)
error_df

Unnamed: 0,sent_index,error_index,error_word,error_type
0,0,3,amanda,miscue - substitution
1,0,12,grandpa,miscue - substitution
2,1,7,feel,miscue - unrelated chatter
3,1,11,crack,miscue - substitution
4,1,12,amanda,miscue - substitution
...,...,...,...,...
67,18,12,it,miscue - substitution
68,18,13,would,miscue - substitution
69,18,14,be,miscue - substitution
70,19,0,at,miscue - substitution


In [124]:
# count error types
error_df['error_type'].value_counts()

miscue - substitution         57
miscue - unrelated chatter    14
correct - repetition           1
Name: error_type, dtype: int64

In [126]:
# list of substitution errors
error_df[error_df['error_type'] == 'miscue - substitution']['error_word']

0       amanda
1      grandpa
3        crack
4       amanda
5         bike
6       amanda
7         felt
8         such
9            a
10        pain
12          he
13         had
14         the
15    hospital
17        edam
22     without
23      crutch
24         the
25      crutch
26        were
27      crutch
31        help
32       visit
33         him
34        help
35        help
37          he
38     thought
39      spring
40        felt
41         bad
43        that
44       meant
45        days
46        edam
48        this
49        edam
50      filber
51         the
52        next
53         day
55         his
56         mam
57        went
58          at
59         the
60        door
61        adam
62      looked
63        open
64          it
66       smile
67          it
68       would
69          be
70          at
71         him
Name: error_word, dtype: object

In [127]:
# list of unrelated chatter errors
error_df[error_df['error_type'] == 'miscue - unrelated chatter']['error_word']

2       feel
11     edams
16      edam
18      edam
19     edams
20      edam
21      edam
28    crutch
29     edams
36      edam
42      edam
47     edams
54      edam
65      edam
Name: error_word, dtype: object

In [128]:
# list of repetition errors
error_df[error_df['error_type'] == 'correct - repetition']['error_word']

30    the
Name: error_word, dtype: object