In [147]:
import pandas as pd
import string
import re
import numpy as np

In [148]:
#data processing
df = pd.read_excel("/Users/tduong/OneDrive - TnS/FALL 2021/Amira Learning/Data/Transcripts.xlsx")
df = df[['activity_id', 'story_text', 'expected_word', 'transcript', 'label - Ran\'s Examples']]


In [149]:
# we can use alignment for checking correct sentences. If classification = correct but not aligned, that means repetition or self-correct?
def match_score(alpha, beta, gap_penalty):
    """
    A function for determining the score between any two bases in alignment; helper function for needleman_wunsch
    :param alpha
    :param beta
    :param gap_penalty
    :return: mismatch_penalty (int)
    """
    match_award = 1
    mismatch_penalty = -1  # - scipy.spatial.distance.cosine(embeddings_map[alpha], embeddings_map[beta])
    if alpha == beta:
        return match_award
    elif alpha == '-' or beta == '-':
        return gap_penalty
    else:
        return mismatch_penalty

def needleman_wunsch(seq1, seq2):
    """
    :param seq1: First word sequence in list form (str[])
    :param seq2: Second word sequence in list form (str[])
    :return: Two lists corresponding to seq1 and seq2, respectively, that have placed the optimally aligned words in
    matching indices and padded using '-'.
    Example:
    Input
    seq1 = ['every', 'day', 'during', 'recess', 'mary', 'jason', 'and', 'their', 'classmates', 'played', 'together']
    seq2 = ['every', 'day', 'during', 'recess', 'mary', 'jason', 'and', 'the', 'their', 'classmates', 'played',
    'together', 'adam', 'day', 'played']
    Output
    align1 = ['every', 'day', 'during', 'recess', 'mary', 'jason', 'and', '-', 'their', 'classmates', 'played',
    'together', '-', '-', '-']
    align2 = ['every', 'day', 'during', 'recess', 'mary', 'jason', 'and', 'the', 'their', 'classmates', 'played',
    'together', 'adam', 'day', 'played']
    """
    # standard values
    gap_penalty = -1

    # store length of two sequences
    n = len(seq1)
    m = len(seq2)

    # generate matrix of zeros to store scores
    score = np.zeros((m + 1, n + 1))

    # calculate score table

    for i in range(0, m + 1):  # fill out first column
        score[i][0] = gap_penalty * i

    for j in range(0, n + 1):  # fill out first row
        score[0][j] = gap_penalty * j

    for i in range(1, m + 1):  # fill out all other values in the score matrix
        for j in range(1, n + 1):
            # calculate the score by checking the top, left, and diagonal cells
            match = score[i - 1][j - 1] + match_score(seq1[j - 1], seq2[i - 1], gap_penalty)
            delete = score[i - 1][j] + gap_penalty
            insert = score[i][j - 1] + gap_penalty

            # Record the maximum score from the three possible scores calculated above
            score[i][j] = max(match, delete, insert)

    # traceback and compute the alignment

    # create variables to store alignment
    align1 = []
    align2 = []

    # start from the bottom right cell in matrix
    i = m
    j = n

    # use i and j to keep track of where we are in the matrix, just like above
    while i > 0 and j > 0:  # end touching the top or the left edge
        score_current = score[i][j]
        score_diagonal = score[i - 1][j - 1]
        score_up = score[i][j - 1]
        score_left = score[i - 1][j]

        # figure out which cell the current score was calculated from, update i and j to correspond to that cell.
        if score_current == score_diagonal + match_score(seq1[j - 1], seq2[i - 1], gap_penalty):
            align1.append(seq1[j - 1])
            align2.append(seq2[i - 1])
            i -= 1
            j -= 1
        elif score_current == score_up + gap_penalty:
            align1.append(seq1[j - 1])
            align2.append('-')
            j -= 1
        elif score_current == score_left + gap_penalty:
            align1.append('-')
            align2.append(seq2[i - 1])
            i -= 1

    # finish tracing up to the top left cell
    while j > 0:
        align1.append(seq1[j - 1])
        align2.append('-')
        j -= 1
    while i > 0:
        align1.append('-')
        align2.append(seq2[i - 1])
        i -= 1

    # since we traversed the score matrix from the bottom right, our two sequences will be reversed.
    # these two lines reverse the order of the characters in each sequence.
    align1 = align1[::-1]
    align2 = align2[::-1]

    return align1, align2

In [150]:
#process variables for alignment
df = df[['activity_id', 'story_text', 'transcript']].drop_duplicates()
df['story_text'] = df['story_text'].str.lower() 
df['story_text'] = df.apply(lambda x: re.sub(r'[^\w\s]','',x.story_text), axis=1)
df['story_text'] = df['story_text'].str.split(' ')
df['transcript'] = df['transcript'].str.split(' ')

In [151]:
# align story_text with transcript
df['aligned_text_transcript'] = df.apply(lambda x: needleman_wunsch(x.story_text, x.transcript), axis=1)
df['aligned_text'], df['aligned_transcript'] = df.aligned_text_transcript.str


  df['aligned_text'], df['aligned_transcript'] = df.aligned_text_transcript.str


In [152]:
# a method that return the index of the errors
def error(text, trans):
    error = []
    for i in range(len(text)):
        if text[i] != trans[i]:
            error.append(i)
    if not error:
        return "correct"
    return error

In [154]:
# clasify correct and incorrect sentences
df['error'] = df.apply(lambda df: error(df.aligned_text, df.aligned_transcript), axis = 1)
df[df.error != "correct"]
#export errors
df[df.error != "correct"].to_excel("classify-error.xlsx") 

In [None]:
# classify errors
# skip - when there is a '-' in the transcript and 


 

In [252]:
# def isCorrectSent(wordList):
#     for word in wordList:
#         if word == False:
#             return False
#     return True
    
# def ensureUtf(s):
#   try:
#       if type(s) == unicode:
#         return s.encode('utf8', 'ignore')
#   except: 
#     return str(s)
    
# def skipOrPr(similarity):
#     for x in similarity:
#         if x == True:
#             return "mispronounciation"
#     return "skip"

# from pyphonetics import RefinedSoundex
# from pyphonetics import Soundex

# def isSimilar(wordList, word2check):
#     sd = Soundex()
#     result = []
#     for word in wordList:
#         result.append(sd.sounds_like(ensureUtf(word), ensureUtf(word2check)))
#     return result

In [253]:
# # if isCorrectSent then return no error or self-correction, else do the classification of skip or mispronunciation
# df['isIn'] = df.apply(lambda x: x.expected_word in x.transcript, axis = 1)
# #line up the words with algorithm
# df['transcript'] = df['transcript'].str.split(' ')
# #classify skip and classification
# df_wrong = df[df.isIn == False]
# #calculate the similarity between words 
# df_wrong['similarity'] = df_wrong.apply(lambda x: isSimilar(x.transcript, x.expected_word), axis = 1)
# df_wrong['classification'] = df_wrong.apply(lambda x: skipOrPr(x.similarity), axis = 1)
# #combine skip and mispronunciation classification to the dataset
# df_combine = df.merge(df_wrong[['activity_id','expected_word','classification']], on = ['activity_id','expected_word'], how = 'left')
# df_combine.head(15)
# #if classification is NaN, that means it is correct or self-correction
# df_combine['classification'] = df_combine['classification'].apply(lambda x: "correct" if pd.isnull(x) else x)
# df_combine.head(100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_wrong['similarity'] = df_wrong.apply(lambda x: isSimilar(x.transcript, x.expected_word), axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_wrong['classification'] = df_wrong.apply(lambda x: skipOrPr(x.similarity), axis = 1)


Unnamed: 0,activity_id,story_text,expected_word,transcript,label - Ran's Examples,isIn,classification
0,C5D712A91AEC11EC89641635D14828A5,"Ready, set, go!",ready,"[ready, set, go]",correct - no category,True,correct
1,C5D712A91AEC11EC89641635D14828A5,"Ready, set, go!",set,"[ready, set, go]",correct - no category,True,correct
2,C5D712A91AEC11EC89641635D14828A5,"Ready, set, go!",go,"[ready, set, go]",correct - no category,True,correct
3,E890F6601C7511EC89641635D14828A5,"No, it wasn't there.",no,"[no, it, was, there]",correct - no category,True,correct
4,E890F6601C7511EC89641635D14828A5,"No, it wasn't there.",it,"[no, it, was, there]",correct - no category,True,correct
...,...,...,...,...,...,...,...
95,63FC3ED71AE811EC89641635D14828A5,How I wonder what you are!,wonder,"[how, i, wonder, what, you, are]",correct - no category,True,correct
96,63FC3ED71AE811EC89641635D14828A5,How I wonder what you are!,what,"[how, i, wonder, what, you, are]",correct - no category,True,correct
97,63FC3ED71AE811EC89641635D14828A5,How I wonder what you are!,you,"[how, i, wonder, what, you, are]",correct - no category,True,correct
98,63FC3ED71AE811EC89641635D14828A5,How I wonder what you are!,are,"[how, i, wonder, what, you, are]",correct - no category,True,correct
