In [3]:
#!/usr/bin/env python3
# coding: utf-8
""" Usage:
    python3 thai-student-stat.py directory
"""

' Usage:\n    python3 thai-student-stat.py directory\n'

In [4]:
import glob
import tgt
import numpy as np
from sklearn.metrics import confusion_matrix
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt
import edit_distance

# Phone-label inventories written as IPA symbols. None of the cells visible
# here reference them (presumably used by later analysis cells -- TODO confirm).
VOWELS = [
    'i', 'y', 'u', 'ɨ', 'ɤ', 'e',
    'ɛ', 'ə', 'o', 'ɔ', 'a', 'ɚ',
]
CONSONS = [
    'p', 'pʰ', 't', 'tʰ', 'k', 'kʰ',
    'ts', 'tsʰ', 'tʂ', 'tʂʰ', 'tɕ', 'tɕʰ',
    'f', 'ɕ', 'x', 's', 'ʂ', 'ʐ',
    'm', 'n', 'ŋ', 'l', 'ɥ', 'j', 'w',
]
# Combined label list: every vowel first, followed by every consonant.
LABELS = [*VOWELS, *CONSONS]

In [25]:
def contains_chinese(check_str):
    """
    Report whether a string contains at least one Chinese character.

    A character counts as Chinese here when it lies in the inclusive
    range U+4E00..U+9FFF.

    :param check_str: {str} the string to inspect
    :return: {bool} True if any character falls in that range, False otherwise
             (an empty string therefore gives False)
    """
    return any('\u4e00' <= char <= '\u9fff' for char in check_str)

def all_chinese(check_str):
    """
    Report whether every character of a string is a Chinese character.

    A character counts as Chinese here when it lies in the inclusive
    range U+4E00..U+9FFF.

    :param check_str: {str} the string to inspect
    :return: {bool} False as soon as one character is outside that range,
             True otherwise (an empty string therefore gives True)
    """
    return all('\u4e00' <= char <= '\u9fff' for char in check_str)

In [34]:
def _accumulate_tier_durations(tier, talk_total, sil_total):
    """Add the duration of every interval in ``tier`` to a running total.

    An interval whose text (spaces removed) is exactly 'SIL' is added to
    ``sil_total``; every other interval is added to ``talk_total``.

    NOTE(review): intervals with empty text are not 'SIL', so they are counted
    as talk -- confirm this is intended given include_empty_intervals=True.

    :param tier: iterable of intervals exposing start_time, end_time and text
    :param talk_total: running talk duration so far
    :param sil_total: running SIL duration so far
    :return: (talk_total, sil_total) updated with this tier's intervals
    """
    for interval in tier:
        label = interval.text.replace(' ', '')
        # Keep the "(total + end) - start" evaluation order so the floating
        # point sums stay identical to an interval-by-interval accumulation.
        if label == 'SIL':
            sil_total = sil_total + interval.end_time - interval.start_time
        else:
            talk_total = talk_total + interval.end_time - interval.start_time
    return talk_total, sil_total


def length_main(txtgridr, pattern="*.14", teacher_tier='IU/teacher',
                student_tier='IU/student-7'):
    """Print total talk time and SIL time for the teacher and student tiers.

    Every file matching ``pattern`` inside ``txtgridr`` is read as a UTF-8
    TextGrid (empty intervals included) and the interval durations of the two
    tiers are summed over all files.

    :param txtgridr: {str} directory containing the TextGrid files
    :param pattern: {str} glob pattern for the files, relative to ``txtgridr``
    :param teacher_tier: {str} name of the teacher tier
    :param student_tier: {str} name of the student tier
    :return: None; the file list and the four totals are printed
    """
    # glob returns matches in arbitrary order; sorting keeps the printed list
    # and the float accumulation order reproducible across platforms.
    txtlist = sorted(glob.glob(txtgridr + "/" + pattern))
    print("File list", txtlist)
    tlen = tsilen = 0
    slen = ssilen = 0
    for txtgridf in txtlist:
        tg = tgt.io.read_textgrid(txtgridf, encoding='utf-8', include_empty_intervals=True)
        # Look both tiers up before accumulating, so a missing tier fails
        # before any total is touched for this file.
        teacher = tg.get_tier_by_name(teacher_tier)
        student = tg.get_tier_by_name(student_tier)

        tlen, tsilen = _accumulate_tier_durations(teacher, tlen, tsilen)
        slen, ssilen = _accumulate_tier_durations(student, slen, ssilen)
    print("Teacher talk:", tlen, "sec, SIL", tsilen, "sec")
    print("Student talk:", slen, "sec, SIL", ssilen, "sec")

In [35]:
def _count_short_chinese_items(tier):
    """Count the one- and two-character all-Chinese intervals in ``tier``.

    Each interval's text is stripped of '*', '^' and spaces first; it is then
    counted only if ``all_chinese`` accepts it and its length is 1 or 2.

    :param tier: iterable of intervals exposing a ``text`` attribute
    :return: (one_char_count, two_char_count)
    """
    one_char = 0
    two_char = 0
    for interval in tier:
        txt = interval.text.replace('*', '').replace('^', '').replace(' ', '')
        if not all_chinese(txt):
            continue
        # An empty string passes all_chinese but has length 0, so it is
        # counted in neither bucket.
        if len(txt) == 1:
            one_char += 1
        elif len(txt) == 2:
            two_char += 1
    return one_char, two_char


def words_main(txtgridr, pattern="*.14", teacher_tier='IU/teacher',
               student_tier='IU/student-7'):
    """Print how many one- and two-character Chinese items each tier has.

    Every file matching ``pattern`` inside ``txtgridr`` is read as a UTF-8
    TextGrid (empty intervals included) and the counts of the teacher and
    student tiers are summed over all files.

    :param txtgridr: {str} directory containing the TextGrid files
    :param pattern: {str} glob pattern for the files, relative to ``txtgridr``
    :param teacher_tier: {str} name of the teacher tier
    :param student_tier: {str} name of the student tier
    :return: None; the file list and the four counts are printed
    """
    # glob returns matches in arbitrary order; sorting keeps the printed
    # file list reproducible across platforms.
    txtlist = sorted(glob.glob(txtgridr + "/" + pattern))
    print("File list", txtlist)
    t1count = t2count = 0
    s1count = s2count = 0
    for txtgridf in txtlist:
        tg = tgt.io.read_textgrid(txtgridf, encoding='utf-8', include_empty_intervals=True)
        # Look both tiers up before counting, so a missing tier fails before
        # any count is touched for this file.
        teacher = tg.get_tier_by_name(teacher_tier)
        student = tg.get_tier_by_name(student_tier)

        one_char, two_char = _count_short_chinese_items(teacher)
        t1count += one_char
        t2count += two_char

        one_char, two_char = _count_short_chinese_items(student)
        s1count += one_char
        s2count += two_char

    print("Teacher single word:", t1count, "2-words", t2count)
    print("Student single word:", s1count, "2-words", s2count)

In [36]:
# Print the one- and two-character counts for the files under the relative 'data' directory.
words_main('data')

File list ['data\\ZOOM0001_LR.TextGrid.14', 'data\\ZOOM0002_LR.TextGrid.14', 'data\\ZOOM0003_LR.TextGrid.14', 'data\\ZOOM0004_LR.TextGrid.14', 'data\\ZOOM0004_做實驗.TextGrid.14', 'data\\ZOOM0005_LR-1.TextGrid.14', 'data\\ZOOM0005_LR.TextGrid.14', 'data\\ZOOM0007_Jinghuang.TextGrid.14', 'data\\ZOOM0007_LR.TextGrid.14', 'data\\ZOOM0007_LR_TextGrid.14', 'data\\ZOOM0008_test_one_by_one.TextGrid.14', 'data\\ZOOM0009_LR.TextGrid.14']
Teacher single word: 326 2-words 416
Student single word: 272 2-words 604


In [37]:
# Print the talk and SIL duration totals for the files under the relative 'data' directory.
length_main('data')


File list ['data\\ZOOM0001_LR.TextGrid.14', 'data\\ZOOM0002_LR.TextGrid.14', 'data\\ZOOM0003_LR.TextGrid.14', 'data\\ZOOM0004_LR.TextGrid.14', 'data\\ZOOM0004_做實驗.TextGrid.14', 'data\\ZOOM0005_LR-1.TextGrid.14', 'data\\ZOOM0005_LR.TextGrid.14', 'data\\ZOOM0007_Jinghuang.TextGrid.14', 'data\\ZOOM0007_LR.TextGrid.14', 'data\\ZOOM0007_LR_TextGrid.14', 'data\\ZOOM0008_test_one_by_one.TextGrid.14', 'data\\ZOOM0009_LR.TextGrid.14']
Teacher talk: 4296.573470127137 sec, SIL 3829.516688603022 sec
Student talk: 2106.045540314407 sec, SIL 6020.04461841571 sec
