In [1]:
import os
from collections import defaultdict
import process_childes

In [2]:
test_file = "Mandarin/ChangPlay/3/under45ut/001.cha"
test_folder = "Mandarin"

In [3]:
child_utts = process_childes.extract_child_utterances_by_age(test_file)
for age, utts in child_utts.items():
    print(f"Age {age}: {len(utts)} utterances")
    print("Sample:", utts[:2])

Age 3: 43 utterances
Sample: ['這 可以 裝 東西 .', 'xxx .']


In [4]:
adult_utts = process_childes.extract_adult_utterances(test_file)
print(f"Adult utterances: {len(adult_utts)}")
print("Sample:", adult_utts[:2])

Adult utterances: 202
Sample: ['那 個 [/] 那個 要 怎樣 玩 ?', '你 教 媽媽 啊 .']


In [5]:
combined = process_childes.extract_utterances_by_age_and_adult(test_file)
print(f"Child groups: {list(combined['child'].keys())}")
print(f"Adult utterances: {len(combined['adult'])}")

Child groups: [3]
Adult utterances: 202


In [6]:
child_gra = process_childes.get_child_utterance_gra_by_age(test_file)
for age, pairs in child_gra.items():
    print(f"Age {age}: {len(pairs)} utterance+gra pairs")
    print("Sample:", pairs[:1])


Age 3: 43 utterance+gra pairs
Sample: [('這 可以 裝 東西 .', '1|3|SUBJ 2|3|AUX 3|0|ROOT 4|3|OBJ 5|3|PUNCT')]


In [7]:
adult_gra = process_childes.get_adult_utterance_gra(test_file)
print(f"Adult utterance+gra pairs: {len(adult_gra)}")
print("Sample:", adult_gra[:1])

Adult utterance+gra pairs: 185
Sample: [('那 個 [/] 那個 要 怎樣 玩 ?', '1|3|DET 2|1|SFP 3|0|ROOT 4|3|OBJ 5|3|SRL 6|3|PUNCT')]


In [8]:
combined_gra = process_childes.get_all_utterance_gra_by_group(test_file)
print(f"Child ages: {list(combined_gra['child'].keys())}")
print(f"Adult pairs: {len(combined_gra['adult'])}")

Child ages: [3]
Adult pairs: 185


In [9]:
folder_utts = process_childes.extract_utterances_by_age_and_adult_folder(test_folder)
print(f"Folder-level child ages: {list(folder_utts['child'].keys())}")
print(f"Folder-level adult utterances: {len(folder_utts['adult'])}")

Folder-level child ages: [4, 5, 6, 3, 2, 7, 8, 9, 10, 1, 0]
Folder-level adult utterances: 413087


In [10]:
folder_gra = process_childes.get_all_utterance_gra_by_group_folder(test_folder)
print(f"Folder-level child ages: {list(folder_gra['child'].keys())}")
print(f"Folder-level adult utterance+gra pairs: {len(folder_gra['adult'])}")

Folder-level child ages: [4, 5, 6, 3, 2, 7, 8, 9, 10, 1, 0]
Folder-level adult utterance+gra pairs: 308322


In [11]:
folder_gra['child'][5]

[('嗯 嗯 . \x151097_2217\x15', '1|0|INCROOT 2|1|SFP 3|1|PUNCT'),
 ('我 不 知道 . \x153089_5489\x15', '1|3|SUBJ 2|3|JCT 3|0|ROOT 4|3|PUNCT'),
 ('醫生 的 東西 . \x1513161_15241\x15', '1|3|MOD 2|1|POSS 3|0|INCROOT 4|3|PUNCT'),
 ('嗯 . \x1521913_22577\x15', '1|0|INCROOT 2|1|PUNCT'),
 ('嗯 . \x1532361_33393\x15', '1|0|INCROOT 2|1|PUNCT'),
 ('嗯 . \x1533393_34737\x15', '1|0|INCROOT 2|1|PUNCT'),
 ('嗯 . \x1544738_46485\x15', '1|0|INCROOT 2|1|PUNCT'),
 ('<這 是 > [/] 這 是 <一 個 > [/] 一 個 東西 . \x1549429_62781\x15',
  '1|2|SUBJ 2|0|ROOT 3|4|QUANT 4|5|CLASS 5|2|PRED 6|2|PUNCT'),
 ('因為 看 醫生 就 要 用 冰 的 東西 . \x1565434_74562\x15',
  '1|2|JCT 2|0|ROOT 3|2|OBJ 4|6|JCT 5|6|AUX 6|2|SRL 7|9|MOD 8|7|POSS'),
 ('發燒 . \x1584337_85601\x15', '1|0|ROOT 2|1|PUNCT'),
 ('看 用 這 個 . \x1589441_93433\x15',
  '1|0|ROOT 2|1|JCT 3|4|DET 4|2|PREPO 5|1|PUNCT'),
 ('打 耳朵 . \x1594425_96497\x15', '1|0|ROOT 2|1|OBJ 3|1|PUNCT'),
 ('四 十 . \x15122433_122937\x15', '1|2|QUANT 2|0|INCROOT 3|2|PUNCT'),
 ('也 有 . \x15131825_133377\x15', '1|2|JCT 2|0|ROOT 3|

In [14]:
cleaned_simple_data = process_childes.clean_chinese_utterances_simple(folder_utts)
# Simple version
for age, items in cleaned_simple_data["child"].items():
    print(f"Child age {age}: {items[:2]}")
    break

print("Adult:", cleaned_simple_data["adult"][:2])

Child age 4: ['太 簡單 了', '太 簡單 了']
Adult: ['太 難 的 你 又 不 會', '太 簡單']


In [17]:
cleaned_gra_data = process_childes.clean_chinese_utterances(folder_gra)
                                                                     # GRA version
for age, items in cleaned_gra_data["child"].items():
    print(f"Child age {age}: {items[:2]}")
    break

print("Adult:", cleaned_gra_data["adult"][:2])


Child age 4: [('太 簡單 了', '1|2|JCT 2|0|INCROOT 3|2|ASP 4|2|PUNCT'), ('太 簡單 了', '1|2|JCT 2|0|INCROOT 3|2|ASP 4|2|PUNCT')]
Adult: [('太 難 的 你 又 不 會', '1|2|JCT 2|0|BEG 3|2|LINK 4|2|BEGP 5|8|SUBJ 6|8|JCT 7|8|JCT 8|0|ROOT'), ('太 簡單', '1|2|JCT 2|0|INCROOT 3|2|PUNCT')]
