In [1]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [None]:
# import sys
# sys.path.append('/content/gdrive/MyDrive/Colab/punktuation-ner')

In [2]:
!pip install forgebox==0.4.18.5 pytorch_lightning sklearn-model

Collecting forgebox==0.4.18.5
  Downloading forgebox-0.4.18.5-py3-none-any.whl.metadata (9.2 kB)
Collecting pytorch_lightning
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting sklearn-model
  Downloading sklearn_model-0.0.6-py3-none-any.whl.metadata (2.7 kB)
Collecting torchmetrics>=0.7.0 (from pytorch_lightning)
  Downloading torchmetrics-1.4.1-py3-none-any.whl.metadata (20 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch_lightning)
  Downloading lightning_utilities-0.11.7-py3-none-any.whl.metadata (5.2 kB)
Downloading forgebox-0.4.18.5-py3-none-any.whl (92 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.8/92.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytorch_lightning-2.4.0-py3-none-any.whl (815 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m815.2/815.2 kB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sklearn_model-0.0.6-py3-none-any.whl (6.6 kB)
Downloading lightni

# Punctuation NER

In [3]:
# Forgebox Imports
from forgebox.imports import *
from forgebox.category import Category
import pytorch_lightning as pl
from transformers import AutoTokenizer, BertForTokenClassification, BertModel
from transformers import pipeline
from typing import List
import re
from torch.utils.data import DataLoader, Dataset

In [4]:
DATA = r'/content/gdrive/MyDrive/Colab/punktuation-ner/data'

In [79]:
# In data/couplets/Chinese_couplet_dataset_sample_2K.tsv, the format is as follows:
# first,second,label
# 明 有 通 人 著 赤 雅                                                                            ,汉 称 孝 子 褒 黄 香,0000000
# 栖 霞 山 上 栖 霞 寺                                                                            ,建 业 城 中 建 业 人,0000000
# 修 身 如 执 玉                                                                                  ,行 善 胜 遗 金,00000
# 闲 云 归 岫 连 峰 暗                                                                            ,飞 瀑 垂 空 漱 石 凉,0000000
# 雪 寂 春 薄 ， 谁 怜 取 寒 香 一 抹                                                             ,夜 深 人 静 ， 伊 独 饮 醉 苦 千 般,000010000000

# Load the data and use train_test_split to split the data into training and validation sets, save the data to the data folder
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv(f"{DATA}/Chinese_couplet_dataset_sample_2K.tsv", encoding="utf-8", sep=",")

# Remove the space in the first, second
df["first"] = df["first"].apply(lambda x: x.replace(" ", ""))
df["second"] = df["second"].apply(lambda x: x.replace(" ", ""))

print("Length before: " + str(len(df)))
df = df.groupby('label').filter(lambda x: len(x) > 1)
print("Length after: " + str(len(df)))

# # Combine the first and second columns into a new column text
df["text"] = df["first"] + '\n' + df["second"]

# # Remove the first, second columns
df = df.drop(columns=["first", "second"])

df_train, df_val = train_test_split(df, test_size=0.1, random_state=42, stratify=df['label'])

with open(f"{DATA}/split/train.txt", "w") as f:
    for text in df_train["text"]:
        f.write(text + "\n")

with open(f"{DATA}/split/valid.txt", "w") as f:
    for text in df_val["text"]:
        f.write(text + "\n")

Length before: 2000
Length after: 1909


## Read Metadata

In [81]:
# META = pd.read_csv(DATA/"meta.csv")

In [82]:
LABELS = ['train.txt', 'valid.txt']

In [83]:
punkt_regex = r'[^\w\s]'

def position_of_all_punctuation(x):
    return [m.start() for m in re.finditer(punkt_regex, x)]

# simplify the punctuation
eng_punkt_to_cn_dict = {
    ".": "。",
    ",": "，",
    ":": "：",
    ";": "；",
    "?": "？",
    "!": "！",
    "“": "\"",
    "”": "\"",
    "‘": "\'",
    "’": "\'",
    "「": "（",
    "」": "）",
    "『": "\"",
    "』": "\"",
    "（": "（",
    "）": "）",
    "《": "【",
    "》": "】",
    "［": "【",
    "］": "】",
    }

def translate_eng_punkt_to_cn(char):
    if char == "0":
        return char
    if char in eng_punkt_to_cn_dict.values():
        return char
    result = eng_punkt_to_cn_dict.get(char)
    if result is None:
        return "。"
    return result

def punct_ner_pair(sentence):
    positions = position_of_all_punctuation(sentence)
    x = re.sub(punkt_regex, '', sentence)
    y = list("0"*len(x))

    for i, p in enumerate(positions):
        y[p-i-1] = sentence[p]
    p_df = pd.DataFrame({"x":list(x), "y":y})
    p_df["y"] = p_df["y"].apply(translate_eng_punkt_to_cn)
    return p_df

In [84]:
ALL_LABELS = ["0",] +list(eng_punkt_to_cn_dict.values())

In [85]:
print(ALL_LABELS)

['0', '。', '，', '：', '；', '？', '！', '"', '"', "'", "'", '（', '）', '"', '"', '（', '）', '【', '】', '【', '】']


In [86]:
cates = Category(ALL_LABELS)

In [87]:
class PunctDataset(Dataset):
    def __init__(
        self,
        data_dir: Path,
        filelist: List[str],
        num_threads: int = 8,
        length: int = 1000,
        size: int = 540
    ):
        """
        Args:
            - filelist: list of file names
            - The dataset will open ```num_threads``` files, and hold
                in memory simoultaneously.
            - num_threads: number of threads to read files,
            - length: number of sentences per batch
            - size: number of characters per sentence
        """
        self.data_dir = Path(data_dir)
        self.filelist = filelist
        self.num_threads = num_threads
        self.length = length
        # open file strings, index is mod of num_threads
        self.current_files = dict(enumerate([""]*length))
        self.string_index = dict(enumerate([0]*length))
        self.to_open_idx = 0
        self.size = size
        self.get_counter = 0
        self.return_string = False

    def __len__(self):
        return self.length

    def __repr__(self):
        return f"PunctDataset: {len(self)}, on {len(self.filelist)} files"

    def new_file(self, idx_mod):
        filename = self.filelist[self.to_open_idx]
        with open(self.data_dir/filename, "r", encoding="utf-8") as f:
            self.current_files[idx_mod] = f.read()

        self.to_open_idx += 1

        # reset to open article file index
        if self.to_open_idx >= len(self.filelist):
            self.to_open_idx = 0

        # reset string_index within new article file
        self.string_index[idx_mod] = 0

        # if self.to_open_idx % 500 == 0:
        #     print(f"went through files:\t{self.to_open_idx}")

    def __getitem__(self, idx):
        idx_mod = self.get_counter % self. num_threads

        if self.string_index[idx_mod] >= len(self.current_files[idx_mod]):
            self.new_file(idx_mod)
        string_idx = self.string_index[idx_mod]

        # slicing a sentence
        sentence = self.current_files[idx_mod][string_idx:string_idx+self.size]

        # move the string_index within current article file
        self.string_index[idx_mod] += self.size

        # move the get_counter
        self.get_counter += 1
        p_df = punct_ner_pair(sentence)
        return list(p_df.x), list(p_df.y)

    def align_offsets(
        self,
        inputs,
        text_labels: List[List[str]],
        words: List[List[str]]
    ):
        """
        inputs: output if tokenizer
        text_labels: labels in form of list of list of strings
        words: words in form of list of list of strings
        """
        labels = torch.zeros_like(inputs.input_ids).long()
        labels -= 100
        text_lables_array = np.empty(labels.shape, dtype=object)
        words_array = np.empty(labels.shape, dtype=object)
        max_len = inputs.input_ids.shape[1]

        # print("Input_ids: ", inputs.input_ids)

        # print("Max Len: ", max_len)

        # print("Text Labels: ", text_labels)

        for row_id, input_ids in enumerate(inputs.input_ids):
            word_pos = inputs.word_ids(row_id)
            # print("Word Pos: ", word_pos)
            for idx, pos in enumerate(word_pos):
                # print("index: ", idx)
                if pos is None:
                    # print("Pos is None")
                    continue
                labels[row_id, idx] = self.cates.c2i[text_labels[row_id][pos]]
                if self.return_string:
                    text_lables_array[row_id,
                                        idx] = text_labels[row_id][pos]
                    words_array[row_id, idx] = words[row_id][pos]

        inputs['labels'] = labels

        if self.return_string:
            inputs['text_labels'] = text_lables_array.tolist()
            inputs['word'] = words_array.tolist()

        # for input_id in inputs['input_ids']:
        #     print("InPuT_iD: ", input_id)
        #     print("Word: ", self.tokenizer.convert_ids_to_tokens(input_id))
        #     print("Word Pos: ", inputs.word_ids(0))
        # for label in inputs['labels']:
        #     print("Label: ", label)

        return inputs

    def collate_fn(self, data):
        """
        data: list of tuple
        """
        words, text_labels = zip(*data)

        inputs = self.tokenizer(
            list(words),
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=self.max_len,
            is_split_into_words=True,
            return_offsets_mapping=True,
            add_special_tokens=False,
        )

        # print("Original words", words)

        return self.align_offsets(inputs, text_labels, words)

    def dataloaders(self, tokenizer, cates, max_len: int = 512, batch_size: int = 32):
        self.tokenizer = tokenizer
        self.cates = cates
        self.max_len = max_len
        return DataLoader(
            self,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=self.collate_fn,
        )

    def split(self, ratio: float = 0.9, n_splits: int = 5):
      """
      Split the dataset into train and valid using StratifiedKFold
      """

      train_filelist = ["train.txt"]
      valid_filelist = ["valid.txt"]

      train_dataset = PunctDataset(
          self.data_dir,
          train_filelist,
          num_threads=self.num_threads,
          length=int(self.length * ratio),
          size=self.size,
      )
      valid_dataset = PunctDataset(
          self.data_dir,
          valid_filelist,
          num_threads=self.num_threads,
          length=int(self.length * (1 - ratio)),
          size=self.size,
      )
      return train_dataset, valid_dataset

Create dataset object

* Length is the length of the epoch
* Size: is the sequence length
* num_threads: num of files that is opening at the same time

In [88]:
ds = PunctDataset(DATA + '/split', LABELS, num_threads=1, length=7271, size=512)
train_ds, valid_ds = ds.split(0.8)

### lightning data module

In [89]:
class PunctDataModule(pl.LightningDataModule):
    def __init__(self, train_ds, valid_ds, tokenizer, cates,
    max_len=512, batch_size=32):
        super().__init__()
        self.train_ds, self.valid_ds = train_ds, valid_ds
        self.tokenizer = tokenizer
        self.cates = cates
        self.max_len = max_len
        self.batch_size = batch_size

    def split_data(self):

        return train_ds, valid_ds

    def train_dataloader(self):
        return self.train_ds.dataloaders(
            self.tokenizer,
            self.cates,
            self.max_len,
            self.batch_size,
        )

    def val_dataloader(self):
        return self.valid_ds.dataloaders(
            self.tokenizer,
            self.cates,
            self.max_len,
            self.batch_size*4)

## Load Pretrained

In [90]:
tokenizer = AutoTokenizer.from_pretrained("raynardj/classical-chinese-punctuation-guwen-biaodian")



In [91]:
from forgebox.thunder.callbacks import DataFrameMetricsCallback
from forgebox.hf.train import NERModule

In [92]:


# Define BERT-CRF Model using PyTorch Lightning
class BERT_CRF(pl.LightningModule):
    def __init__(self, model, num_labels, learning_rate=1e-5):
        super().__init__()
        self.save_hyperparameters(ignore=['model'])

        self.model = model
        self.crf = CRF(num_labels, batch_first=True)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.model.config.hidden_size, num_labels)
        self.lr = learning_rate

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        sequence_output = output.last_hidden_state
        sequence_output = self.dropout(sequence_output)
        logits = self.fc(sequence_output)
        if labels is not None:
            loss = -self.crf(logits, labels, mask=attention_mask.bool(), reduction='mean')
            return {"loss": loss, "logits": logits}
        else:
            return {"logits": logits}

    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        loss, logits = self(input_ids, attention_mask, labels)
        self.log('train_loss', loss)
        self.log('train_acc', self.crf.accuracy(logits, labels, attention_mask.byte()))
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        loss, logits = self(input_ids, attention_mask, labels)
        self.log('val_loss', loss)
        self.log('val_acc', self.crf.accuracy(logits, labels, attention_mask.byte()))
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
        return optimizer


Load pretrained model with proper num of categories

In [93]:
model = BertForTokenClassification.from_pretrained("raynardj/classical-chinese-punctuation-guwen-biaodian", num_labels=len(cates),)

In [94]:
print(train_ds.length)
print(valid_ds.length)

5816
1454


In [95]:
data_module = PunctDataModule(train_ds, valid_ds, tokenizer, cates,
                              batch_size=32,)

### Run data pipeline

In [96]:
inputs = next(iter(data_module.val_dataloader()))
print(inputs.input_ids)

tensor([[3189, 4212, 7198,  ...,    0,    0,    0],
        [6843, 4242, 1898,  ...,    0,    0,    0],
        [7412, 5782, 2255,  ..., 7491,    0,    0],
        ...,
        [3717,  756, 5682,  ..., 7414,    0,    0],
        [6816,  881,  782,  ...,    0,    0,    0],
        [4080, 3958, 3726,  ...,    0,    0,    0]])


In [97]:
inputs.input_ids.shape

torch.Size([128, 439])

In [98]:
inputs.labels.shape

torch.Size([128, 439])

## NER tranining module

In [99]:
module = NERModule(model)

In [100]:
save_callback = pl.callbacks.ModelCheckpoint(
    dirpath=f"/content/gdrive/MyDrive/Colab/punktuation-ner/ckpoint",
    save_top_k=1,
    verbose=True,
    monitor='val_loss',
    mode='min',
)
df_show = DataFrameMetricsCallback()

Reset the configure_optimizers function

In [101]:
def configure_optimizers(self):
        # discriminative learning rate
    param_groups = [
            {'params': self.model.bert.parameters(), 'lr': 5e-6},
            {'params': self.model.classifier.parameters(), 'lr': 1e-3},
        ]
    optimizer = torch.optim.Adam(param_groups, lr=1e-3)
    return optimizer

NERModule.configure_optimizers = configure_optimizers

In [102]:
import torch
print(torch.cuda.is_available())

True


Trainer

In [103]:
trainer = pl.Trainer(
    accelerator='gpu',
    devices=1,
    max_epochs=30,
    callbacks=[df_show, save_callback],
    )

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [104]:
trainer.fit(module, datamodule=data_module)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params | Mode
------------------------------------------------------------
0 | model | BertForTokenClassification | 101 M  | eval
------------------------------------------------------------
101 M     Trainable params
0         Non-trainable params
101 M     Total params
406.773   Total estimated model params size (MB)
0         Modules in train mode
228       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 0, global step 182: 'val_loss' reached 0.34783 (best 0.34783), saving model to '/content/gdrive/MyDrive/Colab/punktuation-ner/ckpoint/epoch=0-step=182.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 1, global step 364: 'val_loss' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 2, global step 546: 'val_loss' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 3, global step 728: 'val_loss' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 4, global step 910: 'val_loss' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 5, global step 1092: 'val_loss' was not in top 1
INFO:pytorch_lightning.utilities.rank_zero:
Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

## Load the best model

In [105]:
module = NERModule.load_from_checkpoint(save_callback.best_model_path, model=model)

In [106]:
module.model.config.id2label = dict(enumerate(cates.i2c))
module.model.config.label2id = cates.c2i.dict

In [107]:
from transformers import pipeline

In [108]:
module.model = module.model.eval()
module.model = module.model.cpu()

In [None]:
# prompt: Store the model

torch.save(module.model, '/content/gdrive/MyDrive/Colab/punktuation-ner/best_ckpoint/punct_model_2.pth')


In [109]:
ner = pipeline("ner",module.model,tokenizer=tokenizer)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [111]:
def mark_sentence(x: str):
    outputs = ner(x)
    print(outputs)
    print("hello")
    x_list = list(x)
    for i, output in enumerate(outputs):
        x_list.insert(output['end']+i, output['entity'])
    return "".join(x_list)

In [114]:
mark_sentence("前向香江壹帶澄清來活水")

[{'entity': '0', 'score': 0.99145985, 'index': 1, 'word': '前', 'start': 0, 'end': 1}, {'entity': '0', 'score': 0.9987406, 'index': 2, 'word': '向', 'start': 1, 'end': 2}, {'entity': '0', 'score': 0.9999815, 'index': 3, 'word': '香', 'start': 2, 'end': 3}, {'entity': '0', 'score': 0.9998882, 'index': 4, 'word': '江', 'start': 3, 'end': 4}, {'entity': '0', 'score': 0.99993336, 'index': 5, 'word': '壹', 'start': 4, 'end': 5}, {'entity': '，', 'score': 0.543045, 'index': 6, 'word': '帶', 'start': 5, 'end': 6}, {'entity': '0', 'score': 0.99981743, 'index': 7, 'word': '澄', 'start': 6, 'end': 7}, {'entity': '0', 'score': 0.99996185, 'index': 8, 'word': '清', 'start': 7, 'end': 8}, {'entity': '0', 'score': 0.9815839, 'index': 9, 'word': '來', 'start': 8, 'end': 9}, {'entity': '0', 'score': 0.9999523, 'index': 10, 'word': '活', 'start': 9, 'end': 10}, {'entity': '0', 'score': 0.9094957, 'index': 11, 'word': '水', 'start': 10, 'end': 11}]
hello


'前0向0香0江0壹0帶，澄0清0來0活0水0'

In [113]:
mark_sentence("""郡邑置夫子庙于学以嵗时释奠盖自唐贞观以来未之或改我宋有天下因其制而损益之姑苏当浙右要区规模尤大更建炎戎马荡然无遗虽修学宫于荆榛瓦砾之余独殿宇未遑议也每春秋展礼于斋庐已则置不问殆为阙典今寳文阁直学士括苍梁公来牧之明年实绍兴十有一禩也二月上丁修祀既毕乃愓然自咎揖诸生而告之曰天子不以汝嘉为不肖俾再守兹土顾治民事神皆守之职惟是夫子之祀教化所基尤宜严且谨而拜跪荐祭之地卑陋乃尔其何以掲防妥灵汝嘉不敢避其责曩常去此弥年若有所负尚安得以罢輭自恕复累后人乎他日或克就绪愿与诸君落之于是谋之僚吏搜故府得遗材千枚取赢资以给其费鸠工庀役各举其任嵗月讫工民不与知像设礼器百用具修至于堂室廊序门牖垣墙皆一新之""")

[{'entity': '0', 'score': 0.99994385, 'index': 1, 'word': '郡', 'start': 0, 'end': 1}, {'entity': '0', 'score': 0.6686995, 'index': 2, 'word': '邑', 'start': 1, 'end': 2}, {'entity': '0', 'score': 0.9999907, 'index': 3, 'word': '置', 'start': 2, 'end': 3}, {'entity': '0', 'score': 0.99999857, 'index': 4, 'word': '夫', 'start': 3, 'end': 4}, {'entity': '0', 'score': 0.99999785, 'index': 5, 'word': '子', 'start': 4, 'end': 5}, {'entity': '0', 'score': 0.9999579, 'index': 6, 'word': '庙', 'start': 5, 'end': 6}, {'entity': '0', 'score': 0.999998, 'index': 7, 'word': '于', 'start': 6, 'end': 7}, {'entity': '0', 'score': 0.531733, 'index': 8, 'word': '学', 'start': 7, 'end': 8}, {'entity': '0', 'score': 0.99997854, 'index': 9, 'word': '以', 'start': 8, 'end': 9}, {'entity': '0', 'score': 0.9999752, 'index': 10, 'word': '嵗', 'start': 9, 'end': 10}, {'entity': '0', 'score': 0.99998474, 'index': 11, 'word': '时', 'start': 10, 'end': 11}, {'entity': '0', 'score': 0.9999968, 'index': 12, 'word': '释', 'star

'郡0邑0置0夫0子0庙0于0学0以0嵗0时0释0奠，盖0自0唐0贞0观0以0来，未0之0或0改，我0宋0有0天0下0因0其0制0而0损0益0之0姑0苏0当0浙0右0要0区，规0模0尤0大，更0建0炎0戎0马，荡0然0无0遗。虽0修0学0宫0于0荆0榛0瓦0砾0之0余，独0殿0宇0未0遑0议0也。每0春0秋0展0礼0于0斋0庐，已0则0置0不0问，殆0为0阙0典。今0寳0文0阁0直0学0士0括0苍0梁0公0来0牧0之，明0年，实0绍0兴0十0有0一0禩0也。二0月，上0丁0修0祀0既0毕，乃0愓0然0自0咎，揖0诸0生0而0告0之0曰"天0子0不0以0汝0嘉0为0不0肖，俾0再0守0兹0土，顾0治0民0事，神0皆0守0之0职0惟0是0夫0子0之0祀，教0化0所0基，尤0宜0严0且0谨0而0拜0跪0荐0祭0之0地，卑0陋0乃0尔，其0何0以0掲0防0妥0灵？汝0嘉0不0敢0避0其0责0曩0常0去0此0弥0年，若0有0所0负，尚0安0得0以0罢0輭0自0恕，复0累0后0人0乎？他0日0或0克0就0绪，愿0与0诸0君0落0之，于0是0谋0之，僚0吏0搜0故0府0得0遗0材0千0枚，取0赢0资0以0给0其0费，鸠0工0庀0役，各0举0其0任0嵗0月0讫，工0民0不0与0知0像0设0礼0器，百0用0具0修，至0于0堂0室0廊0序。门0牖0垣0墙，皆0一0新0之，'

In [None]:
# import pandas as pd
# df = pd.read_csv('test.txt', sep='\t')

# text_list = df['input'].tolist()[0:]
# predicted = []
# for text in text_list:
#     predicted.append(mark_sentence(text))

# # predicted save to csv
# df2 = pd.DataFrame(predicted)
# # output to csv but ignore header
# df2.to_csv('predicted.txt', index=False, header=False)
# df2 = pd.read_csv('predicted.txt', sep='\t', header=None)
# df2.columns = ['predicted']

# punctuation = ['，', '。', '！', '？', '；', '：', '、', '「', '」', '『', '』', '（', '）', '〔', '〕', '【', '】', '《', '》', '〈', '〉', '﹏', '＿', '～', '—', '…', '‥', '﹑', '﹔', '﹖', '﹪', '﹙', '﹚', '﹛', '﹜', '﹟', '﹠', '﹡', '﹢', '﹣', '﹤', '﹥', '﹦', '﹨', '﹩', '﹪', '﹫', '＃', '＄', '％', '＆', '＊', '＋', '－', '／', '＜', '＝', '＞', '＠', '＾', '＿', '｀', '｜', '～', '∕', '∥']

# predicted_parsed = []
# for text in df2['predicted']:
#     i = 0
#     line = []
#     while i < len(text) - 1:
#         if text[i + 1] in punctuation:
#             line.append(1)
#             i += 1
#         else:
#             line.append(0)
#         i += 1
#     if line[-1] != 1:
#         line.append(1)
#     predicted_parsed.append(' - '.join([str(x) for x in line]))

# df3 = pd.DataFrame(predicted_parsed)
# df3.to_csv('predicted_parsed.txt', index=False, header=False)