In [1]:
import os
import re
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset, ClassLabel

from langchain.document_loaders import PyPDFLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from neo4j import GraphDatabase
from utils import read_json

import torch

In [2]:
def get_whole_pdf(file_data):
    whole_pdf = ""
    for i in range(len(file_data)):
        whole_pdf += file_data[i].page_content

    return whole_pdf

In [29]:
configs = read_json('configs.json')
DATA_PATH = configs["DATA_PATH"]
DB_PATH = configs['DB_PATH']
files_list = [file for file in os.listdir(DATA_PATH) if os.path.isfile(os.path.join(DATA_PATH, file))]

def read_pdf(files_list, data_path=DATA_PATH):
    content_dict = {}

    for file_name in files_list:
        file_path = data_path + file_name
        loader = PyPDFLoader(file_path)
        content = loader.load()
        whole_pdf = ""
        for i in range(len(content)):
            whole_pdf += content[i].page_content

        whole_pdf = re.sub(r'[：。\n]', '', whole_pdf)
        content_dict[file_name] = whole_pdf

    return content_dict

In [30]:
files_list

['34944_高鐵_促參.pdf',
 '38005_核四_政策.pdf',
 '39243_核四_品質.pdf',
 '39477_核四_料件.pdf',
 '46365_高鐵_基金.pdf',
 '48052_大客車_逃生門.pdf',
 '48746_大客車_安全門.pdf',
 '49490_核四_延宕.pdf',
 '49627_核四_停建.pdf',
 '49676_大客車_駕照.pdf',
 '52922_大客車_超時.pdf',
 '54210_高鐵_出資.pdf',
 '54376_高鐵_航發.pdf',
 '54561_高鐵_機電.pdf',
 '58930_核四_封存.pdf']

In [31]:
pdf_contents = read_pdf(files_list)

In [25]:
def content_extraction(input, pattern):

    # Search using the pattern
    match = re.search(pattern, input, re.S)

    if match:
        result = match.group(1).strip()  # Use strip() to remove any leading/trailing whitespace
        print("Extraction succeed.")
        return result
    else:
        print("No match found")
        pass

In [26]:
pattern = r"貳、案   由(.*?)參、事實與理由"

extracted_content = {}
for filename, content in pdf_contents.items():
    extracted_content[filename] = content_extraction(content, pattern)

No match found
Extraction succeed.
Extraction succeed.
Extraction succeed.
Extraction succeed.
Extraction succeed.
Extraction succeed.
Extraction succeed.
Extraction succeed.
Extraction succeed.
Extraction succeed.
Extraction succeed.
Extraction succeed.
Extraction succeed.
Extraction succeed.
Extraction succeed.


In [8]:
extracted_content

{'23382_高鐵_振動.pdf': None,
 '34944_高鐵_促參.pdf': '交通部於 87年間與臺灣高速鐵路公司簽訂「臺灣南北高速鐵路興建營運合約」及「臺灣南北高速鐵路站區開發合約 」，疏未預先審度該公司展延高鐵通車營運時程及遲延受領站區用地所生 損失之責任歸屬，並綢繆約定 相關處罰條款 ，嗣對政府權益保障恝置不察，一再同意該公司展延通車營運時程及受領站區用地， 致政府蒙受回饋金與租金收入減少及顧問費用增加等鉅額損失，均有違失，爰依監察法第 24條規定提案糾正',
 '38005_核四_政策.pdf': '核四封存後每年仍耗費數億元於資產維護管理，行政院及經濟部對外宣告核四重啟不可行，核四興建費用 2,833億元頇列為損失，行政院及 經濟部對核四政策之重大變動，導致資源嚴重浪費； 再者，經濟部宣布能源配比 (燃氣50%、燃煤30%、再生能源20%)之能源轉型政策，未經能源安全、能源經濟及環境影響等完整評估 ，復於再生能源發電量增加有限情況下， 以運轉中核電機組長期停機 方式減核 ，致近年火力發購電量逐年提高， 106年占比達84.4%，燃煤發電增幅 甚至高於燃氣，造成嚴重空氣污染；以及經濟部宣布 新能源政策 之前，並未評估其對電價之影響，迄 106年3月行政院始於國公營企業體檢小組會議評估 等情，均有違失，爰依法提案糾正',
 '39243_核四_品質.pdf': '台電公司 未落實「核四工程品質保證方案」，致龍門電廠試運轉時違規與注意改善事項層出不窮，如 抑壓池灌水作業 不當，致反應器廠房底層淹水 、壓力試驗合格之室內消防栓 系統，其 太平龍頭 竟脫落，致汽機廠房積水等 ，均嚴重衝擊國人對核能安全運轉之信心等情 ，確有諸多違失，爰依法提案糾正',
 '39477_核四_料件.pdf': '台灣電力股份有限公司 (下稱台電公司 )於本院調查核四廠一號機因施工測詴期間設備損壞而移用二號機相關設備之過程，所提供資料內容前後不一，設備組件損壞、採購及修復個數未能確實清查正確，顯見台電公司核四廠之料件管理系統紊亂，且回復本院公文一再發生資料正確性不足，核有怠失；另台電公司於 81年陳報核四興建計畫， 竟以69年所估算成本陳報， 未能如實報告核四建廠成本，致使政府無法確實評估該項投資計畫之成本效益，台電公司表示當時即考慮日後再以追加預算方式提出，此亦導致政 

In [9]:
import re

def extract_label(filename):
    # Assuming the label is always after the first underscore and ends before the second underscore
    match = re.search(r'_(.*?)_', filename)
    if match:
        return match.group(1)  # This extracts text between the first pair of underscores
    return None  # In case the pattern does not match

In [10]:
formatted_data  = [{'label': extract_label(filename), 'text' : content } for filename, content in extracted_content.items()]
formatted_data 

[{'label': '高鐵', 'text': None},
 {'label': '高鐵',
  'text': '交通部於 87年間與臺灣高速鐵路公司簽訂「臺灣南北高速鐵路興建營運合約」及「臺灣南北高速鐵路站區開發合約 」，疏未預先審度該公司展延高鐵通車營運時程及遲延受領站區用地所生 損失之責任歸屬，並綢繆約定 相關處罰條款 ，嗣對政府權益保障恝置不察，一再同意該公司展延通車營運時程及受領站區用地， 致政府蒙受回饋金與租金收入減少及顧問費用增加等鉅額損失，均有違失，爰依監察法第 24條規定提案糾正'},
 {'label': '核四',
  'text': '核四封存後每年仍耗費數億元於資產維護管理，行政院及經濟部對外宣告核四重啟不可行，核四興建費用 2,833億元頇列為損失，行政院及 經濟部對核四政策之重大變動，導致資源嚴重浪費； 再者，經濟部宣布能源配比 (燃氣50%、燃煤30%、再生能源20%)之能源轉型政策，未經能源安全、能源經濟及環境影響等完整評估 ，復於再生能源發電量增加有限情況下， 以運轉中核電機組長期停機 方式減核 ，致近年火力發購電量逐年提高， 106年占比達84.4%，燃煤發電增幅 甚至高於燃氣，造成嚴重空氣污染；以及經濟部宣布 新能源政策 之前，並未評估其對電價之影響，迄 106年3月行政院始於國公營企業體檢小組會議評估 等情，均有違失，爰依法提案糾正'},
 {'label': '核四',
  'text': '台電公司 未落實「核四工程品質保證方案」，致龍門電廠試運轉時違規與注意改善事項層出不窮，如 抑壓池灌水作業 不當，致反應器廠房底層淹水 、壓力試驗合格之室內消防栓 系統，其 太平龍頭 竟脫落，致汽機廠房積水等 ，均嚴重衝擊國人對核能安全運轉之信心等情 ，確有諸多違失，爰依法提案糾正'},
 {'label': '核四',
  'text': '台灣電力股份有限公司 (下稱台電公司 )於本院調查核四廠一號機因施工測詴期間設備損壞而移用二號機相關設備之過程，所提供資料內容前後不一，設備組件損壞、採購及修復個數未能確實清查正確，顯見台電公司核四廠之料件管理系統紊亂，且回復本院公文一再發生資料正確性不足，核有怠失；另台電公司於 81年陳報核四興建計畫， 竟以69年所估算成本陳報， 未能如實報告核四建廠成本，致使政府無法確實評估該項投資計

In [11]:
label = []
for i in range(len(formatted_data)):
    current_label = formatted_data[i]['label']
    if current_label not in label:
        label.append(current_label)
    else:
        continue
label_feature = ClassLabel(names=label)
label_feature

ClassLabel(names=['高鐵', '核四', '大客車'], id=None)

In [12]:
dataset = Dataset.from_dict({
    'text': [item['text'] for item in formatted_data],
    'label': [item['label'] for item in formatted_data]
})


In [13]:
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 16
})

In [14]:
def preprocess_label(example):
    
    example['label'] = [label_feature.str2int(label) for label in example['label']]
    return example

In [15]:
dataset_processed = dataset.map(preprocess_label, batched=True)
dataset_split = dataset_processed.train_test_split(test_size=0.3)

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

In [16]:
dataset_split

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 11
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5
    })
})

## sentence-transformers/all-MiniLM-L6-v2

In [17]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
model_ckpt = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [17]:
sentences = ['前天是禮拜二', '昨天是禮拜二']
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
encoded_input

NameError: name 'tokenizer' is not defined

In [20]:
with torch.inference_mode():
    model_output = model(**encoded_input)

In [21]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [22]:
# Perform pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

In [23]:
cosine_similarity(sentence_embeddings[0].reshape(1, -1), sentence_embeddings[1].reshape(1, -1))

array([[0.92570424]], dtype=float32)

In [24]:
tokenized_dict = {}
for filename, content in extracted_content.items():
    encoded_content = tokenizer(content, padding=True, truncation=True, return_tensors='pt')
    with torch.inference_mode():
        model_output = model(**encoded_content)
    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_content['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    tokenized_dict[filename] = sentence_embeddings

In [25]:
similarity_df = pd.DataFrame(index=tokenized_dict.keys(), columns=tokenized_dict.keys())

In [26]:
# Create an empty DataFrame
similarity_df = pd.DataFrame(index=tokenized_dict.keys(), columns=tokenized_dict.keys())

# Calculate cosine similarity between each pair of vectors
for name1, vec1 in tokenized_dict.items():
    for name2, vec2 in tokenized_dict.items():
        similarity_df.loc[name1, name2] = cosine_similarity(vec1, vec2)[0][0]

In [27]:
similarity_df

Unnamed: 0,34944_高鐵_促參.pdf,38005_核四_政策.pdf,39243_核四_品質.pdf,39477_核四_料件.pdf,46365_高鐵_基金.pdf,48052_大客車_逃生門.pdf,48746_大客車_安全門.pdf,49490_核四_延宕.pdf,49627_核四_停建.pdf,49676_大客車_駕照.pdf,52922_大客車_超時.pdf,54210_高鐵_出資.pdf,54376_高鐵_航發.pdf,54561_高鐵_機電.pdf,58930_核四_封存.pdf
34944_高鐵_促參.pdf,1.0,0.771646,0.715157,0.810293,0.757769,0.724102,0.737007,0.831104,0.843057,0.820516,0.881772,0.808,0.81963,0.696251,0.693054
38005_核四_政策.pdf,0.771646,1.0,0.5546,0.769547,0.676038,0.573962,0.77681,0.841877,0.869432,0.682205,0.759324,0.786793,0.833689,0.545257,0.829629
39243_核四_品質.pdf,0.715157,0.5546,1.0,0.638359,0.575538,0.738709,0.514832,0.668872,0.666733,0.613016,0.701931,0.53878,0.608017,0.680304,0.52084
39477_核四_料件.pdf,0.810293,0.769547,0.638359,1.0,0.569365,0.558339,0.700211,0.87964,0.864525,0.762747,0.870343,0.848424,0.852757,0.476027,0.704625
46365_高鐵_基金.pdf,0.757769,0.676038,0.575538,0.569365,1.0,0.562911,0.567731,0.649495,0.647114,0.580461,0.615131,0.614591,0.653924,0.599685,0.605182
48052_大客車_逃生門.pdf,0.724102,0.573962,0.738709,0.558339,0.562911,1.0,0.66724,0.631306,0.607808,0.745443,0.694181,0.489822,0.543012,0.774908,0.499585
48746_大客車_安全門.pdf,0.737007,0.77681,0.514832,0.700211,0.567731,0.66724,1.0,0.783596,0.730051,0.839712,0.767071,0.732613,0.744173,0.559655,0.742649
49490_核四_延宕.pdf,0.831104,0.841877,0.668872,0.87964,0.649495,0.631306,0.783596,1.0,0.913284,0.808667,0.870392,0.842519,0.865204,0.556715,0.786616
49627_核四_停建.pdf,0.843057,0.869432,0.666733,0.864525,0.647114,0.607808,0.730051,0.913284,1.0,0.740321,0.842977,0.853902,0.885557,0.599092,0.808137
49676_大客車_駕照.pdf,0.820516,0.682205,0.613016,0.762747,0.580461,0.745443,0.839712,0.808667,0.740321,1.0,0.841511,0.707834,0.727746,0.638968,0.687937


## distilbert-base-multilingual-cased

In [18]:
# from transformers import AutoModel
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer


In [19]:
model_ckpt = "distilbert-base-multilingual-cased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = AutoModel.from_pretrained(model_ckpt).to(device)
tokenizer = DistilBertTokenizer.from_pretrained(model_ckpt)
model = DistilBertForSequenceClassification.from_pretrained(model_ckpt, num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
def extract_hidden_states(batch):
    # Place model inputs on the GPU
    inputs = {k:v.to(device) for k,v in batch.items() 
              if k in tokenizer.model_input_names}
    # Extract last hidden states
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
    # Return vector for [CLS] token
    return {"hidden_state": last_hidden_state[:,0].cpu().numpy()}

In [21]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

In [23]:
dataset_split

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 11
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5
    })
})

In [22]:
dataset_encoded = dataset_split.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

ValueError: Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.

In [72]:
dataset_encoded.set_format("torch", 
                            columns=["input_ids",  "label","attention_mask"])

In [73]:
dataset_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 10
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5
    })
})

In [74]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    label = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(label, preds, average="weighted")
    acc = accuracy_score(label, preds)
    return {"accuracy": acc, "f1": f1}

In [75]:
dataset_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 10
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5
    })
})

In [76]:
batch_size = 1
logging_steps = len(dataset_encoded["train"]) // batch_size
model_name = f"{model_ckpt}-finetuned-emotion"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=20,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=False, 
                                  log_level="error")

trainer = Trainer(model=model, args=training_args, 
                  compute_metrics=compute_metrics,
                  train_dataset=dataset_encoded["train"],
                  eval_dataset=dataset_encoded["test"],
                  tokenizer=tokenizer)

# Now you can train the model
trainer.train()


  0%|          | 0/200 [00:00<?, ?it/s]

{'loss': 1.1103, 'grad_norm': 8.735570907592773, 'learning_rate': 1.9e-05, 'epoch': 1.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.0666906833648682, 'eval_accuracy': 0.4, 'eval_f1': 0.22857142857142856, 'eval_runtime': 1.3192, 'eval_samples_per_second': 3.79, 'eval_steps_per_second': 3.79, 'epoch': 1.0}
{'loss': 1.1276, 'grad_norm': 8.09397029876709, 'learning_rate': 1.8e-05, 'epoch': 2.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.074120283126831, 'eval_accuracy': 0.4, 'eval_f1': 0.22857142857142856, 'eval_runtime': 1.6209, 'eval_samples_per_second': 3.085, 'eval_steps_per_second': 3.085, 'epoch': 2.0}
{'loss': 1.1235, 'grad_norm': 7.93947172164917, 'learning_rate': 1.7e-05, 'epoch': 3.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.064603567123413, 'eval_accuracy': 0.4, 'eval_f1': 0.22857142857142856, 'eval_runtime': 1.5624, 'eval_samples_per_second': 3.2, 'eval_steps_per_second': 3.2, 'epoch': 3.0}
{'loss': 1.0609, 'grad_norm': 6.224499702453613, 'learning_rate': 1.6000000000000003e-05, 'epoch': 4.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.0581352710723877, 'eval_accuracy': 0.4, 'eval_f1': 0.22857142857142856, 'eval_runtime': 1.5601, 'eval_samples_per_second': 3.205, 'eval_steps_per_second': 3.205, 'epoch': 4.0}
{'loss': 1.0832, 'grad_norm': 8.894120216369629, 'learning_rate': 1.5000000000000002e-05, 'epoch': 5.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.017627477645874, 'eval_accuracy': 0.4, 'eval_f1': 0.22857142857142856, 'eval_runtime': 1.6181, 'eval_samples_per_second': 3.09, 'eval_steps_per_second': 3.09, 'epoch': 5.0}
{'loss': 0.9636, 'grad_norm': 6.795644760131836, 'learning_rate': 1.4e-05, 'epoch': 6.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.940122127532959, 'eval_accuracy': 0.4, 'eval_f1': 0.22857142857142856, 'eval_runtime': 1.5618, 'eval_samples_per_second': 3.201, 'eval_steps_per_second': 3.201, 'epoch': 6.0}
{'loss': 0.8234, 'grad_norm': 9.339310646057129, 'learning_rate': 1.3000000000000001e-05, 'epoch': 7.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.811183750629425, 'eval_accuracy': 0.8, 'eval_f1': 0.7866666666666667, 'eval_runtime': 1.5502, 'eval_samples_per_second': 3.225, 'eval_steps_per_second': 3.225, 'epoch': 7.0}
{'loss': 0.6604, 'grad_norm': 4.71323823928833, 'learning_rate': 1.2e-05, 'epoch': 8.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.6383451223373413, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 1.5553, 'eval_samples_per_second': 3.215, 'eval_steps_per_second': 3.215, 'epoch': 8.0}
{'loss': 0.5119, 'grad_norm': 9.498541831970215, 'learning_rate': 1.1000000000000001e-05, 'epoch': 9.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.5073884129524231, 'eval_accuracy': 0.8, 'eval_f1': 0.7866666666666667, 'eval_runtime': 1.5824, 'eval_samples_per_second': 3.16, 'eval_steps_per_second': 3.16, 'epoch': 9.0}
{'loss': 0.3568, 'grad_norm': 7.109187126159668, 'learning_rate': 1e-05, 'epoch': 10.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.36598625779151917, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 2.1129, 'eval_samples_per_second': 2.366, 'eval_steps_per_second': 2.366, 'epoch': 10.0}
{'loss': 0.239, 'grad_norm': 9.488040924072266, 'learning_rate': 9e-06, 'epoch': 11.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.3182787299156189, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 1.6265, 'eval_samples_per_second': 3.074, 'eval_steps_per_second': 3.074, 'epoch': 11.0}
{'loss': 0.1794, 'grad_norm': 1.4582124948501587, 'learning_rate': 8.000000000000001e-06, 'epoch': 12.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.17570318281650543, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 1.572, 'eval_samples_per_second': 3.181, 'eval_steps_per_second': 3.181, 'epoch': 12.0}
{'loss': 0.136, 'grad_norm': 3.279721975326538, 'learning_rate': 7e-06, 'epoch': 13.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.14755988121032715, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 1.5824, 'eval_samples_per_second': 3.16, 'eval_steps_per_second': 3.16, 'epoch': 13.0}
{'loss': 0.1012, 'grad_norm': 0.5643743872642517, 'learning_rate': 6e-06, 'epoch': 14.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.11887383460998535, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 2.0641, 'eval_samples_per_second': 2.422, 'eval_steps_per_second': 2.422, 'epoch': 14.0}
{'loss': 0.0778, 'grad_norm': 2.0376741886138916, 'learning_rate': 5e-06, 'epoch': 15.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.09653078019618988, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 1.6067, 'eval_samples_per_second': 3.112, 'eval_steps_per_second': 3.112, 'epoch': 15.0}
{'loss': 0.0729, 'grad_norm': 2.0203943252563477, 'learning_rate': 4.000000000000001e-06, 'epoch': 16.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.08449219167232513, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 1.6035, 'eval_samples_per_second': 3.118, 'eval_steps_per_second': 3.118, 'epoch': 16.0}
{'loss': 0.0552, 'grad_norm': 1.6369279623031616, 'learning_rate': 3e-06, 'epoch': 17.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.07239826768636703, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 1.5875, 'eval_samples_per_second': 3.15, 'eval_steps_per_second': 3.15, 'epoch': 17.0}
{'loss': 0.0519, 'grad_norm': 1.3551231622695923, 'learning_rate': 2.0000000000000003e-06, 'epoch': 18.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.06682656705379486, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 1.5691, 'eval_samples_per_second': 3.187, 'eval_steps_per_second': 3.187, 'epoch': 18.0}
{'loss': 0.0472, 'grad_norm': 0.390796959400177, 'learning_rate': 1.0000000000000002e-06, 'epoch': 19.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.06221116706728935, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 1.535, 'eval_samples_per_second': 3.257, 'eval_steps_per_second': 3.257, 'epoch': 19.0}
{'loss': 0.0552, 'grad_norm': 0.6928702592849731, 'learning_rate': 0.0, 'epoch': 20.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.06124318763613701, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 1.5904, 'eval_samples_per_second': 3.144, 'eval_steps_per_second': 3.144, 'epoch': 20.0}
{'train_runtime': 300.269, 'train_samples_per_second': 0.666, 'train_steps_per_second': 0.666, 'train_loss': 0.49187826544046404, 'epoch': 20.0}


TrainOutput(global_step=200, training_loss=0.49187826544046404, metrics={'train_runtime': 300.269, 'train_samples_per_second': 0.666, 'train_steps_per_second': 0.666, 'total_flos': 17593640136000.0, 'train_loss': 0.49187826544046404, 'epoch': 20.0})

In [79]:
trainer.save_model("./models/DistilBertForSequenceClassification_finetune")
tokenizer.save_pretrained("./models/DistilBertForSequenceClassification_finetune")

('./models/DistilBertForSequenceClassification_finetune\\tokenizer_config.json',
 './models/DistilBertForSequenceClassification_finetune\\special_tokens_map.json',
 './models/DistilBertForSequenceClassification_finetune\\vocab.txt',
 './models/DistilBertForSequenceClassification_finetune\\added_tokens.json')

In [80]:
# Load the model and tokenizer for classification
model_name = "./models/DistilBertForSequenceClassification_finetune"
model = DistilBertForSequenceClassification.from_pretrained(model_name)
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model.eval()


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [81]:
def get_embeddings(texts, model, tokenizer):
    # Tokenize the input texts
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    # Pass inputs through the model to get hidden states
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    # Get the embeddings from the last hidden state
    hidden_states = outputs.hidden_states[-1]
    # Average the hidden states to get a single vector representation for each text
    embeddings = hidden_states.mean(dim=1)
    return embeddings


In [82]:
# Extract texts from the dictionary
texts = list(extracted_content.values())

# Get embeddings for all documents
embeddings = get_embeddings(texts, model, tokenizer)

# Compute cosine similarity between all pairs of embeddings
cosine_similarities = cosine_similarity(embeddings)

# Convert cosine similarity matrix to a DataFrame
doc_keys = list(extracted_content.keys())
cosine_sim_df = pd.DataFrame(cosine_similarities, index=doc_keys, columns=doc_keys)


In [84]:
cosine_sim_df

Unnamed: 0,34944_高鐵_促參.pdf,38005_核四_政策.pdf,39243_核四_品質.pdf,39477_核四_料件.pdf,46365_高鐵_基金.pdf,48052_大客車_逃生門.pdf,48746_大客車_安全門.pdf,49490_核四_延宕.pdf,49627_核四_停建.pdf,49676_大客車_駕照.pdf,52922_大客車_超時.pdf,54210_高鐵_出資.pdf,54376_高鐵_航發.pdf,54561_高鐵_機電.pdf,58930_核四_封存.pdf
34944_高鐵_促參.pdf,1.0,0.417318,0.307828,0.29811,0.904093,0.42061,0.391825,0.355231,0.314579,0.470008,0.400194,0.84187,0.917164,0.920596,0.302345
38005_核四_政策.pdf,0.417318,1.0,0.930923,0.934776,0.378236,0.333038,0.340692,0.934884,0.93514,0.323404,0.319154,0.62576,0.429262,0.387953,0.947258
39243_核四_品質.pdf,0.307828,0.930923,1.0,0.970819,0.266039,0.297337,0.296697,0.96385,0.964964,0.265987,0.279806,0.515793,0.310987,0.291609,0.959369
39477_核四_料件.pdf,0.29811,0.934776,0.970819,1.0,0.25055,0.255173,0.261974,0.973752,0.990394,0.236424,0.245618,0.511841,0.312808,0.280154,0.980898
46365_高鐵_基金.pdf,0.904093,0.378236,0.266039,0.25055,1.0,0.276737,0.240118,0.304128,0.265703,0.302538,0.247362,0.756903,0.940067,0.927004,0.268291
48052_大客車_逃生門.pdf,0.42061,0.333038,0.297337,0.255173,0.276737,1.0,0.957067,0.299197,0.263307,0.946254,0.959144,0.499668,0.286807,0.311274,0.25403
48746_大客車_安全門.pdf,0.391825,0.340692,0.296697,0.261974,0.240118,0.957067,1.0,0.310364,0.272421,0.956717,0.973044,0.511702,0.262522,0.269896,0.259872
49490_核四_延宕.pdf,0.355231,0.934884,0.96385,0.973752,0.304128,0.299197,0.310364,1.0,0.983293,0.288714,0.292697,0.576746,0.363071,0.322874,0.959255
49627_核四_停建.pdf,0.314579,0.93514,0.964964,0.990394,0.265703,0.263307,0.272421,0.983293,1.0,0.248927,0.255632,0.525015,0.330533,0.293379,0.981257
49676_大客車_駕照.pdf,0.470008,0.323404,0.265987,0.236424,0.302538,0.946254,0.956717,0.288714,0.248927,1.0,0.96347,0.548796,0.333955,0.331156,0.233175


In [90]:
cosine_sim_df['52922_大客車_超時.pdf'].sort_values(ascending=False)

52922_大客車_超時.pdf     0.999999
48746_大客車_安全門.pdf    0.973044
49676_大客車_駕照.pdf     0.963470
48052_大客車_逃生門.pdf    0.959144
54210_高鐵_出資.pdf      0.500650
34944_高鐵_促參.pdf      0.400194
38005_核四_政策.pdf      0.319154
49490_核四_延宕.pdf      0.292697
39243_核四_品質.pdf      0.279806
54561_高鐵_機電.pdf      0.279084
54376_高鐵_航發.pdf      0.263396
49627_核四_停建.pdf      0.255632
46365_高鐵_基金.pdf      0.247362
39477_核四_料件.pdf      0.245618
58930_核四_封存.pdf      0.242160
Name: 52922_大客車_超時.pdf, dtype: float32

In [88]:
cosine_sim_df.to_csv(f'cos_similarity_{model_ckpt}.csv', encoding='big5')

## paraphrase-MiniLM-L6-v2

In [49]:
from transformers import AutoTokenizer, AutoModel
import torch


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Sentences we want sentence embeddings for
sentences = list(extracted_content.values())

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-MiniLM-L6-v2')

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, max pooling.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

print("Sentence embeddings:")
print(sentence_embeddings)

Sentence embeddings:
tensor([[-0.2189,  0.8674,  0.4606,  ..., -0.6580, -0.4033,  0.6827],
        [ 0.0677,  0.7340,  0.7409,  ..., -0.4495, -0.2585,  0.5965],
        [-0.5176,  0.7155,  0.3799,  ..., -0.7914, -0.3436,  0.7034],
        ...,
        [-0.0959,  0.8173,  0.6737,  ..., -0.6921, -0.6189,  0.6678],
        [-0.2214,  0.6630,  0.3330,  ..., -0.8624,  0.2184,  0.6604],
        [ 0.0169,  0.9363,  0.4728,  ..., -0.5107, -0.2639,  0.6777]])


In [None]:
cosine_similarities = cosine_similarity(sentence_embeddings)

In [None]:
doc_keys = list(extracted_content.keys())
cosine_sim_df = pd.DataFrame(cosine_similarities, index=doc_keys, columns=doc_keys)

In [None]:
cosine_sim_df['48052_大客車_逃生門.pdf'].sort_values(ascending=False)

48052_大客車_逃生門.pdf    1.000000
39243_核四_品質.pdf      0.949153
54561_高鐵_機電.pdf      0.939176
46365_高鐵_基金.pdf      0.906380
34944_高鐵_促參.pdf      0.888729
52922_大客車_超時.pdf     0.878784
39477_核四_料件.pdf      0.874396
49676_大客車_駕照.pdf     0.863293
54376_高鐵_航發.pdf      0.849896
49627_核四_停建.pdf      0.848142
54210_高鐵_出資.pdf      0.836646
49490_核四_延宕.pdf      0.791983
58930_核四_封存.pdf      0.791870
38005_核四_政策.pdf      0.789064
48746_大客車_安全門.pdf    0.784512
Name: 48052_大客車_逃生門.pdf, dtype: float32

In [None]:
extracted_content

{'34944_高鐵_促參.pdf': '交通部於 87年間與臺灣高速鐵路公司簽訂「臺灣南北高速鐵路興建營運合約」及「臺灣南北高速鐵路站區開發合約 」，疏未預先審度該公司展延高鐵通車營運時程及遲延受領站區用地所生 損失之責任歸屬，並綢繆約定 相關處罰條款 ，嗣對政府權益保障恝置不察，一再同意該公司展延通車營運時程及受領站區用地， 致政府蒙受回饋金與租金收入減少及顧問費用增加等鉅額損失，均有違失，爰依監察法第 24條規定提案糾正',
 '38005_核四_政策.pdf': '核四封存後每年仍耗費數億元於資產維護管理，行政院及經濟部對外宣告核四重啟不可行，核四興建費用 2,833億元頇列為損失，行政院及 經濟部對核四政策之重大變動，導致資源嚴重浪費； 再者，經濟部宣布能源配比 (燃氣50%、燃煤30%、再生能源20%)之能源轉型政策，未經能源安全、能源經濟及環境影響等完整評估 ，復於再生能源發電量增加有限情況下， 以運轉中核電機組長期停機 方式減核 ，致近年火力發購電量逐年提高， 106年占比達84.4%，燃煤發電增幅 甚至高於燃氣，造成嚴重空氣污染；以及經濟部宣布 新能源政策 之前，並未評估其對電價之影響，迄 106年3月行政院始於國公營企業體檢小組會議評估 等情，均有違失，爰依法提案糾正',
 '39243_核四_品質.pdf': '台電公司 未落實「核四工程品質保證方案」，致龍門電廠試運轉時違規與注意改善事項層出不窮，如 抑壓池灌水作業 不當，致反應器廠房底層淹水 、壓力試驗合格之室內消防栓 系統，其 太平龍頭 竟脫落，致汽機廠房積水等 ，均嚴重衝擊國人對核能安全運轉之信心等情 ，確有諸多違失，爰依法提案糾正',
 '39477_核四_料件.pdf': '台灣電力股份有限公司 (下稱台電公司 )於本院調查核四廠一號機因施工測詴期間設備損壞而移用二號機相關設備之過程，所提供資料內容前後不一，設備組件損壞、採購及修復個數未能確實清查正確，顯見台電公司核四廠之料件管理系統紊亂，且回復本院公文一再發生資料正確性不足，核有怠失；另台電公司於 81年陳報核四興建計畫， 竟以69年所估算成本陳報， 未能如實報告核四建廠成本，致使政府無法確實評估該項投資計畫之成本效益，台電公司表示當時即考慮日後再以追加預算方式提出，此亦導致政 府長年來不得不對核四預算持續加碼，形成台電公司及國家