In [1]:
from tqdm import tqdm
import codecs
import math
import numpy as np
import os
import pandas as pd
import re

print(os.getcwd())

D:\Windows_Storage\Storage\Github\KnowledgeGraph\scripts


# Some Reference

* Regular Expression Tester: https://regex101.com/
* Some Re Tutorials: https://medium.com/factory-mind/regex-tutorial-a-simple-cheatsheet-by-examples-649dc1c3f285

# Processing Steps
* Divide paragraphs into sentences by Chinese punctuations (“，”, “。”, “、”, “？”, “?”).
* Segment sentences into tokens with relative dependencies and POSs (Part of Speech).
* Remove tokens with certain type of dependency and POS (i.e. PUNCT, x, PARENTHESISCATEGORY)
* Remove unnecessary tokens
  * punctuation
  * word between two bracket
* Extract possible tokens as candidates of entities and relations by its dependency and POS category,
  * with regular expression (Entity, POS, dependencies): “^[V]”, “^[N]”,
  * with certain dependencies and POS type,
  * with given entity dictionaries (entity, POS, dependencies),
  * with certain conjuction chars (["的", "、", "之", "及", "與"]) ,
* Clean up candidate list again by removing unnecessary tokens (conjunction characters). 
  * 晶圓的 -> 晶圓, 的製程 -> 製程, issues caused by adjectives.
* Remove stopwords
* Remove entity and relation with only one and more than 10 chars
* Caculate TDiDF and sort

<br>

#### Optional
* Conclude frequency of occurrence of each node/edge candidate, keeping the first 20 ranks only.

# Configurations
### Extract possible tokens as candidates of entities and relations by its dependency and POS category

In [2]:
# paths
token_path = "../results/210330_result/210330_dataset_monpaResult.csv"
entity_saving_path = "../results/210408_result/210408_dataset_entity_result_MONPA.csv"
relation_saving_path = "../results/210408_result/210408_dataset_relation_result_MONPA.csv"
main_result_saving_path = "../results/210408_result/210408_dataset_main_result_MONPA.csv"
filtered_result_path = "../results/210408_result/210408_dataset_filtered_result_MONPA.csv"

# token_path = "../results/210330_result/210330_dataset_spaCyResult.csv"
# entity_saving_path = "../results/210330_result/210330_dataset_entity_result_spaCy.csv"
# relation_saving_path = "../results/210330_result/210330_dataset_relation_result_spaCy.csv"
# main_result_saving_path = "../results/210330_result/210330_dataset_main_result_spaCy.csv"
# filtered_result_path = "../results/210408_result/210408_dataset_filtered_result_spaCy.csv"

In [3]:
# MONPA
relation_dependencies_possible_list = []
relation_pos_possible_list = ["VH", "VC", "VJ", "VA"]
relation_pos_re = "^[V]"
entity_dependencies_possible_list = []
entity_pos_possible_list = ["Na", "Nv", "Neu", "Nes", "Nf", "Ng", "Nh", "Neqa", "Nep", "Ncd", "FW", "DE"]
entity_pos_re = "^[N]"
splitter_pos_list = ["COMMACATEGORY", "PERIODCATEGORY"]

# spaCy
# relation_dependencies_possible_list = ["ROOT", "nmod:prep", "prep", "agent", ]
# relation_pos_possible_list = ["VERB"]
# relation_pos_re = "^[V]"
# entity_dependencies_possible_list = ["compound:nn", "nsubj", "dep", "dobj"]
# entity_pos_possible_list = ["NOUN", "PROPN"]
# entity_pos_re = "^[N]"
# punct_pos_list = ["PUNCT"]

# Common, usually entity
sentences_splitter = ["，", "。", "！", "!", "？", "?", "；", ";", "."] + ["\r\n" * i for i in range(0, 100)]
bracket_entity_list_first = ["(", "（", "[", "［", "{", "｛", "<", "＜", "〔", "【", "〖", "《", "〈"]
bracket_entity_list_last = [")", "）", "]", "］", "}", "｝", ">", "＞", "〕", "】", "〗", "》", "〉"]
punct_entity_list = [" " * i for i in range(0, 100)]
conjuction_entity_list = ["的", "、", "之", "及", "與"]
not_entity_relation_list = ["的", "、", "之", "及", "與", "\r\n \r\n ", "\r\n \r\n  "] +\
[" " * i for i in range(0, 100)] + ["\n" * i for i in range(0, 100)]

# Read in Tokens (Preparation beforehand)

In [4]:
# read in tokens

dataToken = pd.read_csv(token_path, encoding="utf8")

# append a column for data if dependencies column does not exist (monpa case)
if len(dataToken.columns) < 3:
    dataToken.loc[:, "Dependecies"] = np.nan

print(len(dataToken))
print(dataToken.head(5))

114798
  Segmented Element                  POS  Dependecies
0            安森美半導體                  NaN          NaN
1                 （  PARENTHESISCATEGORY          NaN
2  ON Semiconductor                   FW          NaN
3                 ）  PARENTHESISCATEGORY          NaN
4                 ，        COMMACATEGORY          NaN


# Divide by Chinese Seperators
# Segment sentences into tokens with relative dependencies and POSs (Part of Speech).
# Remove tokens with certain type of dependency and POS (i.e. PUNCT, x, PARENTHESISCATEGORY)
# Remove unnecessary tokens

In [5]:
# devide token by seperators
total_entity_list = []
sentence_entity_list = []
total_label_list = []
sentence_label_list = []
total_dependencies_list = []
sentence_dependencies_list = []


# devide tokens by chinese punctuations
for index, entityElement in enumerate(dataToken.iloc[:, 0]):
    if entityElement not in sentences_splitter:
        sentence_entity_list.append(entityElement)
        sentence_label_list.append(dataToken.iloc[index, 1])
        sentence_dependencies_list.append(dataToken.iloc[index, 2])
    else:
        total_entity_list.append(sentence_entity_list)
        total_label_list.append(sentence_label_list)
        total_dependencies_list.append(sentence_dependencies_list)
        sentence_entity_list = []
        sentence_label_list = []
        sentence_dependencies_list = []
    
print("list length (Sentences): ", len(total_entity_list), len(total_label_list), len(total_dependencies_list))


# Clean up tokens and eliminate unneccesary tokens
for sentenceIndex, sentences in enumerate(total_entity_list):
    for tokenIndex, tokens in enumerate(sentences):
        # delete elements that's between two bracket
        if sentences[tokenIndex] in bracket_entity_list_first:
            left_bracket_index = bracket_entity_list_first.index(sentences[tokenIndex])
            findingLimit =  (tokenIndex+11) if (tokenIndex+11) <= (len(sentences)) else len(sentences)
            for findingLeftIndex in range(tokenIndex+1, findingLimit):
                if sentences[findingLeftIndex] == bracket_entity_list_last[left_bracket_index]:
                    for removalIndex in range(tokenIndex, findingLeftIndex+1):
#                         print(total_entity_list[sentenceIndex][removalIndex])
                        total_entity_list[sentenceIndex][removalIndex] = ""
                        total_label_list[sentenceIndex][removalIndex]= ""
                        total_dependencies_list[sentenceIndex][removalIndex] = ""
                    break
        elif sentences[tokenIndex] in punct_entity_list or sentences[tokenIndex] in sentences_splitter:
            # set token that fit certain POS type to ""
            total_entity_list[sentenceIndex][tokenIndex] = ""
            
    total_entity_list[sentenceIndex] = list(filter(("").__ne__, total_entity_list[sentenceIndex]))
    total_label_list[sentenceIndex]  = list(filter(("").__ne__, total_label_list[sentenceIndex]))
    total_dependencies_list[sentenceIndex] = list(filter(("").__ne__, total_dependencies_list[sentenceIndex]))
                
print(total_entity_list[0] , total_label_list[0], total_dependencies_list[0])
print("list length: ", len(total_entity_list), len(total_label_list), len(total_dependencies_list))

list length (Sentences):  7865 7865 7865
['安森美半導體'] [nan] [nan]
list length:  7865 7865 7865


# Extract possible tokens as candidates of entities and relations by its dependency and POS category.
#  Clean up candidate list again by removing unnecessary tokens (conjunction characters). 

In [6]:
# get entity pairs and relations

# get relations
all_relations_list = []
all_relations_index = []
all_element_flatten = []
all_relation_flatten = []

for sentenceIndex, sentences in enumerate(total_entity_list):
    relation_list = []
    relation_index_list = []
    for tokenIndex, token in enumerate(sentences):
        if len(sentences) > 1: 
            if total_dependencies_list[sentenceIndex][tokenIndex] in relation_dependencies_possible_list or\
            total_label_list[sentenceIndex][tokenIndex] in relation_pos_possible_list or\
            re.match(relation_pos_re, str(total_label_list[sentenceIndex][tokenIndex]), flags=re.IGNORECASE) != None:
                relation_list.append(token)
                relation_index_list.append(tokenIndex)
    all_relations_list.append(relation_list)
    all_relations_index.append(relation_index_list)


# get entities
all_reformatted_entities = []
all_reformatted_index_list = []
for sentenceIndex, sentences in enumerate(total_entity_list):
    entity_index_list = []
    possible_entities = []
    reformatted_entities = []
    reformatted_index_list = []
    for tokenIndex, token in enumerate(sentences):
        if len(sentences) > 1:
            if total_dependencies_list[sentenceIndex][tokenIndex] in entity_dependencies_possible_list or\
            total_label_list[sentenceIndex][tokenIndex] in entity_pos_possible_list or\
            re.match(entity_pos_re, str(total_label_list[sentenceIndex][tokenIndex]), flags=re.IGNORECASE) != None or\
            token in conjuction_entity_list:
                entity_index_list.append(tokenIndex)
                possible_entities.append(token)
    # combine token if situate next to each other
#     print(entity_index_list, possible_entities)
    if len(possible_entities) > 0:
        combine_entity_name = possible_entities[0]
        for possibleIndex, possibleElement in enumerate(entity_index_list):
#             print(possible_entities)
            
            isContinuous = False
            if possibleIndex != 0:
                if possibleElement == entity_index_list[possibleIndex - 1] + 1:
                    isContinuous = True
                    combine_entity_name += possible_entities[possibleIndex]
                else:
                    isContinuous = False
            
                if isContinuous == False:
                    reformatted_entities.append(combine_entity_name)
                    reformatted_index_list.append(entity_index_list[possibleIndex-1])
                    combine_entity_name = possible_entities[possibleIndex]
                    
            if possibleIndex == (len(entity_index_list) - 1):
                reformatted_entities.append(combine_entity_name)
                reformatted_index_list.append(possibleElement)
                
#             print(combine_entity_name)     
#     print(reformatted_entities)
    all_reformatted_entities.append(reformatted_entities)
    all_reformatted_index_list.append(reformatted_index_list)
    
# flatten section
# remove unneccessary conjuction words
for index, element in enumerate(all_reformatted_entities):
    
    for elementIndex, elementSingle in enumerate(element):
        # remove single conjuction char in first or last position
        if elementSingle[0] in conjuction_entity_list:
            element[elementIndex] = elementSingle[1:]
        elif elementSingle[-1] in conjuction_entity_list:
            element[elementIndex] = elementSingle[:-1]
            
        all_element_flatten.append(element[elementIndex])
        
    for relationIndex, relationSingle in enumerate(all_relations_list[index]):
        # remove single conjuction char in first or last position
        if relationSingle[0] in conjuction_entity_list:
            all_relations_list[index][relationIndex] = relationSingle[1:]
        elif relationSingle[-1] in conjuction_entity_list:
            all_relations_list[index][relationIndex] = relationSingle[:-1]
            
        all_relation_flatten.append(all_relations_list[index][relationIndex])
                      
        
# print(all_relations_list, len(all_relations_list), "\n")
# print(all_reformatted_entities, len(all_reformatted_entities), )
print(len(all_relations_list), len(all_relations_index), len(all_reformatted_entities), len(all_reformatted_index_list))

# print out parsing result
for index, element in enumerate(all_reformatted_entities[:15]):
    print(element, all_reformatted_index_list[index], all_relations_list[index], all_relations_index[index])
    
data_result = pd.DataFrame({
    "Entity":all_reformatted_entities,
    "EntityIndex":all_reformatted_index_list,
    "Relation":all_relations_list,
    "RelationIndex":all_relations_index
})

7865 7865 7865 7865
[] [] [] []
['今天', 'RDM系列矽光電倍增管陣列'] [0, 7] ['發表', '新'] [1, 2]
['光學雷達感測器能力擴展到其', ''] [6, 8] ['廣泛', '智慧感測方案'] [7, 9]
['ArrayRDM-0112A20-QFN', '市場上首款', '車規的SiPM產品'] [0, 5, 10] ['符合'] [6]
['汽車產業及其他領域LiDAR應用中', '需求'] [9, 13] ['滿足', '增長'] [1, 11]
['ArrayRDM-0112A20-QFN', '單片1×12SiPM像素陣列'] [0, 8] [] []
['安森美半導體', '市場的RDM製程'] [1, 5] ['領先'] [2]
['外 光的高'] [8] ['實現', '近', '紅', '靈敏度'] [1, 3, 4, 9]
['905奈米', '業界的18.5％的光子偵測效率'] [3, 14] ['達到', '領先'] [5, 6]
['SiPM', '內部增益', '其靈敏度', '光子水準'] [1, 4, 7, 12] ['高', '使', '達到'] [2, 5, 9]
['功能', 'PDE結合使用'] [2, 6] ['高'] [3]
['', '信號'] [4, 6] ['檢測', '微弱', '返回'] [1, 3, 5]
[] [] [] []
['反射目標'] [4] ['低'] [2]
['距離'] [6] ['偵測到', '遠'] [2, 4]


In [7]:
# conculde dataframe with all tokens
        
for notIndex, notElement in enumerate(not_entity_relation_list):
    all_element_flatten = list(filter((notElement).__ne__, all_element_flatten))
    all_relation_flatten = list(filter((notElement).__ne__, all_relation_flatten))
        
data_entity = pd.DataFrame({
    "Element":all_element_flatten,
})

data_relation = pd.DataFrame({
    "Element":all_relation_flatten,
})

# Remove stopwords
# Remove entity and relation with only one char and too many chars.

In [8]:
# import stopwords list

stopword_list = []

for fileIndex, fileElement in enumerate(os.listdir("./stopwords/")):
    if fileElement[-2:] != "md":
        data_temp = pd.read_csv("./stopwords/" + fileElement, encoding="utf8", sep="@#$%&*")
        stopword_list += data_temp.iloc[:, 0].tolist()
        
stopword_list = list(dict.fromkeys(stopword_list))
        
print("停用詞數目：", len(stopword_list), "\n\n", stopword_list[-50:])     

停用詞數目： 3249 

 ['where', 'whereafter', 'whereas', 'whereby', 'wherein', "where's", 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', "who's", 'whose', 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'wonder', "won't", 'would', "wouldn't", 'yes', 'yet', 'you', "you'd", "you'll", 'your', "you're", 'yours', 'yourself', 'yourselves', "you've", 'zero', 'zt', 'ZT', 'zz', 'ZZ', '準備', '覆雜', '心裏', '註意', '裏面']


  data_temp = pd.read_csv("./stopwords/" + fileElement, encoding="utf8", sep="@#$%&*")


In [9]:
# remove stop words and entity & relation that only occur once

data_list = [data_entity, data_relation]

for dataIndex, dataElement in enumerate(data_list):
    drop_index = []
    for rowIndex, rowElement in enumerate(dataElement.iloc[:, 0]):
        if rowElement in stopword_list or len(str(rowElement)) <= 1 or len(str(rowElement)) >= 8:
            drop_index.append(rowIndex)
    # drop entity/relations that are inside stopword list
    print("Original Length:", len(dataElement))
    dataElement.drop(index=drop_index, inplace=True)
    print("After Removal:", len(dataElement))
    

Original Length: 18571
After Removal: 12744
Original Length: 14218
After Removal: 8098


# Caculate TDiDF and sort

In [10]:
# conculde a list with frequency

def ratiolize(x):
    x = str(np.around(x * 100, decimals=2)) + "%"
    return x

data_entity_value_count = data_entity.value_counts(ascending=False).to_frame()
data_entity_value_count.reset_index(inplace=True)
data_entity_value_count = data_entity_value_count.rename(columns = {'index':'Entity'})
data_entity_value_count = data_entity_value_count.rename(columns = {0:'Count'})
data_entity_value_count.loc[:, "Ratio"] = np.around(data_entity_value_count.iloc[:, 1] / len(data_entity_value_count),
                                                    decimals=4)
# data_entity_value_count.loc[:, "Ratio"] = data_entity_value_count.loc[:, "Ratio"].apply(ratiolize)

data_relation_value_count = data_relation.value_counts(ascending=False).to_frame()
data_relation_value_count.reset_index(inplace=True)
data_relation_value_count = data_relation_value_count.rename(columns = {'index':'Relation'})
data_relation_value_count = data_relation_value_count.rename(columns = {0:'Count'})
data_relation_value_count.loc[:, "Ratio"] = np.around(data_relation_value_count.iloc[:, 1] / len(data_relation_value_count),
                                                      decimals=4)
# data_relation_value_count.loc[:, "Ratio"] = data_relation_value_count.loc[:, "Ratio"].apply(ratiolize)


'''
calculate tfidf
'''
documentNum = len(os.listdir("../data/"))
data_all = [data_entity_value_count, data_relation_value_count]

# gather all document
textDocuments = []
for fileIndex, fileElement in enumerate(os.listdir("../data/")):
    textElement = ""
    file = codecs.open("../data/" + fileElement, 'r', encoding='utf8', errors='ignore')
    for textIndex, textLines in enumerate(file):
        textElement += textLines
    textDocuments.append(textElement)


for dataIndex, dataElement in enumerate(data_all):
    for rowIndex, rowElement in enumerate(tqdm(dataElement.iloc[:, 0])):
        document_count = 0
        for textIndex, textElement in enumerate(textDocuments):
            # check if entity/relation exists in the doucument
            if rowElement in textElement:
                document_count += 1
        # assign document frequency
        dataElement.loc[rowIndex, "DocumentFrequency"] = document_count
        # assign tfidf
        dataElement.loc[rowIndex, "tfiDF"] = np.around(dataElement.loc[rowIndex, "Count"] * \
            math.log(documentNum / (dataElement.loc[rowIndex, "DocumentFrequency"] + 1), 10), decimals=4)
        
# data_entity_value_count.sort_values(by=["tfiDF"], ascending=False, inplace=True)
# data_relation_value_count.sort_values(by=["tfiDF"], ascending=False, inplace=True)

print(len(data_entity), data_entity_value_count[:10], "\n")
print(len(data_relation), data_relation_value_count[:10])

100%|██████████| 8925/8925 [00:06<00:00, 1462.40it/s]
100%|██████████| 2809/2809 [00:01<00:00, 1550.19it/s]

12744   Element  Count   Ratio  DocumentFrequency    tfiDF
0      晶圓     82  0.0092               14.0   0.0000
1      技術     68  0.0076               14.0   0.0000
2      製程     61  0.0068               15.0  -1.7098
3      晶片     53  0.0059               13.0   1.5881
4      產品     48  0.0054               13.0   1.4382
5      設備     37  0.0041               11.0   3.5857
6      表面     37  0.0041               12.0   2.2995
7      資源     35  0.0039                4.0  16.6992
8      成本     35  0.0039               10.0   4.7145
9      測品     32  0.0036                2.0  22.3670 

8098   Element  Count   Ratio  DocumentFrequency    tfiDF
0      回收    131  0.0466                4.0  62.5029
1      利用    108  0.0384                5.0  42.9775
2      製造    106  0.0377               14.0   0.0000
3      進行     90  0.0320                5.0  35.8146
4      包括     86  0.0306               11.0   8.3343
5      提供     68  0.0242               11.0   6.5899
6      萃取     64  0.0228         




In [11]:
# Saving result without filtering
data_entity_value_count.to_csv(entity_saving_path, encoding="utf8", index=None, quoting=False)
data_relation_value_count.to_csv(relation_saving_path, encoding="utf8", index=None, quoting=False)
data_result.to_csv(main_result_saving_path, encoding="utf8", index=None, quoting=False)

# Filter out most frequent edge and relation

In [12]:
data_entity_value_count = pd.read_csv(entity_saving_path, encoding="utf8")
data_relation_value_count = pd.read_csv(relation_saving_path, encoding="utf8")
data_result = pd.read_csv(main_result_saving_path, encoding="utf8")

# take only first 20 places
data_entity_filter = data_entity_value_count.loc[:, "Element"].tolist()[:20]
data_relation_filter = data_relation_value_count.loc[:, "Element"].tolist()[:20]

abandonIndexList = []

for rowIndex, rowItem in enumerate(data_result.iloc[:, 0]):
    need_to_abandon = True
    entityList = rowItem.replace("'", "").replace(" ", "").replace("[", "").replace("]", "").split(",")[:-1]
    relationList = data_result.iloc[rowIndex, 2].replace("'", "").replace(" ", "").replace("[", "").replace("]", "").split(",")[:-1]
    
    for entityIndex, entityElement in enumerate(entityList):
        if entityElement in data_entity_filter:
            need_to_abandon = False
            break
    
    for relationIndex, relationElement in enumerate(relationList):
        if need_to_abandon == False:
            break
        if relationElement in data_relation_filter:
            need_to_abandon = False
            break
            
    if need_to_abandon == True:
        abandonIndexList.append(rowIndex)
            
data_result.drop(index=abandonIndexList, inplace=True)
print(data_result)

# export data
data_result.to_csv(filtered_result_path, encoding="utf8", index=None)

                                                 Entity  \
20             ['SiPM', '陽光條件下', '距離', '時', '訊噪', '性能']   
21                          ['其他優勢', '電源偏置', '溫度變化敏感性']   
29                                         ['其他感知模式互補']   
37    ['安森美半導體汽車感測部門', '總監', '', '解析度深度數據', '挑戰的微光條件...   
39                                  ['距離、高性價比的LiDAR方案']   
...                                                 ...   
7849                                  [' 壞去', '達', '%']   
7851           ['凝器置於', '控制設備(', '設備', '碳吸 附床及吸收', '後']   
7852                         ['冷凝處理', '廢氣成分中', '溫度之不同']   
7853                 ['冷凝作用', '兩種方式在定溫', '增加系統之壓力2在定壓']   
7856                            ['冷凝器型式', '表面式及接觸式冷凝器']   

                 EntityIndex                             Relation  \
20    [0, 7, 10, 13, 18, 20]    ['明亮', '進行', '長', '測', '提供', '佳']   
21                [1, 7, 14]                     ['包括', '低', '低']   
29                       [5]                   ['通過', '提供', '冗餘']   
37    [3, 5, 11