In [13]:
from nltk.chunk import RegexpParser
import pandas as pd
import nltk
from nltk import pos_tag, word_tokenize
import nltk
import re
import json

In [9]:
# path list
wordlist_path1 = '../../data/wordlist/asset_classes.txt'
wordlist_path2 = '../../data/wordlist/economic_indicators.txt'
news_data_path = '../../data/annotated/nlp_id_result.csv'

In [14]:
news_data = pd.read_csv(news_data_path, index_col=0)

In [15]:
na_indices = news_data[news_data.isna().any(axis=1)].index
print(na_indices)

Index([1491, 2443], dtype='int64')


In [20]:
news_data = news_data.dropna()

In [347]:
news_data

Unnamed: 0,sentence,pos_tag
0,Tantangan Ketimpangan Ekonomi,"[('Tantangan', np.str_('NN')), ('Ketimpangan',..."
1,Doktor ekonomi dari UNU-MERIT/Maastricht Unive...,"[('Doktor', np.str_('NN')), ('ekonomi', np.str..."
2,Alumni generasi pertama beasiswa LPDP master-d...,"[('Alumni', np.str_('NNP')), ('generasi', np.s..."
3,"Pernah bekerja di ASEAN Secretariat, Indonesia...","[('Pernah', np.str_('ADV')), ('bekerja', np.st..."
4,"Saat ini berkiprah sebagai akademisi, peneliti...","[('Saat', np.str_('SC')), ('ini', np.str_('DT'..."
...,...,...
2799,Koordinasi dilakukan oleh salah satu keluarga.,"[('Koordinasi', np.str_('NN')), ('dilakukan', ..."
2800,"Singapura, Dubai, Hong Kong sudah mempratikkan...","[('Singapura', np.str_('NNP')), (',', 'SYM'), ..."
2801,"Untuk mendirikan satu Family Office, biaya yan...","[('Untuk', np.str_('SC')), ('mendirikan', np.s..."
2802,Ini berarti bahwa untuk keluarga dengan aset ...,"[('Ini', np.str_('PR')), ('berarti', np.str_('..."


In [38]:
with open(wordlist_path1, 'r') as file:
    asset_classes = file.read().splitlines()

with open(wordlist_path2, 'r') as file:
    economic_indicators = file.read().splitlines()

# Postag RegexParser

In [None]:
"""
Using this one methode is so challenging because of the ambiguity of the result
So we moved on the other method using BERT instead
"""
grammar = r"""
  PERSON: <VB>{<NNP>+}
          <NNP><NNP>?<NNP>?<NNP>?<NNP>?<SYM>{<NNP><NNP>?}
          {<NNP>+}<VB>
          <IN>{<NNP><NNP>?}
  FINANCIAL_ENTITY: <IN>{<NNP>+}
            {<NNP>+<CC><NNP>+}
            {<NNP>+}<NNP>
  CURRENCY: {<SYM><NUM>+}
            {<SYM><NUM><NNP>}
  PERCENTAGE: {<NUM><NUM>}
  DATE: <IN>{<NUM>?<NNP>?<NUM>}
"""

In [97]:
def tag_with_context(tokens, keywords_list, tag_prefix_list):
    """
    Tag tokens with custom tags based on keywords from multiple categories.

    Parameters:
        tokens: List of tuples [(word, pos), ...] representing input tokens.
        keywords_list: List of keyword lists for tagging.
        tag_prefix_list: List of tag prefixes corresponding to each keyword list.

    Returns:
        List of tuples [(word, tag), ...] with tagged tokens.
    """
    tagged = []
    
    for word, pos in tokens:
        found_tag = 'O'
        
        for keywords, tag_prefix in zip(keywords_list, tag_prefix_list):
            if word.lower() in keywords:
                found_tag = f'B-{tag_prefix}'
            elif len(tagged) > 0 and tagged[-1][1] == f'B-{tag_prefix}' and (pos == 'NNP' or pos == 'NN'):
                found_tag = f'I-{tag_prefix}'
        
        tagged.append((word, found_tag))
    
    return tagged

In [None]:
def combine_tags_with_context(tree, economic_tagged, asset_class_tagged):
    combined = []
    economic_words = {word for word, tag in economic_tagged if tag.startswith('B-')}
    asset_words = {word for word, tag in asset_class_tagged if tag.startswith('B-')}
    
    for subtree in tree:
        if isinstance(subtree, nltk.Tree):
            label = subtree.label()
            for word, pos in subtree.leaves():
                if word in economic_words:
                    combined.append((word, 'B-ECONOMIC_INDICATOR'))
                elif word in asset_words:
                    combined.append((word, 'B-ASSET_CLASS'))
                else:
                    combined.append((word, f'B-{label}'))
        else:
            word, pos = subtree
            if word in economic_words:
                combined.append((word, 'B-ECONOMIC_INDICATOR'))
            elif word in asset_words:
                combined.append((word, 'B-ASSET_CLASS'))
            else:
                combined.append((word, 'O'))
    
    for i in range(1, len(combined)):
        if combined[i - 1][1] == 'B-ECONOMIC_INDICATOR' and combined[i][1] == 'O' and combined[i][0] == 'NNP' or combined[i][0] == 'NN':
            combined[i] = (combined[i][0], 'I-ECONOMIC_INDICATOR')
        elif combined[i - 1][1] == 'B-ASSET_CLASS' and combined[i][1] == 'O' and combined[i][0] == 'NNP' or combined[i][0] == 'NN':
            combined[i] = (combined[i][0], 'I-ASSET_CLASS')

    return combined

In [41]:
def get_token_tag_pair(tagged_sentence):
    pairs = []
    
    pattern = r"'(\S+)',\s*np\.str_\('(\w+)'\)"

    for i, sentence in enumerate(tagged_sentence):
        print(f"Processing sentence {i}...")
        results = re.findall(pattern, sentence)

        pairs.append(results)

    return pairs

In [43]:
pair_tags = get_token_tag_pair(news_data['pos_tag'])

Processing sentence 0...
Processing sentence 1...
Processing sentence 2...
Processing sentence 3...
Processing sentence 4...
Processing sentence 5...
Processing sentence 6...
Processing sentence 7...
Processing sentence 8...
Processing sentence 9...
Processing sentence 10...
Processing sentence 11...
Processing sentence 12...
Processing sentence 13...
Processing sentence 14...
Processing sentence 15...
Processing sentence 16...
Processing sentence 17...
Processing sentence 18...
Processing sentence 19...
Processing sentence 20...
Processing sentence 21...
Processing sentence 22...
Processing sentence 23...
Processing sentence 24...
Processing sentence 25...
Processing sentence 26...
Processing sentence 27...
Processing sentence 28...
Processing sentence 29...
Processing sentence 30...
Processing sentence 31...
Processing sentence 32...
Processing sentence 33...
Processing sentence 34...
Processing sentence 35...
Processing sentence 36...
Processing sentence 37...
Processing sentence 38

In [354]:
parser = RegexpParser(grammar)

In [53]:
sentence_ner_tagged = []
for item in pair_tags:
    
    tagged_economic_indicators = tag_with_context(item, economic_indicators, "ECONOMIC_INDICATOR")
    print(tagged_economic_indicators)
    tagged_asset_classes = tag_with_context(item, asset_classes, "ASSET_CLASS")
    print(tagged_asset_classes)

    # tree = parser.parse(item)
    # result = combine_tags_with_context(tree, tagged_economic_indicators, tagged_asset_classes)
    
    # sentence_ner_tagged.append(result)

[('Tantangan', 'O'), ('Ketimpangan', 'B-ECONOMIC_INDICATOR'), ('Ekonomi', 'I-ECONOMIC_INDICATOR')]
[('Tantangan', 'O'), ('Ketimpangan', 'O'), ('Ekonomi', 'O')]
[('Doktor', 'O'), ('ekonomi', 'O'), ('dari', 'O'), ('UNU-MERIT', 'O'), ('Maastricht', 'O'), ('University', 'O'), ('Belanda', 'O')]
[('Doktor', 'O'), ('ekonomi', 'O'), ('dari', 'O'), ('UNU-MERIT', 'O'), ('Maastricht', 'O'), ('University', 'O'), ('Belanda', 'O')]
[('Alumni', 'O'), ('generasi', 'O'), ('pertama', 'O'), ('beasiswa', 'O'), ('LPDP', 'O'), ('master-doktor', 'O')]
[('Alumni', 'O'), ('generasi', 'O'), ('pertama', 'O'), ('beasiswa', 'O'), ('LPDP', 'O'), ('master-doktor', 'O')]
[('Pernah', 'O'), ('bekerja', 'O'), ('di', 'O'), ('ASEAN', 'O'), ('Secretariat', 'O'), ('Indonesia', 'O'), ('Mengajar', 'O'), ('dan', 'O'), ('konsultan', 'O'), ('marketing', 'O')]
[('Pernah', 'O'), ('bekerja', 'O'), ('di', 'O'), ('ASEAN', 'O'), ('Secretariat', 'O'), ('Indonesia', 'O'), ('Mengajar', 'O'), ('dan', 'O'), ('konsultan', 'O'), ('marketing'

In [356]:
sentence_ner_tagged

[[('Tantangan', 'O'),
  ('Ketimpangan', 'B-ECONOMIC_INDICATOR'),
  ('Ekonomi', 'O')],
 [('Doktor', 'O'),
  ('ekonomi', 'O'),
  ('dari', 'O'),
  ('UNU-MERIT', 'B-PERSON'),
  ('Maastricht', 'B-PERSON'),
  ('University', 'B-FINANCIAL_ENTITY'),
  ('Belanda', 'O')],
 [('Alumni', 'O'),
  ('generasi', 'O'),
  ('pertama', 'O'),
  ('beasiswa', 'O'),
  ('LPDP', 'O'),
  ('master-doktor', 'O')],
 [('Pernah', 'O'),
  ('bekerja', 'O'),
  ('di', 'O'),
  ('ASEAN', 'B-PERSON'),
  ('Secretariat', 'B-PERSON'),
  ('Indonesia', 'B-FINANCIAL_ENTITY'),
  ('Mengajar', 'O'),
  ('dan', 'O'),
  ('konsultan', 'O'),
  ('marketing', 'O')],
 [('Saat', 'O'),
  ('ini', 'O'),
  ('berkiprah', 'O'),
  ('sebagai', 'O'),
  ('akademisi', 'O'),
  ('peneliti', 'O'),
  ('dan', 'O'),
  ('konsultan', 'O')],
 [('Tertarik', 'O'),
  ('dengan', 'O'),
  ('berbagai', 'O'),
  ('topik', 'O'),
  ('ekonomi', 'O'),
  ('pembangunan', 'O'),
  ('berkelanjutan', 'O'),
  ('pembangunan', 'O'),
  ('internasional', 'O'),
  ('Asia', 'B-FINANCIAL_EN

In [357]:
# Menyimpan hasil NER ke dalam file JSON
with open('output.json', 'w', encoding='utf-8') as f:
    json.dump(sentence_ner_tagged, f, ensure_ascii=False, indent=4)

# Bert Base Indonesian NER

In [None]:
# First Try
from transformers import pipeline

pipe = pipeline("token-classification", model="cahya/bert-base-indonesian-NER")

Some weights of the model checkpoint at cahya/bert-base-indonesian-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
results = []
for sentence in news_data['sentence']:
    tag_predict = pipe(sentence)
    results.append(tag_predict)

In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

model_name = "cahya/bert-base-indonesian-NER"
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

labels = model.config.id2label
# Checking any label that used for this model
labels

Some weights of the model checkpoint at cahya/bert-base-indonesian-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{0: 'B-CRD',
 1: 'B-DAT',
 2: 'B-EVT',
 3: 'B-FAC',
 4: 'B-GPE',
 5: 'B-LAN',
 6: 'B-LAW',
 7: 'B-LOC',
 8: 'B-MON',
 9: 'B-NOR',
 10: 'B-ORD',
 11: 'B-ORG',
 12: 'B-PER',
 13: 'B-PRC',
 14: 'B-PRD',
 15: 'B-QTY',
 16: 'B-REG',
 17: 'B-TIM',
 18: 'B-WOA',
 19: 'I-CRD',
 20: 'I-DAT',
 21: 'I-EVT',
 22: 'I-FAC',
 23: 'I-GPE',
 24: 'I-LAN',
 25: 'I-LAW',
 26: 'I-LOC',
 27: 'I-MON',
 28: 'I-NOR',
 29: 'I-ORD',
 30: 'I-ORG',
 31: 'I-PER',
 32: 'I-PRC',
 33: 'I-PRD',
 34: 'I-QTY',
 35: 'I-REG',
 36: 'I-TIM',
 37: 'I-WOA',
 38: 'O'}

# Pipeline RegexParser & BERT

In [None]:
def merge_ner_results_with_reconstruction(tokens, custom_tags, pipe, selected_tags):
    """
    Merge custom tagging results(RegexParser) with pipeline predictions (cahya/bert-base-indonesian-NER).
    
    Params:
        tokens: List of tuples [(word, pos), ...] representing input tokens that already tagged with pos tagging.
        custom_tags: List of tuples [(word, tag)] from custom tagging function.
        pipe: The HuggingFace NER pipeline model.
        selected_tags: List of tags to include from the pipeline predictions.
    
    Returns:
        Merged list of tuples [(word, BIO_tag), ...].
    """
    words = [word for word, _ in tokens]
    original_text = " ".join(words)
    
    pipe_predictions = pipe(original_text)
    
    # Define init tag with O for every word
    reconstructed_tags = ["O"] * len(words)
    current_word_index = 0
    current_word = words[current_word_index]
    reconstructed_word = ""
    
    for entity in pipe_predictions:
        token = entity["word"]
        tag = entity["entity"]
        
        if token.startswith("##"):
            reconstructed_word += token[2:]
        else:
            if reconstructed_word != "":
                current_word = reconstructed_word
                reconstructed_word = token
            else:
                reconstructed_word = token
            
            while current_word.lower() != reconstructed_word.lower() and current_word_index < len(words) - 1:
                current_word_index += 1
                current_word = words[current_word_index]
        
        # Assign the tag to the corresponding token
        if tag.split("-")[-1] in selected_tags:
            if tag.startswith("B"):
                reconstructed_tags[current_word_index] = f"B-{tag.split('-')[-1]}"
            elif tag.startswith("I"):
                reconstructed_tags[current_word_index] = f"I-{tag.split('-')[-1]}"

    # Merge with custom tags
    merged_tags = []
    for i, (word, _) in enumerate(tokens):
        custom_tag = custom_tags[i][1]
        pipe_tag = reconstructed_tags[i]

        # Prioritize custom tagging
        if custom_tag != "O":
            merged_tags.append((word, custom_tag))
        else:
            merged_tags.append((word, pipe_tag))
    
    return merged_tags


## Tag List
- DAT
- EVT
- GPE
- LOC
- PER
- PRD
- PRC
- CRD
- TIM
- NOR
- ECONOMIC_INDICATOR
- ASSET_CLASS

In [None]:
keywords_list = [economic_indicators, asset_classes]
tag_prefix_list = ["ECONOMIC_INDICATOR", "ASSET_CLASS"]

merge_results = {
    'token' : [],
    'tag' : []
}

for pair in pair_tags:
    # Tag using the combined function
    custom_tags = tag_with_context(pair, keywords_list, tag_prefix_list)

    # Filtered the tags with only tag we need
    selected_tags = ["DAT", "EVT", "GPE", "LOC", "PER", "PRD", "PRC", "CRD", "TIM", "NOR"]

    # Merge results
    result = merge_ner_results_with_reconstruction(pair, custom_tags, pipe, selected_tags)

    tokens = [token for (token, _) in result]
    tag = [tag for (_, tag) in result]
    merge_results['token'].append(tokens)
    merge_results['tag'].append(tag)

In [None]:
ner_result = pd.DataFrame(merge_results, columns=['token', 'tag'])

Unnamed: 0,token,tag
0,"[Tantangan, Ketimpangan, Ekonomi]","[O, B-ECONOMIC_INDICATOR, I-ECONOMIC_INDICATOR]"
1,"[Doktor, ekonomi, dari, UNU-MERIT, Maastricht,...","[O, O, O, O, O, O, O]"
2,"[Alumni, generasi, pertama, beasiswa, LPDP, ma...","[O, O, O, O, O, O]"
3,"[Pernah, bekerja, di, ASEAN, Secretariat, Indo...","[O, O, O, O, O, O, O, O, O, O]"
4,"[Saat, ini, berkiprah, sebagai, akademisi, pen...","[O, O, O, O, O, O, O, O]"
...,...,...
2797,"[Koordinasi, dilakukan, oleh, salah, satu, kel...","[O, O, O, O, O, O]"
2798,"[Singapura, Dubai, Hong, Kong, sudah, memprati...","[B-GPE, B-GPE, B-GPE, I-GPE, O, O, O, O, O, O,..."
2799,"[Untuk, mendirikan, satu, Family, Office, biay...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
2800,"[Ini, berarti, bahwa, untuk, keluarga, dengan,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [153]:
ner_result.to_csv('../../data/FinanceNewsNER.csv')