In [89]:
from nltk.chunk import RegexpParser
import pandas as pd
import nltk
from nltk import pos_tag, word_tokenize
import nltk
import re

In [46]:
# path list
wordlist_path1 = '../../data/wordlist/asset_classes.txt'
wordlist_path2 = '../../data/wordlist/economic_indicators.txt'
news_data_path = '../../data/annotated/nlp_id_result.csv'

In [120]:
news_data = pd.read_csv(news_data_path, index_col=0)

In [121]:
na_indices = news_data[news_data.isna().any(axis=1)].index
print(na_indices)

Index([1491, 2443], dtype='int64')


In [122]:
news_data = news_data.dropna()

In [123]:
news_data

Unnamed: 0,sentence,pos_tag
0,Tantangan Ketimpangan Ekonomi,"[('Tantangan', np.str_('NN')), ('Ketimpangan',..."
1,Doktor ekonomi dari UNU-MERIT/Maastricht Unive...,"[('Doktor', np.str_('NN')), ('ekonomi', np.str..."
2,Alumni generasi pertama beasiswa LPDP master-d...,"[('Alumni', np.str_('NNP')), ('generasi', np.s..."
3,"Pernah bekerja di ASEAN Secretariat, Indonesia...","[('Pernah', np.str_('ADV')), ('bekerja', np.st..."
4,"Saat ini berkiprah sebagai akademisi, peneliti...","[('Saat', np.str_('SC')), ('ini', np.str_('DT'..."
...,...,...
2799,Koordinasi dilakukan oleh salah satu keluarga.,"[('Koordinasi', np.str_('NN')), ('dilakukan', ..."
2800,"Singapura, Dubai, Hong Kong sudah mempratikkan...","[('Singapura', np.str_('NNP')), (',', 'SYM'), ..."
2801,"Untuk mendirikan satu Family Office, biaya yan...","[('Untuk', np.str_('SC')), ('mendirikan', np.s..."
2802,Ini berarti bahwa untuk keluarga dengan aset ...,"[('Ini', np.str_('PR')), ('berarti', np.str_('..."


In [112]:
with open(wordlist_path1, 'r') as file:
    asset_classes = file.read().splitlines()

with open(wordlist_path2, 'r') as file:
    economic_indicators = file.read().splitlines()

In [183]:
grammar = r"""
  PERSON: <VB>{<NNP>+}
          <NNP><NNP>?<NNP>?<NNP>?<NNP>?<SYM>{<NNP><NNP>?}
          {<NNP>+}<VB>
          <IN>{<NNP><NNP>?}
  LOCATION: <IN>{<NNP>+}
  ORGANIZATION: {<NNP>+<CC><NNP>+}
  CURRENCY: {<SYM><NUM>+}
            {<SYM><NUM><NNP>}
  EVENT: {<NNP>+<NUM>}
  PERCENTAGE: {<NUM><SYM>}
  DATE: <IN>{<NUM>?<NNP>?<NUM>}
"""

In [114]:
def tag_with_context(tokens, keywords, tag_prefix):
    tagged = []
    for i, (word, pos) in enumerate(tokens):
        if word.lower() in keywords:
            tagged.append((word, f'B-{tag_prefix}'))
        elif i > 0 and tagged[i - 1][1] == f'B-{tag_prefix}' and pos == 'NNP':
            tagged.append((word, f'I-{tag_prefix}'))
        else:
            tagged.append((word, 'O'))
    return tagged

In [186]:
def combine_tags_with_context(tree, economic_tagged, asset_class_tagged):
    combined = []
    economic_words = {word for word, tag in economic_tagged if tag.startswith('B-')}
    asset_words = {word for word, tag in asset_class_tagged if tag.startswith('B-')}
    
    for subtree in tree:
        if isinstance(subtree, nltk.Tree):
            label = subtree.label()
            for word, pos in subtree.leaves():
                if word in economic_words:
                    combined.append((word, 'B-ECONOMIC_INDICATOR'))
                elif word in asset_words:
                    combined.append((word, 'B-ASSET_CLASS'))
                else:
                    combined.append((word, f'B-{label}'))
        else:
            word, pos = subtree
            if word in economic_words:
                combined.append((word, 'B-ECONOMIC_INDICATOR'))
            elif word in asset_words:
                combined.append((word, 'B-ASSET_CLASS'))
            else:
                combined.append((word, 'O'))
    
    # Terapkan logika untuk mengubah tag di sebelah B- menjadi I-
    for i in range(1, len(combined)):
        if combined[i - 1][1] == 'B-ECONOMIC_INDICATOR' and combined[i][1] == 'O' and (combined[i][0] == 'NNP' or combined[i][0] == 'NN'):
            combined[i] = (combined[i][0], 'I-ECONOMIC_INDICATOR')
        elif combined[i - 1][1] == 'B-ASSET_CLASS' and combined[i][1] == 'O' and (combined[i][0] == 'NNP' or combined[i][0] == 'NN'):
            combined[i] = (combined[i][0], 'I-ASSET_CLASS')

    return combined

In [172]:
def get_token_tag_pair(tagged_sentence):
    pairs = []
    
    pattern = r"'(\S+)',\s*np\.str_\('(\w+)'\)"

    for i, sentence in enumerate(tagged_sentence):
        print(f"Processing sentence {i}...")
        results = re.findall(pattern, sentence)

        pairs.append(results)

    return pairs

In [173]:
pair_tags = get_token_tag_pair(news_data['pos_tag'])

Processing sentence 0...
Processing sentence 1...
Processing sentence 2...
Processing sentence 3...
Processing sentence 4...
Processing sentence 5...
Processing sentence 6...
Processing sentence 7...
Processing sentence 8...
Processing sentence 9...
Processing sentence 10...
Processing sentence 11...
Processing sentence 12...
Processing sentence 13...
Processing sentence 14...
Processing sentence 15...
Processing sentence 16...
Processing sentence 17...
Processing sentence 18...
Processing sentence 19...
Processing sentence 20...
Processing sentence 21...
Processing sentence 22...
Processing sentence 23...
Processing sentence 24...
Processing sentence 25...
Processing sentence 26...
Processing sentence 27...
Processing sentence 28...
Processing sentence 29...
Processing sentence 30...
Processing sentence 31...
Processing sentence 32...
Processing sentence 33...
Processing sentence 34...
Processing sentence 35...
Processing sentence 36...
Processing sentence 37...
Processing sentence 38

In [174]:
parser = RegexpParser(grammar)

In [187]:
sentence_ner_tagged = []
for item in pair_tags:
    tagged_economic_indicators = tag_with_context(item, economic_indicators, "ECONOMIC_INDICATOR")
    tagged_asset_classes = tag_with_context(item, asset_classes, "ASSET_CLASS")

    tree = parser.parse(item)
    result = combine_tags_with_context(tree, tagged_economic_indicators, tagged_asset_classes)
    
    sentence_ner_tagged.append(result)

In [188]:
sentence_ner_tagged

[[('Tantangan', 'O'),
  ('Ketimpangan', 'B-ORGANIZATION'),
  ('Ekonomi', 'B-ORGANIZATION')],
 [('Doktor', 'O'),
  ('ekonomi', 'O'),
  ('dari', 'O'),
  ('UNU-MERIT', 'B-ORGANIZATION'),
  ('Maastricht', 'B-ORGANIZATION'),
  ('University', 'B-ORGANIZATION'),
  ('Belanda', 'B-ORGANIZATION')],
 [('Alumni', 'B-ORGANIZATION'),
  ('generasi', 'O'),
  ('pertama', 'O'),
  ('beasiswa', 'O'),
  ('LPDP', 'B-ORGANIZATION'),
  ('master-doktor', 'O')],
 [('Pernah', 'O'),
  ('bekerja', 'O'),
  ('di', 'O'),
  ('ASEAN', 'B-ORGANIZATION'),
  ('Secretariat', 'B-ORGANIZATION'),
  ('Indonesia', 'B-ORGANIZATION'),
  ('Mengajar', 'B-ORGANIZATION'),
  ('dan', 'B-ORGANIZATION'),
  ('konsultan', 'O'),
  ('marketing', 'O')],
 [('Saat', 'O'),
  ('ini', 'O'),
  ('berkiprah', 'O'),
  ('sebagai', 'O'),
  ('akademisi', 'O'),
  ('peneliti', 'O'),
  ('dan', 'O'),
  ('konsultan', 'O')],
 [('Tertarik', 'O'),
  ('dengan', 'O'),
  ('berbagai', 'O'),
  ('topik', 'O'),
  ('ekonomi', 'O'),
  ('pembangunan', 'O'),
  ('berkelanju

In [None]:


# Matching daftar kata untuk economic indicators dan asset classes dengan konteks
tagged_economic_indicators = tag_with_context(pos_tags, economic_indicators, "ECONOMIC_INDICATOR")
tagged_asset_classes = tag_with_context(pos_tags, asset_classes, "ASSET_CLASS")

In [None]:
tree = parser.parse(pos_tags)
result = combine_tags_with_context(tree, tagged_economic_indicators, tagged_asset_classes)

In [94]:
pattern = r'\'(\S+)\''
sample = news_data['pos_tag'].iloc[1]
results = re.findall(pattern, sample)
print(sample)
print(results)

[('Doktor', np.str_('NN')), ('ekonomi', np.str_('NN')), ('dari', np.str_('IN')), ('UNU-MERIT', np.str_('NNP')), ('/', 'SYM'), ('Maastricht', np.str_('NNP')), ('University', np.str_('NNP')), ('(', 'SYM'), ('Belanda', np.str_('NNP')), (')', 'SYM'), ('.', 'SYM')]
['Doktor', 'NN', 'ekonomi', 'NN', 'dari', 'IN', 'UNU-MERIT', 'NNP', '/', 'SYM', 'Maastricht', 'NNP', 'University', 'NNP', '(', 'SYM', 'Belanda', 'NNP', ')', 'SYM', '.', 'SYM']


In [100]:
token_tag_pair = [(results[i], results[i+1]) for i in range(0, len(results), 2)]
token_tag_pair

[('Doktor', 'NN'),
 ('ekonomi', 'NN'),
 ('dari', 'IN'),
 ('UNU-MERIT', 'NNP'),
 ('/', 'SYM'),
 ('Maastricht', 'NNP'),
 ('University', 'NNP'),
 ('(', 'SYM'),
 ('Belanda', 'NNP'),
 (')', 'SYM'),
 ('.', 'SYM')]

In [31]:
# Kalimat POS tagging
sentence = [
    ('Jakarta', 'NNP'), ('tidak', 'RB'), ('hanya', 'RB'), ('menjadi', 'VB'),
    ('kota', 'NN'), ('global', 'JJ'), ('tetapi', 'CC'), ('juga', 'RB'),
    ('kota', 'NN'), ('yang', 'IN'), ('adil', 'JJ'), ('bagi', 'IN'),
    ('semuanya', 'NN'), ('ujar', 'VB'), ('Ridwan', 'NNP'), ('Kamil', 'NNP'),
    ('dalam', 'IN'), ('debat', 'NN'), ('Pilkada', 'NNP'), ('ketiga', 'JJ'),
    ('di', 'IN'), ('Hotel', 'NNP'), ('Sultan', 'NNP'), ('Jakarta', 'NNP'),
    ('Pusat', 'NNP'), ('Minggu', 'NNP'), ('17/11/2024', 'NUM'), ('.', '.')
]

# Parser
parser = RegexpParser(grammar)
tree = parser.parse(sentence)

# Menampilkan hasil chunking
tree.pretty_print()
def bio_tagging(tree):
    bio_tags = []
    for subtree in tree:
        if isinstance(subtree, nltk.Tree):
            label = subtree.label()
            for i, (word, tag) in enumerate(subtree.leaves()):
                prefix = "B-" if i == 0 else "I-"
                bio_tags.append((word, f"{prefix}{label}"))
        else:
            bio_tags.append((subtree[0], "O"))
    return bio_tags

# Contoh hasil
bio_tags = bio_tagging(tree)
print(bio_tags)

                                                                                                                            S                                                                                                                                                        
      ______________________________________________________________________________________________________________________|_____________________________________________________________________________________________________________________________                            
     |         |        |         |         |        |         |        |       |       |       |       |         |         |        |        |         |       |         |         |    PERSON            LOCATION                                    LOCATION                      
     |         |        |         |         |        |         |        |       |       |       |       |         |         |        |        |         |       |    

In [32]:
import nltk
from nltk.chunk import RegexpParser

sentence = [
    ('Menteri', 'NNP'), ('Pemberdayaan', 'NNP'), ('Perempuan', 'NNP'), 
    ('dan', 'CC'), ('Perlindungan', 'NNP'), ('Anak', 'NNP'), 
    ('PPPA', 'NNP'), ('Arifah', 'NNP'), ('Fauzi', 'NNP')
]

# Parser
parser = RegexpParser(grammar)
tree = parser.parse(sentence)

# Menampilkan hasil chunking
tree.pretty_print()
def bio_tagging(tree):
    bio_tags = []
    for subtree in tree:
        if isinstance(subtree, nltk.Tree):
            label = subtree.label()
            for i, (word, tag) in enumerate(subtree.leaves()):
                prefix = "B-" if i == 0 else "I-"
                bio_tags.append((word, f"{prefix}{label}"))
        else:
            bio_tags.append((subtree[0], "O"))
    return bio_tags

# Contoh hasil
bio_tags = bio_tagging(tree)
print(bio_tags)

                                                         S                                                 
                                    _____________________|________________________________                  
                              ORGANIZATION                                           ORGANIZATION          
      _____________________________|__________________________________          __________|___________      
Menteri/NNP Pemberdayaan/NNP Perempuan/NNP dan/CC Perlindungan/NNP Anak/NNP PPPA/NNP  Arifah/NNP  Fauzi/NNP

[('Menteri', 'B-ORGANIZATION'), ('Pemberdayaan', 'I-ORGANIZATION'), ('Perempuan', 'I-ORGANIZATION'), ('dan', 'I-ORGANIZATION'), ('Perlindungan', 'I-ORGANIZATION'), ('Anak', 'I-ORGANIZATION'), ('PPPA', 'B-ORGANIZATION'), ('Arifah', 'I-ORGANIZATION'), ('Fauzi', 'I-ORGANIZATION')]
