## POS Tagging dengan Flair

In [1]:
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, BertEmbeddings
from typing import List
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.data import Sentence
import pandas as pd

In [2]:
#load corpus
corpus = NLPTaskDataFetcher.load_corpus(NLPTask.UD_INDONESIAN)

2021-08-08 19:28:18,312 Reading data from C:\Users\Dwi Nissa\.flair\datasets\ud_indonesian
2021-08-08 19:28:18,313 Train: C:\Users\Dwi Nissa\.flair\datasets\ud_indonesian\id_gsd-ud-train.conllu
2021-08-08 19:28:18,313 Dev: C:\Users\Dwi Nissa\.flair\datasets\ud_indonesian\id_gsd-ud-dev.conllu
2021-08-08 19:28:18,314 Test: C:\Users\Dwi Nissa\.flair\datasets\ud_indonesian\id_gsd-ud-test.conllu


  corpus = NLPTaskDataFetcher.load_corpus(NLPTask.UD_INDONESIAN)


In [3]:
#membuat dictionary corpus dengan tag upos
tag_type = 'upos'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

In [4]:
embedding_types : List[TokenEmbeddings] = [
    WordEmbeddings('id-crawl'),
    WordEmbeddings('id')
]

In [5]:
# membuat embed
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

In [6]:
# tagger-> menyatukan embed dan dictionary
tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

In [None]:
# Train Model (dilakukan di Google Collab, karna cukup berat)
"""
trainer: ModelTrainer = ModelTrainer(tagger, corpus)
trainer.train('example-universal-pos',learning_rate=0.1,mini_batch_size=32,max_epochs=10)
"""

In [7]:
tag_pos = SequenceTagger.load('example-universal-pos/best-model.pt')

2021-08-08 19:29:06,684 loading file example-universal-pos/best-model.pt


In [8]:
# load functions
from nltk.tokenize import word_tokenize, sent_tokenize, wordpunct_tokenize
from copy import deepcopy
import pandas as pd

# rmv
def rmvAddText(txt):
    keywords = ['baca juga','tonton video','lihat juga video','simak video','simak juga','tonton juga',
                '[gambas:video 20detik]','foto:','editor:','reporter:','fotografer:','pewarta:',
                'foto :','editor :','reporter :','fotografer :','pewarta :']
    for keyword in keywords:
        arr = []
        ADD=False
        for x in sent_tokenize(txt):
            if keyword in x.lower().strip():
                ADD = True
            if ADD: ADD = False; continue
            else:
                if x.strip()!='': arr.append(x.strip());
        txt = ' '.join(arr).strip()
    return txt

def rmvTags(txt):
    TAGS = False; txt_arr = sent_tokenize(txt); idx = []; last = txt_arr[-1].strip()
    if '\n' in last or last.startswith('('): TAGS = True
    if TAGS: out = ' '.join(txt_arr[:-1])
    else: out = ' '.join(txt_arr)
    return out

def rmvASCII(contentRaw):
    return ''.join(["" if ord(i) < 32 or ord(i) > 126 else i for i in contentRaw])

def processData(df,col):
    df_ = deepcopy(df)
    for i in range(len(df)):
        rawContent = df_[col].iloc[i]
        cleaned = rmvTags(rmvAddText(rawContent))
        cleaned = rmvASCII(cleaned)
        art_took = []
        for sent in sent_tokenize(cleaned):
             art_took = list(art_took)+word_tokenize(sent)
        df_[col].iloc[i] = ' '.join(art_took)
    return df_

def toTaggedContent(df,col):
    df_ = deepcopy(df)
    for i,x in enumerate(df_[col]):
        print("POS Tagging news item-{}/{}..".format(i+1,len(df_[col])))
        sentence = Sentence(x, use_tokenizer=False)
        tag_pos.predict(sentence)
        df_[col].iloc[i] = sentence.to_tagged_string()
    return df_

## Load Data

In [9]:
data = pd.read_csv("./../1. PREPROCESSING/OUTPUT/PRE_CLEAN_FIX.csv",index_col=0,encoding='ISO-8859-1')
data

Unnamed: 0,Link,Date Time,Title,Content,Label
3659,https://radarsidoarjo.jawapos.com/kriminal-del...,2020-12-31 15:07:00,"Ban Slip di Jalan Raya Seduri, Tiga Orang Luka...",SIDOARJO ÃÂÃÂ¢ÃÂÃÂÃÂÃÂ Sebuah truk ...,1.0
2813,https://radarmalang.jawapos.com/malang-raya/ko...,2020-12-30 02:56:00,"Diduga Melamun, Angkot Masuk Sungai di Arjosari",MALANG KOTA ÃÂÃÂ¢ÃÂÃÂÃÂÃÂ Mungkin h...,1.0
3033,https://radarbromo.jawapos.com/kraksaan/30/12/...,2020-12-30 00:00:00,"Menyeberang, Tukang Becak Tewas Tertabrak Truk...","KRAKSAAN, Radar Bromo ÃÂ¢ÃÂÃÂNahas menimp...",1.0
1287,https://radarjember.jawapos.com/berita-daerah/...,2020-12-29 14:10:00,"Waduh, kok Bisa Menerobos?\n",TEROBOS LINTASAN: Sebuah mobil Isuzu Panther b...,1.0
825,https://radarbromo.jawapos.com/kraksaan/29/12/...,2020-12-29 08:45:36,11 Bulan Ada 488 Kecelakaan yang Tewaskan 65 N...,"KRAKSAAN, Radar Bromo ÃÂÃÂ¢ÃÂÃÂÃÂÃÂ...",1.0
...,...,...,...,...,...
3658,https://radarbromo.jawapos.com/headlines/02/01...,2020-01-02 00:00:00,"Pulang Liburan dari Bali, Fortuner yang Ditump...","PASURUAN, Radar Bromo ÃÂ¢ÃÂÃÂ Agenda libu...",1.0
3657,https://radarbromo.jawapos.com/headlines/02/01...,2020-01-02 00:00:00,Satu Keluarga asal Blitar Kecelakaan di Tol Ge...,"GEMPOL, Radar BromoÃÂ¢ÃÂÃÂ Nahas nian nas...",1.0
507,https://news.detik.com/berita-jawa-timur/d-484...,2020-01-01 17:15:00,MPV Hantam Motor dan Terbalik di Jalur Blitar-...,Blitar - Kecelakaan lalu lintas melibatkan seb...,1.0
823,https://jatimnow.com/baca-22640-mobil-wisatawa...,2020-01-01 17:07:15,Mobil Wisatawan asal Kulon Progo Tabrak Pohon ...,jatimnow.com - Sebuah mobil Toyota Avanza meng...,1.0


## Preprocess data

In [10]:
data_pre = processData(data,'Content')
data_pre

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,Link,Date Time,Title,Content,Label
3659,https://radarsidoarjo.jawapos.com/kriminal-del...,2020-12-31 15:07:00,"Ban Slip di Jalan Raya Seduri, Tiga Orang Luka...",SIDOARJO Sebuah truk Toyota Dyna nopol L 8894 ...,1.0
2813,https://radarmalang.jawapos.com/malang-raya/ko...,2020-12-30 02:56:00,"Diduga Melamun, Angkot Masuk Sungai di Arjosari",MALANG KOTA Mungkin hari ini ( 30/12 ) adalah ...,1.0
3033,https://radarbromo.jawapos.com/kraksaan/30/12/...,2020-12-30 00:00:00,"Menyeberang, Tukang Becak Tewas Tertabrak Truk...","KRAKSAAN , Radar Bromo Nahas menimpa Miskari 5...",1.0
1287,https://radarjember.jawapos.com/berita-daerah/...,2020-12-29 14:10:00,"Waduh, kok Bisa Menerobos?\n",TEROBOS LINTASAN : Sebuah mobil Isuzu Panther ...,1.0
825,https://radarbromo.jawapos.com/kraksaan/29/12/...,2020-12-29 08:45:36,11 Bulan Ada 488 Kecelakaan yang Tewaskan 65 N...,"KRAKSAAN , Radar Bromo Jumlah insiden kecelaka...",1.0
...,...,...,...,...,...
3658,https://radarbromo.jawapos.com/headlines/02/01...,2020-01-02 00:00:00,"Pulang Liburan dari Bali, Fortuner yang Ditump...","PASURUAN , Radar Bromo Agenda liburan rombonga...",1.0
3657,https://radarbromo.jawapos.com/headlines/02/01...,2020-01-02 00:00:00,Satu Keluarga asal Blitar Kecelakaan di Tol Ge...,"GEMPOL , Radar Bromo Nahas nian nasib yang dia...",1.0
507,https://news.detik.com/berita-jawa-timur/d-484...,2020-01-01 17:15:00,MPV Hantam Motor dan Terbalik di Jalur Blitar-...,Blitar - Kecelakaan lalu lintas melibatkan seb...,1.0
823,https://jatimnow.com/baca-22640-mobil-wisatawa...,2020-01-01 17:07:15,Mobil Wisatawan asal Kulon Progo Tabrak Pohon ...,jatimnow.com - Sebuah mobil Toyota Avanza meng...,1.0


## POS Tagging data

In [12]:
data_pos = toTaggedContent(data_pre,'Content')
data_pos

POS Tagging news item-1/1247..


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


POS Tagging news item-2/1247..
POS Tagging news item-3/1247..
POS Tagging news item-4/1247..
POS Tagging news item-5/1247..
POS Tagging news item-6/1247..
POS Tagging news item-7/1247..
POS Tagging news item-8/1247..
POS Tagging news item-9/1247..
POS Tagging news item-10/1247..
POS Tagging news item-11/1247..
POS Tagging news item-12/1247..
POS Tagging news item-13/1247..
POS Tagging news item-14/1247..
POS Tagging news item-15/1247..
POS Tagging news item-16/1247..
POS Tagging news item-17/1247..
POS Tagging news item-18/1247..
POS Tagging news item-19/1247..
POS Tagging news item-20/1247..
POS Tagging news item-21/1247..
POS Tagging news item-22/1247..
POS Tagging news item-23/1247..
POS Tagging news item-24/1247..
POS Tagging news item-25/1247..
POS Tagging news item-26/1247..
POS Tagging news item-27/1247..
POS Tagging news item-28/1247..
POS Tagging news item-29/1247..
POS Tagging news item-30/1247..
POS Tagging news item-31/1247..
POS Tagging news item-32/1247..
POS Tagging news

POS Tagging news item-255/1247..
POS Tagging news item-256/1247..
POS Tagging news item-257/1247..
POS Tagging news item-258/1247..
POS Tagging news item-259/1247..
POS Tagging news item-260/1247..
POS Tagging news item-261/1247..
POS Tagging news item-262/1247..
POS Tagging news item-263/1247..
POS Tagging news item-264/1247..
POS Tagging news item-265/1247..
POS Tagging news item-266/1247..
POS Tagging news item-267/1247..
POS Tagging news item-268/1247..
POS Tagging news item-269/1247..
POS Tagging news item-270/1247..
POS Tagging news item-271/1247..
POS Tagging news item-272/1247..
POS Tagging news item-273/1247..
POS Tagging news item-274/1247..
POS Tagging news item-275/1247..
POS Tagging news item-276/1247..
POS Tagging news item-277/1247..
POS Tagging news item-278/1247..
POS Tagging news item-279/1247..
POS Tagging news item-280/1247..
POS Tagging news item-281/1247..
POS Tagging news item-282/1247..
POS Tagging news item-283/1247..
POS Tagging news item-284/1247..
POS Taggin

POS Tagging news item-504/1247..
POS Tagging news item-505/1247..
POS Tagging news item-506/1247..
POS Tagging news item-507/1247..
POS Tagging news item-508/1247..
POS Tagging news item-509/1247..
POS Tagging news item-510/1247..
POS Tagging news item-511/1247..
POS Tagging news item-512/1247..
POS Tagging news item-513/1247..
POS Tagging news item-514/1247..
POS Tagging news item-515/1247..
POS Tagging news item-516/1247..
POS Tagging news item-517/1247..
POS Tagging news item-518/1247..
POS Tagging news item-519/1247..
POS Tagging news item-520/1247..
POS Tagging news item-521/1247..
POS Tagging news item-522/1247..
POS Tagging news item-523/1247..
POS Tagging news item-524/1247..
POS Tagging news item-525/1247..
POS Tagging news item-526/1247..
POS Tagging news item-527/1247..
POS Tagging news item-528/1247..
POS Tagging news item-529/1247..
POS Tagging news item-530/1247..
POS Tagging news item-531/1247..
POS Tagging news item-532/1247..
POS Tagging news item-533/1247..
POS Taggin

POS Tagging news item-753/1247..
POS Tagging news item-754/1247..
POS Tagging news item-755/1247..
POS Tagging news item-756/1247..
POS Tagging news item-757/1247..
POS Tagging news item-758/1247..
POS Tagging news item-759/1247..
POS Tagging news item-760/1247..
POS Tagging news item-761/1247..
POS Tagging news item-762/1247..
POS Tagging news item-763/1247..
POS Tagging news item-764/1247..
POS Tagging news item-765/1247..
POS Tagging news item-766/1247..
POS Tagging news item-767/1247..
POS Tagging news item-768/1247..
POS Tagging news item-769/1247..
POS Tagging news item-770/1247..
POS Tagging news item-771/1247..
POS Tagging news item-772/1247..
POS Tagging news item-773/1247..
POS Tagging news item-774/1247..
POS Tagging news item-775/1247..
POS Tagging news item-776/1247..
POS Tagging news item-777/1247..
POS Tagging news item-778/1247..
POS Tagging news item-779/1247..
POS Tagging news item-780/1247..
POS Tagging news item-781/1247..
POS Tagging news item-782/1247..
POS Taggin

POS Tagging news item-1002/1247..
POS Tagging news item-1003/1247..
POS Tagging news item-1004/1247..
POS Tagging news item-1005/1247..
POS Tagging news item-1006/1247..
POS Tagging news item-1007/1247..
POS Tagging news item-1008/1247..
POS Tagging news item-1009/1247..
POS Tagging news item-1010/1247..
POS Tagging news item-1011/1247..
POS Tagging news item-1012/1247..
POS Tagging news item-1013/1247..
POS Tagging news item-1014/1247..
POS Tagging news item-1015/1247..
POS Tagging news item-1016/1247..
POS Tagging news item-1017/1247..
POS Tagging news item-1018/1247..
POS Tagging news item-1019/1247..
POS Tagging news item-1020/1247..
POS Tagging news item-1021/1247..
POS Tagging news item-1022/1247..
POS Tagging news item-1023/1247..
POS Tagging news item-1024/1247..
POS Tagging news item-1025/1247..
POS Tagging news item-1026/1247..
POS Tagging news item-1027/1247..
POS Tagging news item-1028/1247..
POS Tagging news item-1029/1247..
POS Tagging news item-1030/1247..
POS Tagging ne

POS Tagging news item-1243/1247..
POS Tagging news item-1244/1247..
POS Tagging news item-1245/1247..
POS Tagging news item-1246/1247..
POS Tagging news item-1247/1247..


Unnamed: 0,Link,Date Time,Title,Content,Label
3659,https://radarsidoarjo.jawapos.com/kriminal-del...,2020-12-31 15:07:00,"Ban Slip di Jalan Raya Seduri, Tiga Orang Luka...",SIDOARJO <PROPN> Sebuah <DET> truk <NOUN> Toyo...,1.0
2813,https://radarmalang.jawapos.com/malang-raya/ko...,2020-12-30 02:56:00,"Diduga Melamun, Angkot Masuk Sungai di Arjosari",MALANG <PROPN> KOTA <PROPN> Mungkin <ADV> hari...,1.0
3033,https://radarbromo.jawapos.com/kraksaan/30/12/...,2020-12-30 00:00:00,"Menyeberang, Tukang Becak Tewas Tertabrak Truk...","KRAKSAAN <PROPN> , <PUNCT> Radar <PROPN> Bromo...",1.0
1287,https://radarjember.jawapos.com/berita-daerah/...,2020-12-29 14:10:00,"Waduh, kok Bisa Menerobos?\n",TEROBOS <NOUN> LINTASAN <NOUN> : <PUNCT> Sebua...,1.0
825,https://radarbromo.jawapos.com/kraksaan/29/12/...,2020-12-29 08:45:36,11 Bulan Ada 488 Kecelakaan yang Tewaskan 65 N...,"KRAKSAAN <PROPN> , <PUNCT> Radar <PROPN> Bromo...",1.0
...,...,...,...,...,...
3658,https://radarbromo.jawapos.com/headlines/02/01...,2020-01-02 00:00:00,"Pulang Liburan dari Bali, Fortuner yang Ditump...","PASURUAN <PROPN> , <PUNCT> Radar <PROPN> Bromo...",1.0
3657,https://radarbromo.jawapos.com/headlines/02/01...,2020-01-02 00:00:00,Satu Keluarga asal Blitar Kecelakaan di Tol Ge...,"GEMPOL <PROPN> , <PUNCT> Radar <PROPN> Bromo <...",1.0
507,https://news.detik.com/berita-jawa-timur/d-484...,2020-01-01 17:15:00,MPV Hantam Motor dan Terbalik di Jalur Blitar-...,Blitar <PROPN> - <PUNCT> Kecelakaan <NOUN> lal...,1.0
823,https://jatimnow.com/baca-22640-mobil-wisatawa...,2020-01-01 17:07:15,Mobil Wisatawan asal Kulon Progo Tabrak Pohon ...,jatimnow.com <PROPN> - <PUNCT> Sebuah <DET> mo...,1.0


In [13]:
# Tagged Result
data_pos

Unnamed: 0,Link,Date Time,Title,Content,Label
3659,https://radarsidoarjo.jawapos.com/kriminal-del...,2020-12-31 15:07:00,"Ban Slip di Jalan Raya Seduri, Tiga Orang Luka...",SIDOARJO <PROPN> Sebuah <DET> truk <NOUN> Toyo...,1.0
2813,https://radarmalang.jawapos.com/malang-raya/ko...,2020-12-30 02:56:00,"Diduga Melamun, Angkot Masuk Sungai di Arjosari",MALANG <PROPN> KOTA <PROPN> Mungkin <ADV> hari...,1.0
3033,https://radarbromo.jawapos.com/kraksaan/30/12/...,2020-12-30 00:00:00,"Menyeberang, Tukang Becak Tewas Tertabrak Truk...","KRAKSAAN <PROPN> , <PUNCT> Radar <PROPN> Bromo...",1.0
1287,https://radarjember.jawapos.com/berita-daerah/...,2020-12-29 14:10:00,"Waduh, kok Bisa Menerobos?\n",TEROBOS <NOUN> LINTASAN <NOUN> : <PUNCT> Sebua...,1.0
825,https://radarbromo.jawapos.com/kraksaan/29/12/...,2020-12-29 08:45:36,11 Bulan Ada 488 Kecelakaan yang Tewaskan 65 N...,"KRAKSAAN <PROPN> , <PUNCT> Radar <PROPN> Bromo...",1.0
...,...,...,...,...,...
3658,https://radarbromo.jawapos.com/headlines/02/01...,2020-01-02 00:00:00,"Pulang Liburan dari Bali, Fortuner yang Ditump...","PASURUAN <PROPN> , <PUNCT> Radar <PROPN> Bromo...",1.0
3657,https://radarbromo.jawapos.com/headlines/02/01...,2020-01-02 00:00:00,Satu Keluarga asal Blitar Kecelakaan di Tol Ge...,"GEMPOL <PROPN> , <PUNCT> Radar <PROPN> Bromo <...",1.0
507,https://news.detik.com/berita-jawa-timur/d-484...,2020-01-01 17:15:00,MPV Hantam Motor dan Terbalik di Jalur Blitar-...,Blitar <PROPN> - <PUNCT> Kecelakaan <NOUN> lal...,1.0
823,https://jatimnow.com/baca-22640-mobil-wisatawa...,2020-01-01 17:07:15,Mobil Wisatawan asal Kulon Progo Tabrak Pohon ...,jatimnow.com <PROPN> - <PUNCT> Sebuah <DET> mo...,1.0


In [14]:
# Export to .csv
data_pos.to_csv("./OUTPUT/POS_CLEAN.csv")