# Mining Data (Text)
Created by Atmam Al Faruq

Proses pertama dalam memulai membangun sistem yang menunjang NLP diperlukan data yang cukup. 
Perlu pengambil dan manajamen data dalam proses membangun sistem tersebut.

### Tokenization

pada proses ini akan dicoba membagi sebuah kalimat menjadi beberapa kata yang membangun kalimat tersebut.

In [1]:
import nltk
from nltk.tokenize import word_tokenize
import nltk.corpus
nltk.download('punkt')

kalimat = "Jika engkau tidak sanggup menahan lelahnya belajar, maka bersiaplah engkau dengan perihnya kebodohan"
tokens = word_tokenize(kalimat)
tokens

[nltk_data] Downloading package punkt to /home/not/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['Jika',
 'engkau',
 'tidak',
 'sanggup',
 'menahan',
 'lelahnya',
 'belajar',
 ',',
 'maka',
 'bersiaplah',
 'engkau',
 'dengan',
 'perihnya',
 'kebodohan']

### Menemukan kata yang berbeda ( frequency distinct )

In [2]:
from nltk.probability import FreqDist

fdist = FreqDist(tokens)
fdist

FreqDist({'engkau': 2, 'Jika': 1, 'tidak': 1, 'sanggup': 1, 'menahan': 1, 'lelahnya': 1, 'belajar': 1, ',': 1, 'maka': 1, 'bersiaplah': 1, ...})

In [3]:
fdist_1 = fdist.most_common(5)
fdist_1

[('engkau', 2), ('Jika', 1), ('tidak', 1), ('sanggup', 1), ('menahan', 1)]

## Stemming

In [5]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

stemmer.stem(kalimat)

'jika engkau tidak sanggup tahan lelah ajar maka siap engkau dengan perih bodoh'

In [8]:
akar_kata = ["pelajaran","pelajar","pengajar"]

for kata in akar_kata:
    print(kata+" : "+stemmer.stem(kata))

pelajaran : ajar
pelajar : ajar
pengajar : ajar


In [6]:
from nltk.stem import LancasterStemmer

lst = LancasterStemmer()

for kata in akar_kata:
    print(kata+" : "+lst.stem(kata))

NameError: name 'akar_kata' is not defined

## Stemming english word

In [19]:
from nltk.stem import PorterStemmer

pst = PorterStemmer()

word = ["waited","waiting","waits"]

for stm_word in word:
    print(stm_word+" : "+pst.stem(stm_word))

waited : wait
waiting : wait
waits : wait


In [7]:
from nltk.stem import LancasterStemmer

lst = LancasterStemmer()

word = ["giving", "given", "given", "gave"]

for stm_word in word:
    print(stm_word+" : "+lst.stem(stm_word))

giving : giv
given : giv
given : giv
gave : gav


In [19]:
from nltk.stem import SnowballStemmer

print("_".join(SnowballStemmer.languages))

arabic_danish_dutch_english_finnish_french_german_hungarian_italian_norwegian_porter_portuguese_romanian_russian_spanish_swedish


In [9]:
snw = SnowballStemmer("english")

for snw_word in word:
    print(snw_word+" : "+snw.stem(stm_word))

giving : gave
given : gave
given : gave
gave : gave


## Lemmatization

In [21]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 

[nltk_data] Downloading package wordnet to /home/not/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


corpora : corpus
influencer : influencer
affection : affection


In [22]:
word = ["corpora","ran","rocks"]

for lem_word in word:
    print(lem_word+" : "+lemmatizer.lemmatize(lem_word))

corpora : corpus
ran : ran
rocks : rock


In [23]:
for lem_word in word:
    print(lem_word+" : "+lemmatizer.lemmatize(lem_word, pos="v"))

corpora : corpora
ran : run
rocks : rock


## Stop Words

In [24]:
from nltk.corpus import stopwords

nltk.download('stopwords')

a = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /home/not/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [33]:
kalimat_2 = "Some television channels reported that police had imposed emergency law in some parts of the capital, New Delhi, that prohibits gatherings."
kalimat_2 = word_tokenize(kalimat_2.lower())
print(" Awal : ")
print(kalimat_2)
print("\n")
stopwords = [x for x in kalimat_2 if x not in a]
print(" Setelah proses : ")
print(stopwords)

 Awal : 
['some', 'television', 'channels', 'reported', 'that', 'police', 'had', 'imposed', 'emergency', 'law', 'in', 'some', 'parts', 'of', 'the', 'capital', ',', 'new', 'delhi', ',', 'that', 'prohibits', 'gatherings', '.']


 Setelah proses : 
['television', 'channels', 'reported', 'police', 'imposed', 'emergency', 'law', 'parts', 'capital', ',', 'new', 'delhi', ',', 'prohibits', 'gatherings', '.']


## Part of Speech tagging

In [36]:
nltk.download('averaged_perceptron_tagger')

kalimat_3 = "Critics say the exclusion of Muslims violates India's secular constitution by making religion a basis of citizenship."
tex = word_tokenize(kalimat_3)
for token in tex:
  print(nltk.pos_tag([token]))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/not/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('Critics', 'NNS')]
[('say', 'VB')]
[('the', 'DT')]
[('exclusion', 'NN')]
[('of', 'IN')]
[('Muslims', 'NNS')]
[('violates', 'NNS')]
[('India', 'NNP')]
[("'s", 'POS')]
[('secular', 'NN')]
[('constitution', 'NN')]
[('by', 'IN')]
[('making', 'VBG')]
[('religion', 'NN')]
[('a', 'DT')]
[('basis', 'NN')]
[('of', 'IN')]
[('citizenship', 'NN')]
[('.', '.')]


## Mengidentifikasi Entitas

In [37]:
kalimat_4 = "Minhaj comes from a Muslim family originally from Aligarh in Uttar Pradesh, India. His parents, Najme and Seema."

from nltk import ne_chunk
nltk.download('maxent_ne_chunker')
nltk.download('words')

token = word_tokenize(kalimat_4)
tags = nltk.pos_tag(token)
chunk = ne_chunk(tags)
chunk

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/not/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /home/not/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


ModuleNotFoundError: No module named 'tkinter'

Tree('S', [Tree('GPE', [('Minhaj', 'NNP')]), ('comes', 'VBZ'), ('from', 'IN'), ('a', 'DT'), Tree('ORGANIZATION', [('Muslim', 'NNP')]), ('family', 'NN'), ('originally', 'RB'), ('from', 'IN'), Tree('GPE', [('Aligarh', 'NNP')]), ('in', 'IN'), Tree('GPE', [('Uttar', 'NNP'), ('Pradesh', 'NNP')]), (',', ','), Tree('GPE', [('India', 'NNP')]), ('.', '.'), ('His', 'PRP$'), ('parents', 'NNS'), (',', ','), Tree('PERSON', [('Najme', 'NNP')]), ('and', 'CC'), Tree('PERSON', [('Seema', 'NNP')]), ('.', '.')])

## Chunking

In [38]:
text = "We saw the yellow dog"
token = word_tokenize(text)
tags = nltk.pos_tag(token)
reg = "NP: {<DT>?<JJ>*<NN>}"
a = nltk.RegexpParser(reg)
result = a.parse(tags)
print(result)

(S We/PRP saw/VBD (NP the/DT yellow/JJ dog/NN))


In [40]:
tree = a.parse(result)

for subtree in tree.subtrees():
    print(subtree)

(S We/PRP saw/VBD (NP the/DT yellow/JJ dog/NN))
(NP the/DT yellow/JJ dog/NN)


In [41]:
tree.draw()