In [1]:
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from collections import Counter

## N-Gram Modelleri

In [2]:
# Toy corpus for the n-gram demo: one sentence per line of the literal below.
corpus = """\
I love apple
I love him
I love NLP
You love me
He loves apple
They love apple
I love you and you love me
""".splitlines()

In [3]:
# Goal: predict the word that comes after a given word -> generate/produce text
# We will use an N-gram language model for this.

In [4]:
# Tokenization: lowercase each sentence, then split it into word tokens.
lowercased_sentences = map(str.lower, corpus)
tokens = list(map(word_tokenize, lowercased_sentences))

print(tokens[:3])

[['i', 'love', 'apple'], ['i', 'love', 'him'], ['i', 'love', 'nlp']]


In [5]:
# Bigrams: groups of two adjacent tokens, gathered one sentence at a time so
# that no pair spans two different sentences.
bigrams = [
    pair
    for sentence_tokens in tokens
    for pair in ngrams(sentence_tokens, 2)
]

bigrams

[('i', 'love'),
 ('love', 'apple'),
 ('i', 'love'),
 ('love', 'him'),
 ('i', 'love'),
 ('love', 'nlp'),
 ('you', 'love'),
 ('love', 'me'),
 ('he', 'loves'),
 ('loves', 'apple'),
 ('they', 'love'),
 ('love', 'apple'),
 ('i', 'love'),
 ('love', 'you'),
 ('you', 'and'),
 ('and', 'you'),
 ('you', 'love'),
 ('love', 'me')]

In [6]:
# Tally how many times each bigram occurs.
bigrams_freq = Counter()
bigrams_freq.update(bigrams)
bigrams_freq

Counter({('i', 'love'): 4,
         ('love', 'apple'): 2,
         ('you', 'love'): 2,
         ('love', 'me'): 2,
         ('love', 'him'): 1,
         ('love', 'nlp'): 1,
         ('he', 'loves'): 1,
         ('loves', 'apple'): 1,
         ('they', 'love'): 1,
         ('love', 'you'): 1,
         ('you', 'and'): 1,
         ('and', 'you'): 1})

In [7]:
# Trigrams: groups of three adjacent tokens, gathered one sentence at a time so
# that no triple spans two different sentences.
trigrams = [
    triple
    for sentence_tokens in tokens
    for triple in ngrams(sentence_tokens, 3)
]

trigrams

[('i', 'love', 'apple'),
 ('i', 'love', 'him'),
 ('i', 'love', 'nlp'),
 ('you', 'love', 'me'),
 ('he', 'loves', 'apple'),
 ('they', 'love', 'apple'),
 ('i', 'love', 'you'),
 ('love', 'you', 'and'),
 ('you', 'and', 'you'),
 ('and', 'you', 'love'),
 ('you', 'love', 'me')]

In [8]:
# Tally how many times each trigram occurs.
trigrams_freq = Counter()
trigrams_freq.update(trigrams)
trigrams_freq

Counter({('you', 'love', 'me'): 2,
         ('i', 'love', 'apple'): 1,
         ('i', 'love', 'him'): 1,
         ('i', 'love', 'nlp'): 1,
         ('he', 'loves', 'apple'): 1,
         ('they', 'love', 'apple'): 1,
         ('i', 'love', 'you'): 1,
         ('love', 'you', 'and'): 1,
         ('you', 'and', 'you'): 1,
         ('and', 'you', 'love'): 1})

In [9]:
# Estimate how likely the words "you" and "apple" are to follow the bigram
# "i love", using the ratio count(w1, w2, w3) / count(w1, w2).


def next_word_probability(context, word, trigram_counts, bigram_counts):
    """Estimate P(word | context) from raw n-gram counts.

    Args:
        context: Tuple of the two preceding tokens, e.g. ("i", "love").
        word: Candidate next token.
        trigram_counts: Mapping from 3-token tuples to occurrence counts.
        bigram_counts: Mapping from 2-token tuples to occurrence counts.

    Returns:
        count(context + word) / count(context), or 0.0 when the context was
        never observed (instead of raising ZeroDivisionError).
    """
    context_count = bigram_counts.get(context, 0)
    if context_count == 0:
        # Unseen context: there is no evidence for any continuation.
        return 0.0
    # .get() keeps the lookup read-only and also works with plain dicts.
    return trigram_counts.get((*context, word), 0) / context_count


bigram = ("i", "love")

# Probability of "i love you"
prob_you = next_word_probability(bigram, "you", trigrams_freq, bigrams_freq)
print(f"you kelimesinin olma olasılığı:{prob_you}\n")

# Probability of "i love apple"
prob_apple = next_word_probability(bigram, "apple", trigrams_freq, bigrams_freq)
print(f"apple kelimesinin olma olasılığı:{prob_apple}")

you kelimesinin olma olasılığı:0.25

apple kelimesinin olma olasılığı:0.25


## Hidden Markov Modeli

In [10]:
# Part-of-speech (POS) tagging: finding the appropriate word class for each word

from nltk.tag import hmm
import warnings
# Suppress all warnings raised after this point.
warnings.filterwarnings("ignore")

In [11]:
# Two hand-tagged sentences, each a list of (word, POS tag) pairs, used as a
# minimal training set for the HMM tagger.
train_data = [
    list(zip(["I", "am", "a", "teacher"], ["PRP", "VBP", "DT", "NN"])),
    list(zip(["You", "are", "a", "student"], ["PRP", "VBP", "DT", "NN"])),
]

In [12]:
# Train an HMM tagger on the tagged sentences currently held in train_data
trainer = hmm.HiddenMarkovModelTrainer()
hmm_tagger = trainer.train(train_data)

In [13]:
# Tag a new sentence (already split into words) with the trained HMM tagger.
test_sentence = ["I", "am", "a", "student"]

tags = hmm_tagger.tag(test_sentence)
print("Yeni cümle", tags)

Yeni cümle [('I', 'PRP'), ('am', 'VBP'), ('a', 'DT'), ('student', 'NN')]


In [14]:
# Dataset: the CoNLL-2000 corpus reader from NLTK
from nltk.corpus import conll2000
# Uncomment to download the corpus if it is not available locally:
# nltk.download("conll2000")

In [15]:
# Load the tagged sentences of the train and test splits in one step.
train_data, test_data = (
    conll2000.tagged_sents(split_file) for split_file in ("train.txt", "test.txt")
)

# Report the number of sentences in each split, one per line.
print(f"Train Data: {len(train_data)}", f"Test Data: {len(test_data)}", sep="\n")

# Peek at the first training sentence.
train_data[:1]

Train Data: 8936
Test Data: 2012


[[('Confidence', 'NN'), ('in', 'IN'), ('the', 'DT'), ('pound', 'NN'), ('is', 'VBZ'), ('widely', 'RB'), ('expected', 'VBN'), ('to', 'TO'), ('take', 'VB'), ('another', 'DT'), ('sharp', 'JJ'), ('dive', 'NN'), ('if', 'IN'), ('trade', 'NN'), ('figures', 'NNS'), ('for', 'IN'), ('September', 'NNP'), (',', ','), ('due', 'JJ'), ('for', 'IN'), ('release', 'NN'), ('tomorrow', 'NN'), (',', ','), ('fail', 'VB'), ('to', 'TO'), ('show', 'VB'), ('a', 'DT'), ('substantial', 'JJ'), ('improvement', 'NN'), ('from', 'IN'), ('July', 'NNP'), ('and', 'CC'), ('August', 'NNP'), ("'s", 'POS'), ('near-record', 'JJ'), ('deficits', 'NNS'), ('.', '.')]]

In [16]:
# Fit a fresh HMM tagger on the tagged sentences currently held in train_data.
trainer = hmm.HiddenMarkovModelTrainer()
hmm_tagger = trainer.train(train_data)

# Tag a new sentence, given as a list of words, and display the result.
test_sentence = ["I", "like", "going", "to", "school"]
tags = hmm_tagger.tag(test_sentence)

tags

[('I', 'PRP'),
 ('like', 'IN'),
 ('going', 'VBG'),
 ('to', 'TO'),
 ('school', 'NN')]

POS tagging (sözcük türü etiketleme) işleminde yukarıdaki çıktıda görülen etiketlerin anlamları:

| Etiket | Açılımı                          | Anlamı                              |
| ------ | -------------------------------- | ----------------------------------- |
| PRP    | Personal Pronoun                 | Ben, sen vb. zamir                  |
| IN     | Preposition / Subordinating Conjunction | Edat veya yan cümle bağlacı (in, on, at, if…) |
| VBG    | Verb Gerund / Present Participle | -ing ile biten fiil (going, doing…) |
| TO     | “to” kelimesi                    | Fiilden önce gelen “to”             |
| NN     | Noun                             | Tekil isim (school, car…)           |

## Maximum Entropy Models

In [17]:
# Classification problem: sentiment analysis -> classify as positive or negative

from nltk.classify import MaxentClassifier

In [18]:
# Labeled training examples for the sentiment classifier: each feature dict
# (word -> present?) is paired with the label at the same position.
train_data = list(zip(
    [
        {"love":True, "amazing":True, "happy":True, "terrible":False},
        {"hate":True, "terrible":True},
        {"joy":True, "happy":True, "hate":False},
        {"sad":True, "depressed":True, "love":False},
    ],
    ["positive", "negative", "positive", "negative"],
))

In [19]:
# Train a maximum-entropy classifier on the labeled feature dicts, capped at 10 iterations
classifier = MaxentClassifier.train(train_data, max_iter=10)

  ==> Training (10 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.500
             2          -0.40641        1.000
             3          -0.28861        1.000
             4          -0.22397        1.000
             5          -0.18304        1.000
             6          -0.15479        1.000
             7          -0.13410        1.000
             8          -0.11829        1.000
             9          -0.10582        1.000
         Final          -0.09573        1.000


In [20]:
test_sentence = "I do not like this movie"

# Binary bag-of-words features: True when the vocabulary word occurs in the
# lowercased, whitespace-split sentence.
# NOTE(review): "hate" is used as a feature in the training examples but is
# absent from this vocabulary — confirm whether that omission is intentional.
sentence_words = test_sentence.lower().split()
features = {}
for word in ["love","amazing","terrible","happy","joy","sad","depressed"]:
    features[word] = word in sentence_words

label = classifier.classify(features)
print("Result:", label)

Result: negative
