In [3]:
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from collections import Counter

In [6]:
# Fetch the 'punkt_tab' tokenizer data package through NLTK's downloader.
# Presumably this is the data word_tokenize (imported above) relies on — confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
# Toy training corpus: six short English sentences that share most of their
# vocabulary, so the same word sequences recur across sentences.
corpus = [
    "The cat sat on the mat.",
    "The dog sat on the mat.",
    "The cat slept on the mat.",
    "The dog slept on the bed.",
    "The cat played with the toy.",
    "The dog played with the ball."
]

In [None]:
'''
Problem definition:
build a new language model.
Goal: predict the word that comes after a given word, and generate text.
We will use an n-gram model for this.
'''

In [8]:
# tokenization
# Lowercase every sentence first, then split it into tokens, so that
# "The" and "the" end up as the same token.
tokenized_corpus = list(map(word_tokenize, map(str.lower, corpus)))
tokenized_corpus

[['the', 'cat', 'sat', 'on', 'the', 'mat', '.'],
 ['the', 'dog', 'sat', 'on', 'the', 'mat', '.'],
 ['the', 'cat', 'slept', 'on', 'the', 'mat', '.'],
 ['the', 'dog', 'slept', 'on', 'the', 'bed', '.'],
 ['the', 'cat', 'played', 'with', 'the', 'toy', '.'],
 ['the', 'dog', 'played', 'with', 'the', 'ball', '.']]

In [9]:
# bigram
# One list of adjacent (w1, w2) token pairs per sentence. Each sentence is
# processed on its own, so no pair spans two sentences.
bigrams_corpus = [[*ngrams(tokens, 2)] for tokens in tokenized_corpus]
bigrams_corpus

[[('the', 'cat'),
  ('cat', 'sat'),
  ('sat', 'on'),
  ('on', 'the'),
  ('the', 'mat'),
  ('mat', '.')],
 [('the', 'dog'),
  ('dog', 'sat'),
  ('sat', 'on'),
  ('on', 'the'),
  ('the', 'mat'),
  ('mat', '.')],
 [('the', 'cat'),
  ('cat', 'slept'),
  ('slept', 'on'),
  ('on', 'the'),
  ('the', 'mat'),
  ('mat', '.')],
 [('the', 'dog'),
  ('dog', 'slept'),
  ('slept', 'on'),
  ('on', 'the'),
  ('the', 'bed'),
  ('bed', '.')],
 [('the', 'cat'),
  ('cat', 'played'),
  ('played', 'with'),
  ('with', 'the'),
  ('the', 'toy'),
  ('toy', '.')],
 [('the', 'dog'),
  ('dog', 'played'),
  ('played', 'with'),
  ('with', 'the'),
  ('the', 'ball'),
  ('ball', '.')]]

In [14]:
# Flatten the per-sentence bigram lists and tally how often each pair
# occurs across the whole corpus.
bigram_freq = Counter(
    pair
    for sentence_pairs in bigrams_corpus
    for pair in sentence_pairs
)
bigram_freq

Counter({('the', 'cat'): 3,
         ('cat', 'sat'): 1,
         ('sat', 'on'): 2,
         ('on', 'the'): 4,
         ('the', 'mat'): 3,
         ('mat', '.'): 3,
         ('the', 'dog'): 3,
         ('dog', 'sat'): 1,
         ('cat', 'slept'): 1,
         ('slept', 'on'): 2,
         ('dog', 'slept'): 1,
         ('the', 'bed'): 1,
         ('bed', '.'): 1,
         ('cat', 'played'): 1,
         ('played', 'with'): 2,
         ('with', 'the'): 2,
         ('the', 'toy'): 1,
         ('toy', '.'): 1,
         ('dog', 'played'): 1,
         ('the', 'ball'): 1,
         ('ball', '.'): 1})

In [15]:
# trigram
# One list of adjacent (w1, w2, w3) token triples per sentence. Each sentence
# is processed on its own, so no triple spans two sentences.
trigrams_corpus = [[*ngrams(tokens, 3)] for tokens in tokenized_corpus]
trigrams_corpus

[[('the', 'cat', 'sat'),
  ('cat', 'sat', 'on'),
  ('sat', 'on', 'the'),
  ('on', 'the', 'mat'),
  ('the', 'mat', '.')],
 [('the', 'dog', 'sat'),
  ('dog', 'sat', 'on'),
  ('sat', 'on', 'the'),
  ('on', 'the', 'mat'),
  ('the', 'mat', '.')],
 [('the', 'cat', 'slept'),
  ('cat', 'slept', 'on'),
  ('slept', 'on', 'the'),
  ('on', 'the', 'mat'),
  ('the', 'mat', '.')],
 [('the', 'dog', 'slept'),
  ('dog', 'slept', 'on'),
  ('slept', 'on', 'the'),
  ('on', 'the', 'bed'),
  ('the', 'bed', '.')],
 [('the', 'cat', 'played'),
  ('cat', 'played', 'with'),
  ('played', 'with', 'the'),
  ('with', 'the', 'toy'),
  ('the', 'toy', '.')],
 [('the', 'dog', 'played'),
  ('dog', 'played', 'with'),
  ('played', 'with', 'the'),
  ('with', 'the', 'ball'),
  ('the', 'ball', '.')]]

In [16]:
# Flatten the per-sentence trigram lists and tally how often each triple
# occurs across the whole corpus.
trigram_freq = Counter(
    triple
    for sentence_triples in trigrams_corpus
    for triple in sentence_triples
)
trigram_freq

Counter({('the', 'cat', 'sat'): 1,
         ('cat', 'sat', 'on'): 1,
         ('sat', 'on', 'the'): 2,
         ('on', 'the', 'mat'): 3,
         ('the', 'mat', '.'): 3,
         ('the', 'dog', 'sat'): 1,
         ('dog', 'sat', 'on'): 1,
         ('the', 'cat', 'slept'): 1,
         ('cat', 'slept', 'on'): 1,
         ('slept', 'on', 'the'): 2,
         ('the', 'dog', 'slept'): 1,
         ('dog', 'slept', 'on'): 1,
         ('on', 'the', 'bed'): 1,
         ('the', 'bed', '.'): 1,
         ('the', 'cat', 'played'): 1,
         ('cat', 'played', 'with'): 1,
         ('played', 'with', 'the'): 2,
         ('with', 'the', 'toy'): 1,
         ('the', 'toy', '.'): 1,
         ('the', 'dog', 'played'): 1,
         ('dog', 'played', 'with'): 1,
         ('with', 'the', 'ball'): 1,
         ('the', 'ball', '.'): 1})

In [17]:
# model testing
# Estimate the probability of the next word given the two preceding words
# by maximum likelihood:
#   P(w3 | w1, w2) = count(w1, w2, w3) / count(w1, w2)
bigram = ('the', 'cat')              # context (w1, w2)
trigram = ('the', 'cat', 'slept')    # context followed by the candidate word

# P('slept' | 'the', 'cat'). A Counter returns 0 for an n-gram it has never
# seen, so guard the division for contexts that do not occur in the corpus.
context_count = bigram_freq[bigram]
prob_slept = trigram_freq[trigram] / context_count if context_count else 0.0
prob_slept

0.3333333333333333

In [18]:
# P('played' | 'the', 'cat') = count('the cat played') / count('the cat').
# The trigram is spelled out here so this cell does not depend on the
# 'slept' trigram defined for the previous estimate.
trigram_played = ('the', 'cat', 'played')
context_count_played = bigram_freq[trigram_played[:2]]
# A Counter returns 0 for unseen n-grams; avoid dividing by zero in that case.
prob_played = (
    trigram_freq[trigram_played] / context_count_played
    if context_count_played
    else 0.0
)
prob_played

0.3333333333333333