# Unigram

In [None]:
from collections import defaultdict

# Sample corpus: a flat list of lowercase tokens (some tokens repeat)
words = (
    'fifth an of futures the an incorporated a '
    'a the inflation most dollars quarter in is '
    'mass thrift did eighty said hard m july bullish '
    'that or limited the'
).split()

# Function to build a Unigram model
def build_unigram_model(words):
    """Estimate unigram probabilities from a token sequence.

    Args:
        words: Sized iterable of hashable tokens (the corpus).

    Returns:
        dict mapping each distinct token to its relative frequency
        (occurrences / total number of tokens), in first-seen order.
        An empty corpus yields an empty dict.
    """
    corpus_size = len(words)
    tallies = {}
    for token in words:
        tallies[token] = tallies.get(token, 0) + 1
    # Relative frequency of each token within the corpus
    return {token: tally / corpus_size for token, tally in tallies.items()}

# Function to estimate the probability of a sentence
def estimate_sentence_probability(sentence, probabilities, unknown_prob=0.0):
    """Estimate the probability of a sentence under a unigram model.

    The sentence is split on whitespace and the per-token probabilities are
    multiplied together (tokens are treated as independent).

    Args:
        sentence: Whitespace-separated string of tokens.
        probabilities: Mapping from token to its unigram probability.
        unknown_prob: Probability assigned to tokens missing from
            ``probabilities``. The default of 0.0 keeps the original
            behaviour, where a single unseen token makes the whole
            sentence probability 0.0; pass a small positive value to
            avoid zeroing out sentences with out-of-vocabulary tokens.

    Returns:
        The product of the token probabilities as a float. A sentence
        with no tokens yields 1.0 (the empty product).
    """
    prob = 1.0
    # Use a distinct local name so the module-level ``words`` corpus is not shadowed.
    for token in sentence.split():
        prob *= probabilities.get(token, unknown_prob)
    return prob

# Build the Unigram model from the sample corpus
unigram_model = build_unigram_model(words)

# Test sentence: every token occurs in the corpus, so the product is non-zero
test_sentence = 'the an incorporated'
probability = estimate_sentence_probability(test_sentence, unigram_model)

print(f'The probability of the sentence "{test_sentence}" is: {probability}')

# Display individual word probabilities, one "word: probability" line per distinct token
print('\nWord Probabilities:')
for word, prob in unigram_model.items():
    print(f'{word}: {prob}')

The probability of the sentence "the an incorporated" is: 0.00024601254663987864

Word Probabilities:
fifth: 0.034482758620689655
an: 0.06896551724137931
of: 0.034482758620689655
futures: 0.034482758620689655
the: 0.10344827586206896
incorporated: 0.034482758620689655
a: 0.06896551724137931
inflation: 0.034482758620689655
most: 0.034482758620689655
dollars: 0.034482758620689655
quarter: 0.034482758620689655
in: 0.034482758620689655
is: 0.034482758620689655
mass: 0.034482758620689655
thrift: 0.034482758620689655
did: 0.034482758620689655
eighty: 0.034482758620689655
said: 0.034482758620689655
hard: 0.034482758620689655
m: 0.034482758620689655
july: 0.034482758620689655
bullish: 0.034482758620689655
that: 0.034482758620689655
or: 0.034482758620689655
limited: 0.034482758620689655


In [None]:
from collections import defaultdict

# Sample corpus: a flat list of lowercase tokens (some tokens repeat)
words = (
    'the quick brown fox '
    'jumps over the lazy dog '
    'the dog jumps over the fence'
).split()

# Function to build a Unigram model
def build_unigram_model(words):
    """Estimate unigram probabilities from a token sequence.

    Args:
        words: Sized iterable of hashable tokens (the corpus).

    Returns:
        dict mapping each distinct token to its relative frequency
        (occurrences / total number of tokens), in first-seen order.
        An empty corpus yields an empty dict.
    """
    corpus_size = len(words)
    tallies = {}
    for token in words:
        tallies[token] = tallies.get(token, 0) + 1
    # Relative frequency of each token within the corpus
    return {token: tally / corpus_size for token, tally in tallies.items()}

# Function to estimate the probability of a sentence
def estimate_sentence_probability(sentence, probabilities, unknown_prob=0.0):
    """Estimate the probability of a sentence under a unigram model.

    The sentence is split on whitespace and the per-token probabilities are
    multiplied together (tokens are treated as independent).

    Args:
        sentence: Whitespace-separated string of tokens.
        probabilities: Mapping from token to its unigram probability.
        unknown_prob: Probability assigned to tokens missing from
            ``probabilities``. The default of 0.0 keeps the original
            behaviour, where a single unseen token makes the whole
            sentence probability 0.0; pass a small positive value to
            avoid zeroing out sentences with out-of-vocabulary tokens.

    Returns:
        The product of the token probabilities as a float. A sentence
        with no tokens yields 1.0 (the empty product).
    """
    prob = 1.0
    # Use a distinct local name so the module-level ``words`` corpus is not shadowed.
    for token in sentence.split():
        prob *= probabilities.get(token, unknown_prob)
    return prob

# Build the Unigram model from the sample corpus
unigram_model = build_unigram_model(words)

# Test sentence: both tokens occur in the corpus, so the product is non-zero
test_sentence = 'the quick'
probability = estimate_sentence_probability(test_sentence, unigram_model)

print(f'The probability of the sentence "{test_sentence}" is: {probability}')

# Display individual word probabilities, one "word: probability" line per distinct token
print('\nWord Probabilities:')
for word, prob in unigram_model.items():
    print(f'{word}: {prob}')

The probability of the sentence "the quick" is: 0.017777777777777778

Word Probabilities:
the: 0.26666666666666666
quick: 0.06666666666666667
brown: 0.06666666666666667
fox: 0.06666666666666667
jumps: 0.13333333333333333
over: 0.13333333333333333
lazy: 0.06666666666666667
dog: 0.13333333333333333
fence: 0.06666666666666667


# Bigram

In [None]:
from collections import defaultdict

# Sample corpus: a flat list of lowercase tokens (some tokens repeat)
words = (
    'fifth an of futures the an incorporated a '
    'a the inflation most dollars quarter in is '
    'mass thrift did eighty said hard m july bullish '
    'that or limited the'
).split()

# Function to build a Bigram model
def build_bigram_model(words):
  """Estimate next-word probabilities from adjacent token pairs.

  Args:
    words: Sequence of hashable tokens (the corpus).

  Returns:
    dict mapping each context word to a dict of
    {following word: count(context, following) / count(context as a
    first element of a pair)}. The last token of the corpus only
    appears as a context if it also occurs earlier. Fewer than two
    tokens yield an empty dict.
  """
  followers_of = {}
  # Walk over every adjacent (context, follower) pair in corpus order
  for context, follower in zip(words, words[1:]):
    tallies = followers_of.setdefault(context, {})
    tallies[follower] = tallies.get(follower, 0) + 1

  model = {}
  for context, tallies in followers_of.items():
    # Number of times the context word started a pair
    context_total = sum(tallies.values())
    model[context] = {
        follower: tally / context_total for follower, tally in tallies.items()
    }
  return model

# Function to predict the next word
def predict_next_word(previous_word, bigram_probabilities):
  """Return the most probable word to follow ``previous_word``.

  Args:
    previous_word: The context word to look up.
    bigram_probabilities: Mapping of context word ->
      {following word: probability}.

  Returns:
    The following word with the highest probability (the first one
    encountered wins ties), or None when ``previous_word`` is not a
    known context.
  """
  if previous_word not in bigram_probabilities:
    return None  # No prediction available
  followers = bigram_probabilities[previous_word]
  return max(followers, key=followers.get)

# Build the Bigram model from the sample corpus
bigram_model = build_bigram_model(words)

# Test the prediction: in this corpus 'in' is only ever followed by 'is'
previous_word = 'in'
predicted_word = predict_next_word(previous_word, bigram_model)

print(f"Given the word '{previous_word}', the predicted next word is: '{predicted_word}'")

# Display bigram probabilities as P(next | previous), rounded to 4 decimals
print('\nBigram Probabilities:')
for first_word, second_words in bigram_model.items():
  for second_word, prob in second_words.items():
    print(f"P({second_word} | {first_word}) = {prob:.4f}")

Given the word 'in', the predicted next word is: 'is'

Bigram Probabilities:
P(an | fifth) = 1.0000
P(of | an) = 0.5000
P(incorporated | an) = 0.5000
P(futures | of) = 1.0000
P(the | futures) = 1.0000
P(an | the) = 0.5000
P(inflation | the) = 0.5000
P(a | incorporated) = 1.0000
P(a | a) = 0.5000
P(the | a) = 0.5000
P(most | inflation) = 1.0000
P(dollars | most) = 1.0000
P(quarter | dollars) = 1.0000
P(in | quarter) = 1.0000
P(is | in) = 1.0000
P(mass | is) = 1.0000
P(thrift | mass) = 1.0000
P(did | thrift) = 1.0000
P(eighty | did) = 1.0000
P(said | eighty) = 1.0000
P(hard | said) = 1.0000
P(m | hard) = 1.0000
P(july | m) = 1.0000
P(bullish | july) = 1.0000
P(that | bullish) = 1.0000
P(or | that) = 1.0000
P(limited | or) = 1.0000
P(the | limited) = 1.0000


In [None]:
from collections import defaultdict

# Sample corpus: a flat list of lowercase tokens (some tokens repeat)
words = (
    'the quick brown fox '
    'jumps over the lazy dog '
    'the dog jumps over the fence'
).split()

# Function to build a Bigram model
def build_bigram_model(words):
  """Estimate next-word probabilities from adjacent token pairs.

  Args:
    words: Sequence of hashable tokens (the corpus).

  Returns:
    dict mapping each context word to a dict of
    {following word: count(context, following) / count(context as a
    first element of a pair)}. The last token of the corpus only
    appears as a context if it also occurs earlier. Fewer than two
    tokens yield an empty dict.
  """
  followers_of = {}
  # Walk over every adjacent (context, follower) pair in corpus order
  for context, follower in zip(words, words[1:]):
    tallies = followers_of.setdefault(context, {})
    tallies[follower] = tallies.get(follower, 0) + 1

  model = {}
  for context, tallies in followers_of.items():
    # Number of times the context word started a pair
    context_total = sum(tallies.values())
    model[context] = {
        follower: tally / context_total for follower, tally in tallies.items()
    }
  return model

# Function to predict the next word
def predict_next_word(previous_word, bigram_probabilities):
  """Return the most probable word to follow ``previous_word``.

  Args:
    previous_word: The context word to look up.
    bigram_probabilities: Mapping of context word ->
      {following word: probability}.

  Returns:
    The following word with the highest probability (the first one
    encountered wins ties), or None when ``previous_word`` is not a
    known context.
  """
  if previous_word not in bigram_probabilities:
    return None  # No prediction available
  followers = bigram_probabilities[previous_word]
  return max(followers, key=followers.get)

# Build the Bigram model from the sample corpus
bigram_model = build_bigram_model(words)

# Test the prediction: in this corpus 'brown' is only ever followed by 'fox'
previous_word = 'brown'
predicted_word = predict_next_word(previous_word, bigram_model)

print(f"Given the word '{previous_word}', the predicted next word is: '{predicted_word}'")

# Display bigram probabilities as P(next | previous), rounded to 4 decimals
print('\nBigram Probabilities:')
for first_word, second_words in bigram_model.items():
  for second_word, prob in second_words.items():
    print(f"P({second_word} | {first_word}) = {prob:.4f}")

Given the word 'brown', the predicted next word is: 'fox'

Bigram Probabilities:
P(quick | the) = 0.2500
P(lazy | the) = 0.2500
P(dog | the) = 0.2500
P(fence | the) = 0.2500
P(brown | quick) = 1.0000
P(fox | brown) = 1.0000
P(jumps | fox) = 1.0000
P(over | jumps) = 1.0000
P(the | over) = 1.0000
P(dog | lazy) = 1.0000
P(the | dog) = 0.5000
P(jumps | dog) = 0.5000


# N-gram

In [None]:
from collections import defaultdict
import nltk
# Fetch the Punkt tokenizer data before nltk.word_tokenize is called.
# NOTE(review): newer NLTK releases reportedly look up a 'punkt_tab' resource
# for word_tokenize instead — confirm against the installed NLTK version.
nltk.download('punkt')

# Sample sentence that serves as the whole corpus for the N-gram models
sentence = 'the cat sat on the mat'

# Function to generate N-grams
def generate_ngrams(words, n):
  """List every contiguous run of ``n`` tokens in ``words``.

  Args:
    words: Sliceable sequence of tokens.
    n: Length of each N-gram.

  Returns:
    list of tuples, one per window position from left to right.
    Empty when ``n`` is larger than ``len(words)``.
  """
  window_count = len(words) - n + 1
  return [tuple(words[start:start + n]) for start in range(window_count)]

# Function to build N-gram model
def build_ngram_model(words, n):
  """Compute the relative frequency of every N-gram in ``words``.

  Args:
    words: Sequence of tokens, passed through to generate_ngrams.
    n: Length of each N-gram.

  Returns:
    dict mapping each distinct N-gram tuple to its share of all
    N-grams of that length (count / total N-grams), in first-seen
    order. These are joint relative frequencies, not probabilities
    conditioned on the preceding tokens. Empty when no N-gram of
    length ``n`` exists.
  """
  tallies = {}
  for gram in generate_ngrams(words, n):
    tallies[gram] = tallies.get(gram, 0) + 1

  # Total number of N-grams observed (repeats included)
  observed = sum(tallies.values())
  return {gram: tally / observed for gram, tally in tallies.items()}

# Tokenize the sentence into a list of word tokens
words = nltk.word_tokenize(sentence)

# Build N-gram models for n = 1, 2, 3; each maps an n-gram tuple to its
# share of all n-grams of that length
unigram_model = build_ngram_model(words, 1)
bigram_model = build_ngram_model(words, 2)
trigram_model = build_ngram_model(words, 3)

# Display N-gram probabilities (keys print as tuples), rounded to 4 decimals
print('Unigram Probabilities:')
for unigram, prob in unigram_model.items():
  print(f'P({unigram}) = {prob:.4f}')

print('\nBigram Probabilities:')
for bigram, prob in bigram_model.items():
  print(f'P({bigram}) = {prob:.4f}')

print('\nTrigram Probabilities:')
for trigram, prob in trigram_model.items():
  print(f'P({trigram}) = {prob:.4f}')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unigram Probabilities:
P(('the',)) = 0.3333
P(('cat',)) = 0.1667
P(('sat',)) = 0.1667
P(('on',)) = 0.1667
P(('mat',)) = 0.1667

Bigram Probabilities:
P(('the', 'cat')) = 0.2000
P(('cat', 'sat')) = 0.2000
P(('sat', 'on')) = 0.2000
P(('on', 'the')) = 0.2000
P(('the', 'mat')) = 0.2000

Trigram Probabilities:
P(('the', 'cat', 'sat')) = 0.2500
P(('cat', 'sat', 'on')) = 0.2500
P(('sat', 'on', 'the')) = 0.2500
P(('on', 'the', 'mat')) = 0.2500


# Count Keywords

In [None]:
from collections import defaultdict
import re

# Sample sentences from the Berkeley Restaurant Project, kept as raw text
# (mixed case, with punctuation)
sentences = [
    "Can you tell me about any good Cantonese restaurants close by?",
    "Mid-priced Thai food is what I'm looking for.",
    "Tell me about Chez Panisse.",
    "Can you give me a listing of the kinds of food that are available?",
    "I'm looking for a good place to eat breakfast.",
    "When is Caffe Venezia open during the day?"
]

# Function to count keywords
def count_keywords(sentences, keywords=None):
  """Count how often each keyword occurs across the given sentences.

  Every sentence is lowercased, stripped of punctuation and split on
  whitespace. A token is counted only when it equals a keyword exactly,
  so inflected forms such as "restaurants" do not match "restaurant".

  Args:
    sentences: Iterable of sentence strings.
    keywords: Optional iterable of lowercase keyword strings to look
      for. When omitted, the built-in restaurant vocabulary is used
      (restaurant, food, breakfast, lunch, dinner, cuisine, menu).

  Returns:
    defaultdict(int) mapping each keyword that occurred at least once
    to its number of occurrences, in first-seen order.
  """
  if keywords is None:
    keywords = ('restaurant', 'food', 'breakfast', 'lunch', 'dinner', 'cuisine', 'menu')
  # A set gives constant-time membership tests inside the token loop
  wanted = frozenset(keywords)
  keyword_count = defaultdict(int)

  for sentence in sentences:
    # Normalize the sentence to lowercase and remove punctuation
    cleaned_sentence = re.sub(r'[^\w\s]', '', sentence.lower())
    for word in cleaned_sentence.split():
      if word in wanted:
        keyword_count[word] += 1

  return keyword_count

# Count keywords across the sample sentences
keyword_counts = count_keywords(sentences)

# Display keyword counts; only keywords that occurred at least once are listed
for keyword, count in keyword_counts.items():
  print(f'{keyword}: {count}')

food: 2
breakfast: 1
