## Steps in Building a N grams model

- Tokenize the dataset and get n grams
- Count the n grams
- Compute N-gram probability
- Generate token based on the n-gram model



In [14]:
import random, textwrap
from collections import defaultdict, Counter

## Tokenize the dataset and get n-grams


In [2]:
dataset = "Happy families are all alike; every unhappy family is unhappy in its own way. Everything was in confusion in the Oblonskys’ house. The wife had discovered that the husband was carrying on an intrigue with a French girl, who had been a governess in their family, and she had announced to her husband that she could not go on living in the same house with him. This position of affairs had now lasted three days, and not only the husband and wife themselves, but all the members of their family and household, were painfully conscious of it. Every person in the house felt that there was no sense in their living together, and that the stray people brought together by chance in any inn had more in common with one another than they, the members of the family and household of the Oblonskys. The wife did not leave her own room, the husband had not been at home for three days. The children ran wild all over the house; the English governess quarreled with the housekeeper, and wrote to a friend asking her to look out for a new situation for her; the man-cook had walked off the day before just at dinner time; the kitchen-maid, and the coachman had given warning"
print(dataset)



In [3]:
def generate_ngrams(text:str, n:int) -> list[tuple[str]]:
  n_grams = []
  # Using basic space tokenizer
  text_tokens = text.lower().split(" ")
  length_of_tokens = len(text_tokens)

  for i in range(length_of_tokens-n+1):
    token = tuple(text_tokens[i:i+n])
    n_grams.append(token)

  return n_grams

generate_ngrams(dataset, 3)


[('happy', 'families', 'are'),
 ('families', 'are', 'all'),
 ('are', 'all', 'alike;'),
 ('all', 'alike;', 'every'),
 ('alike;', 'every', 'unhappy'),
 ('every', 'unhappy', 'family'),
 ('unhappy', 'family', 'is'),
 ('family', 'is', 'unhappy'),
 ('is', 'unhappy', 'in'),
 ('unhappy', 'in', 'its'),
 ('in', 'its', 'own'),
 ('its', 'own', 'way.'),
 ('own', 'way.', 'everything'),
 ('way.', 'everything', 'was'),
 ('everything', 'was', 'in'),
 ('was', 'in', 'confusion'),
 ('in', 'confusion', 'in'),
 ('confusion', 'in', 'the'),
 ('in', 'the', 'oblonskys’'),
 ('the', 'oblonskys’', 'house.'),
 ('oblonskys’', 'house.', 'the'),
 ('house.', 'the', 'wife'),
 ('the', 'wife', 'had'),
 ('wife', 'had', 'discovered'),
 ('had', 'discovered', 'that'),
 ('discovered', 'that', 'the'),
 ('that', 'the', 'husband'),
 ('the', 'husband', 'was'),
 ('husband', 'was', 'carrying'),
 ('was', 'carrying', 'on'),
 ('carrying', 'on', 'an'),
 ('on', 'an', 'intrigue'),
 ('an', 'intrigue', 'with'),
 ('intrigue', 'with', 'a'),
 

### Count the n grams




In [11]:
def generate_ngrams_count(dataset:list[tuple[str]], n:int) -> dict[str, Counter]:
  ngram_counts = defaultdict(Counter)
  number_of_tokens = len(dataset)

  for token in dataset:
    context = " ".join(token[:-1])
    next_token = token[-1]
    ngram_counts[context][next_token] +=1

  return dict(ngram_counts)



tokens = generate_ngrams(dataset, 3)
print(generate_ngrams_count(tokens, 3))



## Compute probabilty

In [16]:
def generate_ngram_model(dataset:str, n:int) -> dict[str, dict[str, float]]:
  ngram_model = {}
  ngrams = generate_ngrams(dataset, n)
  ngrams_count = generate_ngrams_count(ngrams, n)

  for context, next_token in ngrams_count.items():
    print(sum(next_token.values()))

new_dataset = """
Whose woods these are I think I know.
His house is in the village though;
He will not see me stopping here
To watch his woods fill up with snow.

My little horse must think it queer
To stop without a farmhouse near
Between the woods and frozen lake
The darkest evening of the year.

He gives his harness bells a shake
To ask if there is some mistake.
The only other sound’s the sweep
Of easy wind and downy flake.

The woods are lovely, dark and deep,
But I have promises to keep,
And miles to go before I sleep,
And miles to go before I sleep.
"""

new_dataset = textwrap.dedent(new_dataset)

generate_ngram_model(new_dataset, 3)

1
1
1
1
1
1
1
1
15
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
3
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
1
1
