In [1]:
import gzip
import os
import sqlite3
from random import randrange
from tqdm.notebook import tqdm

In [None]:
# Uncomment the next line to delete the existing database and rebuild it from scratch.
#!rm ./3gram.db

In [2]:
# Open the SQLite database file used by the rest of the notebook.
db_path = "./3gram.db"
con = sqlite3.connect(db_path)

In [None]:
# You can download wp_3gram.txt.gz from https://nlp.cs.nyu.edu/wikipedia-data/ngram/wp_3gram.txt.gz
# More information is available at https://nlp.cs.nyu.edu/wikipedia-data/

def get_3grams(filename="./wp_3gram.txt.gz"):
    """Stream 3-gram records from a gzip-compressed, tab-separated text file.

    Every line is expected to look like ``count<TAB>w1<TAB>w2<TAB>w3``.
    While iterating, a tqdm progress bar shows how many *compressed* bytes
    of the file have been consumed so far.

    Args:
        filename: Path of the gzip file to read. Defaults to the 3-gram dump
            in the current working directory.

    Yields:
        Tuples ``(w1, w2, w3, count)``; ``count`` is the string exactly as
        read from the file.

    Raises:
        ValueError: If a line does not split into exactly four tab-separated
            fields.
    """
    size = os.path.getsize(filename)
    # Keep our own handle on the compressed file so its position can be
    # queried directly instead of reaching into the gzip wrapper's internals.
    with open(filename, "rb") as raw:
        with gzip.open(raw, "rt", encoding="utf-8") as f:
            # The context manager closes the bar when the generator is
            # exhausted, closed early, or aborted by an exception.
            with tqdm(total=size, unit='B', unit_scale=True, unit_divisor=1024) as pbar:
                for line in f:
                    count, w1, w2, w3 = line.rstrip('\n').split("\t")
                    yield (w1, w2, w3, count)
                    # tqdm.update() takes an increment, so pass the distance
                    # between the current file offset and what is displayed.
                    pbar.update(raw.tell() - pbar.n)

# The unique index is created last, so its presence marks a completed import.
# Checking for it makes this cell safe to re-run (e.g. Restart & Run All)
# instead of failing with "table data already exists" after a finished load.
import_done = con.execute(
    "SELECT 1 FROM sqlite_master WHERE type = 'index' AND name = 'data_w'"
).fetchone()

if import_done is None:
    # `with con` commits when the block succeeds and rolls back the pending
    # transaction if any statement raises.
    with con:
        con.execute("CREATE TABLE data(w1 text, w2 text, w3 text, count int)")
        con.executemany("INSERT INTO data(w1, w2, w3, count) VALUES (?, ?, ?, ?)", get_3grams())
        # Built after the bulk insert; also enforces one row per (w1, w2, w3).
        con.execute("CREATE UNIQUE INDEX data_w ON data (w1, w2, w3)")

**3-grams** are runs of three consecutive tokens. For example, the sentence "The quick brown fox jumps over the lazy dog" contains the following 3-grams:

1. The quick brown
2. quick brown fox
3. brown fox jumps
4. fox jumps over
5. jumps over the
6. over the lazy
7. the lazy dog

In [3]:
# Show five rows starting at offset 50,000,000 as a spot check of the table contents.
sample_cursor = con.execute("SELECT * FROM data LIMIT 5 OFFSET 50000000")
sample_cursor.fetchall()

[('and', 'intermediate', 'periods', 1),
 ('and', 'intermediate', 'categories', 2),
 ('and', 'intermediate', 'maintenance', 10),
 ('and', 'intermediate', 'speakers', 1),
 ('and', 'intermediate', 'championships', 1)]

In [4]:
# Number of rows in the `data` table.
count_cursor = con.execute("SELECT COUNT(*) FROM data")
count_cursor.fetchone()

(376671416,)

In [5]:
def next_token(con, w1, w2):
    """Sample a token to follow the context ``(w1, w2)``, weighted by count.

    Each row of ``data`` matching the context owns a half-open slice of
    ``[0, total)`` whose width equals its ``count``. A uniformly random
    integer is drawn from that range and the row whose slice contains it is
    returned, so a continuation is chosen with probability ``count / total``.

    Args:
        con: Open sqlite3 connection with a ``data(w1, w2, w3, count)`` table.
        w1: First token of the context.
        w2: Second token of the context.

    Returns:
        The ``w3`` value of the sampled row.

    Raises:
        LookupError: If no row with a positive total count matches
            ``(w1, w2)``, i.e. there is nothing to sample from.
    """
    total, = con.execute(
        "SELECT SUM(count) FROM data WHERE w1 = ? AND w2 = ?", (w1, w2)
    ).fetchone()
    # SUM() over zero rows is NULL (None); fail with a clear message instead
    # of letting randrange() choke on it.
    if not total:
        raise LookupError(f"no 3-gram in the table starts with ({w1!r}, {w2!r})")

    pick = randrange(total)
    # The explicit ROWS frame makes running_total advance row by row even if
    # two rows compare equal in the ORDER BY (the default RANGE frame would
    # give such peers the same total).
    row = con.execute(
        "SELECT w3 FROM ("
        "SELECT w3, count, SUM(count) OVER ("
        "ORDER BY w1, w2, w3 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW"
        ") AS running_total "
        "FROM data WHERE w1 = ? AND w2 = ?"
        ") WHERE running_total - count <= ? AND ? < running_total",
        (w1, w2, pick, pick),
    ).fetchone()
    return row[0]

def generate(prefix, n=20):
    """Print ``prefix`` and up to ``n`` sampled continuation tokens on one line.

    The last two entries of ``prefix`` form the initial context. Each step
    asks ``next_token`` (using the module-level ``con``) for a token, prints
    it, and slides the two-token context forward. Generation stops early
    right after the "#EOS#" token has been printed.

    Args:
        prefix: Sequence of tokens to start from; the last two are used as
            the sampling context.
        n: Maximum number of tokens to append.
    """
    # Echo the prompt, space-separated, without a trailing newline.
    for position, word in enumerate(prefix):
        print(word if position == 0 else ' ' + word, end='')

    context = (prefix[-2], prefix[-1])
    for _ in range(n):
        token = next_token(con, *context)
        print(' ' + token, end='')
        context = (context[1], token)
        if token == "#EOS#":
            break
    print()

In [6]:
# Print ten independently sampled continuations of the same prompt.
fox_prompt = ["The", "quick", "brown", "fox"]
for _ in range(10):
    generate(fox_prompt)

The quick brown fox jumps over the Philadelphia Athletics ( 1957 ) and for this . #EOS#
The quick brown fox jumped over one thousand eighty - year - old county . #EOS#
The quick brown fox jumps over the complexes via two receptor isoforms with different developmental stages that do occur for great personal valor ,
The quick brown fox jumps over the Mediterranean Sea . #EOS#
The quick brown fox . #EOS#
The quick brown fox jumps over a month after it was torn down for redevelopment . #EOS#
The quick brown fox jumps over a combined carrier if the 149 miles ( 3.1 ) #EOS#
The quick brown fox jumps over the barbarism that ranks second in the Dictionary of London campaignchecked February 2007 . #EOS#
The quick brown fox jumps over the world was unconsciously producing the film industry . #EOS#
The quick brown fox jumps over the PCIe version ) would visit Southgate to form Socony - Vacuum ( Hell Gate #EOS#


In [7]:
# Print ten independently sampled continuations of the same prompt.
nasa_prompt = ["What", "is", "NASA"]
for _ in range(10):
    generate(nasa_prompt)

What is NASA ' s proposition is true , or being in Sudan #EOS#
What is NASA ' s father purposely let a Jew and a PS1 conversion of abandoned buildings , sometimes called " Bunny "
What is NASA ' s Printer , SilenType , Scribe ( LSS ) at an unknown source #EOS#
What is NASA ' s opponent , who would have to believe that Dreier has a rich miser , counted separately . #EOS#
What is NASA astronaut . #EOS#
What is NASA ' s Faraday Award ( Canucks Unsung Hero ) . #EOS#
What is NASA ' s successor . #EOS#
What is NASA ' s 1.08 #EOS#
What is NASA ' s dissertation , MIT ) , featuring collaborations with artists such as counting , the 75th Grey Cup date
What is NASA ' s death in 1968 when Pierre Trudeau serving out his regiments to form a sub - Mendip , Compton
