# Imports And Downloads

In [1]:
import sys
import subprocess

# Install NLTK through pip, launched with the same interpreter that runs this notebook
# (sys.executable) so the package lands in the active environment.
pip_install_command = [sys.executable, "-m", "pip", "install", "nltk"]
subprocess.check_call(pip_install_command)

0

In [2]:
import nltk
import sys

# Fetch the Punkt tokenizer data that word_tokenize depends on.
# Recent NLTK releases (3.9 and later) load the "punkt_tab" resource instead of the
# pickled "punkt" models, so request both to work with old and new NLTK versions.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

True

In [3]:
import sys
import subprocess

# Install nltk *before* importing it, so this cell also works on its own in a fresh
# environment where the package is not present yet.
subprocess.check_call([sys.executable, "-m", "pip", "install", "nltk"])

import nltk

# Download tokenizer data. Recent NLTK releases (3.9 and later) make word_tokenize load
# "punkt_tab" instead of the pickled "punkt" models, so fetch both resources.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

True

In [4]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams, everygrams
from collections import Counter

# Text Cleaning 

In [5]:
def clean_text(text, lowercase=True, remove_punct=True):
    """Normalise ``text`` before it is split into tokens.

    Args:
        text: The string to clean.
        lowercase: When true, convert the text to lower case.
        remove_punct: When true, delete every character that is neither a
            word character (``\\w``) nor whitespace (``\\s``).

    Returns:
        The cleaned string. Whitespace is left untouched, so removing a
        punctuation mark that sat between two spaces leaves both spaces.
    """
    normalized = text.lower() if lowercase else text
    if not remove_punct:
        return normalized
    return re.sub(r"[^\w\s]", "", normalized)

# Pure Python N-Grams

In [6]:
def generate_ngrams_python(text, n, lowercase=True, remove_punct=True):
    """Build word-level n-grams from ``text`` using only built-in string operations.

    The text is normalised with ``clean_text`` and split on whitespace; every
    run of ``n`` consecutive words is then joined with single spaces.

    Args:
        text: The input string.
        n: Number of words per n-gram; must be at least 1.
        lowercase: Forwarded to ``clean_text``.
        remove_punct: Forwarded to ``clean_text``.

    Returns:
        A list of n-gram strings in order of occurrence. The list is empty
        when the text contains fewer than ``n`` words.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # Without this guard n == 0 yields len(words) + 1 empty strings and a
    # negative n yields meaningless slices instead of an error.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    words = clean_text(text, lowercase, remove_punct).split()
    return [" ".join(words[start:start + n]) for start in range(len(words) - n + 1)]

# Example

In [7]:
# Sample passage (a paragraph about inflation) used as the demo input.
text = "In economics, inflation (or less frequently, price inflation) is a general rise in the price level in an economy over a period of time, resulting in a sustained drop in the purchasing power of money. When the general price level rises, each unit of currency buys fewer goods and services; consequently, inflation reflects a reduction in the purchasing power per unit of money – a loss of real value in the medium of exchange and unit of account within the economy. The opposite of inflation is deflation, a sustained decrease in the general price level of goods and services. The common measure of inflation is the inflation rate, the annualized percentage change in a general price index, usually the consumer price index, over time. Economists generally believe that very high rates of inflation and hyperinflation are harmful, and are caused by an excessive growth of the money supply"

# Build trigrams (n=3) with the pure-Python generator, keeping its default cleaning
# options (lowercasing and punctuation removal).
python_trigrams = generate_ngrams_python(text, 3)
# Bare expression: in a notebook this displays the list as the cell output.
python_trigrams

['in economics inflation',
 'economics inflation or',
 'inflation or less',
 'or less frequently',
 'less frequently price',
 'frequently price inflation',
 'price inflation is',
 'inflation is a',
 'is a general',
 'a general rise',
 'general rise in',
 'rise in the',
 'in the price',
 'the price level',
 'price level in',
 'level in an',
 'in an economy',
 'an economy over',
 'economy over a',
 'over a period',
 'a period of',
 'period of time',
 'of time resulting',
 'time resulting in',
 'resulting in a',
 'in a sustained',
 'a sustained drop',
 'sustained drop in',
 'drop in the',
 'in the purchasing',
 'the purchasing power',
 'purchasing power of',
 'power of money',
 'of money when',
 'money when the',
 'when the general',
 'the general price',
 'general price level',
 'price level rises',
 'level rises each',
 'rises each unit',
 'each unit of',
 'unit of currency',
 'of currency buys',
 'currency buys fewer',
 'buys fewer goods',
 'fewer goods and',
 'goods and services',
 'and services consequently',
 ...]

# NLTK Word N-Grams

In [8]:
def generate_ngrams_nltk(text, n, lowercase=True, remove_punct=True):
    """Build word-level n-grams from ``text`` with NLTK.

    The text is normalised with ``clean_text``, tokenised with NLTK's
    ``word_tokenize`` and windowed with ``nltk.util.ngrams``.

    Args:
        text: The input string.
        n: Number of tokens per n-gram.
        lowercase: Forwarded to ``clean_text``.
        remove_punct: Forwarded to ``clean_text``.

    Returns:
        A list of strings, each the tokens of one n-gram joined by single
        spaces, in the order NLTK yields them.
    """
    normalized = clean_text(text, lowercase, remove_punct)
    token_windows = ngrams(word_tokenize(normalized), n)
    return list(map(" ".join, token_windows))

# Example

In [9]:
# Same sample passage (a paragraph about inflation) as the demo input.
text = "In economics, inflation (or less frequently, price inflation) is a general rise in the price level in an economy over a period of time, resulting in a sustained drop in the purchasing power of money. When the general price level rises, each unit of currency buys fewer goods and services; consequently, inflation reflects a reduction in the purchasing power per unit of money – a loss of real value in the medium of exchange and unit of account within the economy. The opposite of inflation is deflation, a sustained decrease in the general price level of goods and services. The common measure of inflation is the inflation rate, the annualized percentage change in a general price index, usually the consumer price index, over time. Economists generally believe that very high rates of inflation and hyperinflation are harmful, and are caused by an excessive growth of the money supply"

# Build trigrams (n=3) with the NLTK-based generator, keeping its default cleaning
# options (lowercasing and punctuation removal).
nltk_trigrams = generate_ngrams_nltk(text, 3)
# Bare expression: in a notebook this displays the list as the cell output.
nltk_trigrams

['in economics inflation',
 'economics inflation or',
 'inflation or less',
 'or less frequently',
 'less frequently price',
 'frequently price inflation',
 'price inflation is',
 'inflation is a',
 'is a general',
 'a general rise',
 'general rise in',
 'rise in the',
 'in the price',
 'the price level',
 'price level in',
 'level in an',
 'in an economy',
 'an economy over',
 'economy over a',
 'over a period',
 'a period of',
 'period of time',
 'of time resulting',
 'time resulting in',
 'resulting in a',
 'in a sustained',
 'a sustained drop',
 'sustained drop in',
 'drop in the',
 'in the purchasing',
 'the purchasing power',
 'purchasing power of',
 'power of money',
 'of money when',
 'money when the',
 'when the general',
 'the general price',
 'general price level',
 'price level rises',
 'level rises each',
 'rises each unit',
 'each unit of',
 'unit of currency',
 'of currency buys',
 'currency buys fewer',
 'buys fewer goods',
 'fewer goods and',
 'goods and services',
 'and services consequently',
 ...]

# All-Grams (1 → N)

In [10]:
# ---------- Everygrams (1 → N) ----------
def generate_everygrams(text, max_n=4, lowercase=True, remove_punct=True):
    """Build every word-level n-gram of ``text`` up to ``max_n`` tokens long.

    The text is normalised with ``clean_text``, tokenised with NLTK's
    ``word_tokenize`` and passed to ``nltk.util.everygrams`` with
    ``max_len=max_n``.

    Args:
        text: The input string.
        max_n: Largest n-gram length to produce.
        lowercase: Forwarded to ``clean_text``.
        remove_punct: Forwarded to ``clean_text``.

    Returns:
        A list of strings, each the tokens of one gram joined by single
        spaces, in the order NLTK's ``everygrams`` yields them.
    """
    normalized = clean_text(text, lowercase, remove_punct)
    token_list = word_tokenize(normalized)
    all_grams = everygrams(token_list, max_len=max_n)
    return [" ".join(token_group) for token_group in all_grams]

# Example

In [11]:
# Same sample passage (a paragraph about inflation) as the demo input.
text = "In economics, inflation (or less frequently, price inflation) is a general rise in the price level in an economy over a period of time, resulting in a sustained drop in the purchasing power of money. When the general price level rises, each unit of currency buys fewer goods and services; consequently, inflation reflects a reduction in the purchasing power per unit of money – a loss of real value in the medium of exchange and unit of account within the economy. The opposite of inflation is deflation, a sustained decrease in the general price level of goods and services. The common measure of inflation is the inflation rate, the annualized percentage change in a general price index, usually the consumer price index, over time. Economists generally believe that very high rates of inflation and hyperinflation are harmful, and are caused by an excessive growth of the money supply"

# Build all grams up to four tokens long (max_n=4) with the default cleaning options.
all_everygrams = generate_everygrams(text, max_n=4)
# Bare expression: in a notebook this displays the list as the cell output.
all_everygrams

['in',
 'in economics',
 'in economics inflation',
 'in economics inflation or',
 'economics',
 'economics inflation',
 'economics inflation or',
 'economics inflation or less',
 'inflation',
 'inflation or',
 'inflation or less',
 'inflation or less frequently',
 'or',
 'or less',
 'or less frequently',
 'or less frequently price',
 'less',
 'less frequently',
 'less frequently price',
 'less frequently price inflation',
 'frequently',
 'frequently price',
 'frequently price inflation',
 'frequently price inflation is',
 'price',
 'price inflation',
 'price inflation is',
 'price inflation is a',
 'inflation',
 'inflation is',
 'inflation is a',
 'inflation is a general',
 'is',
 'is a',
 'is a general',
 'is a general rise',
 'a',
 'a general',
 'a general rise',
 'a general rise in',
 'general',
 'general rise',
 'general rise in',
 'general rise in the',
 'rise',
 'rise in',
 'rise in the',
 'rise in the price',
 'in',
 'in the',
 'in the price',
 'in the price level',
 'the',
 'the price',
 ...]

# N-Gram Frequency

In [12]:
def ngram_frequency(ngrams_list, top_k=10):
    """Count how often each n-gram occurs and return the most frequent ones.

    Args:
        ngrams_list: Iterable of hashable n-grams (for example the strings
            produced by the n-gram generators in this notebook).
        top_k: Maximum number of entries to return; passed straight to
            ``Counter.most_common``.

    Returns:
        A list of ``(ngram, count)`` tuples ordered from most to least
        frequent.
    """
    tally = Counter(ngrams_list)
    return tally.most_common(top_k)

# Example

In [14]:
# NOTE: `text` is re-assigned here but not read in this cell; the frequencies below are
# computed from `nltk_trigrams`, which was built in an earlier cell.
text = "In economics, inflation (or less frequently, price inflation) is a general rise in the price level in an economy over a period of time, resulting in a sustained drop in the purchasing power of money. When the general price level rises, each unit of currency buys fewer goods and services; consequently, inflation reflects a reduction in the purchasing power per unit of money – a loss of real value in the medium of exchange and unit of account within the economy. The opposite of inflation is deflation, a sustained decrease in the general price level of goods and services. The common measure of inflation is the inflation rate, the annualized percentage change in a general price index, usually the consumer price index, over time. Economists generally believe that very high rates of inflation and hyperinflation are harmful, and are caused by an excessive growth of the money supply"

# Rank the NLTK trigrams by frequency and keep the 10 most common.
top_trigrams = ngram_frequency(nltk_trigrams, top_k=10)
# Bare expression: in a notebook this displays the (trigram, count) pairs as the cell output.
top_trigrams

[('in the purchasing', 2),
 ('the purchasing power', 2),
 ('the general price', 2),
 ('general price level', 2),
 ('goods and services', 2),
 ('of inflation is', 2),
 ('in economics inflation', 1),
 ('economics inflation or', 1),
 ('inflation or less', 1),
 ('or less frequently', 1)]