In [1]:
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range

import nltk
import random
import numpy as np

from bs4 import BeautifulSoup

In [7]:
# Load the reviews.
# The `with` block closes the file handle as soon as the text has been read,
# instead of leaving it open until the garbage collector gets to it.
with open('data/positive.review') as review_file:
    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # optional parser is installed (and warns), so the resulting tree could
    # differ from machine to machine. 'html.parser' ships with Python.
    review_soup = BeautifulSoup(review_file.read(), 'html.parser')
positive_reviews = review_soup.find_all('review_text')

# Extract trigrams and insert into dictionary
# (w1, w3) is the key, [ w2 ] are the values
trigrams = {}
for review in positive_reviews:
    tokens = nltk.tokenize.word_tokenize(review.text.lower())
    # Slide a window of three consecutive tokens over the review; zip stops at
    # the shortest slice, so reviews with fewer than three tokens add nothing.
    for left, middle, right in zip(tokens, tokens[1:], tokens[2:]):
        trigrams.setdefault((left, right), []).append(middle)

# Turn each list of middle-words into a probability distribution
# (middle word -> relative frequency).
trigram_probabilities = {}
for context, middle_words in iteritems(trigrams):
    # Only keep contexts with at least two distinct middle words: a context
    # with a single candidate gives the spinner nothing to swap in.
    if len(set(middle_words)) > 1:
        counts = {}
        for word in middle_words:
            counts[word] = counts.get(word, 0) + 1
        total = len(middle_words)
        trigram_probabilities[context] = {
            word: float(count) / total for word, count in iteritems(counts)
        }


def random_sample(d):
    """Draw one key from ``d``, treating its values as probabilities.

    Walks the items in dictionary order, accumulating the probabilities, and
    returns the first key whose cumulative probability exceeds a uniform
    random draw from ``random.random()``.

    Args:
        d: dictionary mapping each candidate to its probability. The values
            are expected to sum to (approximately) 1.

    Returns:
        The sampled key, or ``None`` when ``d`` is empty.
    """
    r = random.random()
    cumulative = 0.0
    w = None  # stays None only when d is empty
    for w, p in d.items():
        cumulative += p
        if r < cumulative:
            return w
    # Floating-point rounding can leave the accumulated total a hair below
    # 1.0, so a draw very close to 1 may never satisfy `r < cumulative`.
    # Fall back to the last candidate instead of implicitly returning None.
    return w


def test_spinner(replacement_prob=0.2):
    """Spin one randomly chosen positive review and print before/after text.

    Picks a review from ``positive_reviews``, lowercases and tokenizes it, and
    then, for each interior token, with probability ``replacement_prob``
    replaces it by a word sampled from ``trigram_probabilities`` for the
    (previous token, next token) context. Contexts that are not in
    ``trigram_probabilities`` are left untouched.

    Args:
        replacement_prob: chance (0 to 1) that an interior token is considered
            for replacement. Defaults to 0.2, i.e. 20%.

    Returns:
        None. The original and spun texts are printed to stdout.
    """
    review = random.choice(positive_reviews)
    s = review.text.lower()
    print("Original:", s)
    tokens = nltk.tokenize.word_tokenize(s)
    # Tokens are replaced in place, so a substituted word becomes the left
    # context when the following position is considered.
    for i in range(len(tokens) - 2):
        if random.random() < replacement_prob:
            k = (tokens[i], tokens[i+2])
            if k in trigram_probabilities:
                w = random_sample(trigram_probabilities[k])
                # Never store a missing sample: a None in the token list
                # would make the join below raise TypeError.
                if w is not None:
                    tokens[i+1] = w
    print("Spun:")
    # Undo the spaces the tokenizer/join introduces around punctuation,
    # currency signs and contractions ("do n't" -> "don't").
    spun = " ".join(tokens)
    for spaced, joined in (
        (" .", "."),
        (" '", "'"),
        (" n't", "n't"),
        (" ,", ","),
        ("$ ", "$"),
        (" !", "!"),
    ):
        spun = spun.replace(spaced, joined)
    print(spun)


# Entry-point guard: run a single spin demo when this code executes as the
# main module rather than being imported.
if __name__ == '__main__':
    test_spinner()

Original: 
i can't believe the unbelievable prices of flash drives these days.  i just bought this one from wal-mart for $20.00 dollars.  that's way cheaper than on amazon.com.  plus i've noticed that amazon.com has switched to third retailers and some don't even ship to hawaii, or do ship to hawaii at a hefty price.  this switching of amazon having the lowest price to third retailers is really turning me off.  possibly even towards purchasing books. 

Spun:
i do n't used the unbelievable prices of flash drives these days. i also bought only one from wal-mart for $20.00 dollars. that's way cheaper than on amazon.com. plus i have noticed that amazon.com has switched to third retailers and some do n't even ship to hawaii, or do have to hawaii at a hefty price. this brand of amazon having the lowest price to third retailers is really turning me off. possibly even towards purchasing books.
