In [15]:
import numpy as np
import pandas as pd
import nltk
import string
from bs4 import BeautifulSoup

# Fetch the NLTK data packages this notebook asks for.
# 'punkt' is a tokenizer package (it unpacks under tokenizers/); presumably it backs
# nltk.tokenize.word_tokenize used further down — TODO confirm for the installed NLTK version.
nltk.download('punkt')
# NOTE(review): 'wordnet' is not referenced by any code visible in this notebook.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Load data

In [16]:
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

# Read the raw review markup inside a context manager so the file handle is
# closed as soon as the contents have been parsed (the bare open() leaked it).
with open('/content/gdrive/MyDrive/Colab Notebooks/lazyprogrammer/data/electronics/positive.review') as review_file:
  positive_reviews = BeautifulSoup(review_file.read(), features="html5lib")

# Keep only the text of every <review_text> element; find_all is the current
# Beautiful Soup spelling of the legacy findAll alias.
reviews = [r.text for r in positive_reviews.find_all('review_text')]

len(reviews)

Mounted at /content/gdrive/


1000

# Utils

In [69]:
def tokenize(txt):
  """Lower-case `txt` and split it into tokens with NLTK's word_tokenize.

  Returns whatever nltk.tokenize.word_tokenize returns for the lower-cased text.
  """
  lowered = txt.lower()
  return nltk.tokenize.word_tokenize(lowered)


def prity_print(list_of_tokens):
  """Print the given string tokens on a single line, separated by one space."""
  line = ' '.join(list_of_tokens)
  print(line)


def test_spinner(spin_model, change_prob=0.2):
    """Spin one randomly chosen review and print it before and after.

    Args:
        spin_model: object exposing generate(tokens, change_prob) that returns
            a list of string tokens.
        change_prob: probability forwarded to spin_model.generate.

    Reads the module-level `reviews` collection and uses `tokenize` to split
    the selected review.
    """
    sample = np.random.choice(reviews)
    print("Original:", sample.lower())

    spun_tokens = spin_model.generate(tokenize(sample), change_prob)
    print("Spun:")

    # Crude detokenization: glue punctuation back onto the neighbouring word.
    # The substitutions are applied in this exact order.
    spun_text = " ".join(spun_tokens)
    for separated, joined in ((" .", "."), (" '", "'"), (" ,", ","), ("$ ", "$"), (" !", "!")):
        spun_text = spun_text.replace(separated, joined)
    print(spun_text)

In [70]:
class ArticleSpinerLanguageModel():
  """Context-based language model used to "spin" (paraphrase) tokenized text.

  Given the tokens at positions t-1 and t+1, the token at position t is randomly
  replaced according to the empirical probability p(x(t) | x(t-1), x(t+1))
  estimated from the training sequences.
  """

  def __init__(self):
    # dict((token_t-1, token_t+1) -> dict(token_t -> probability)); None until fit() is called.
    self.A2 = None

  def fit(self, x):
    """Estimate p(x(t) | x(t-1), x(t+1)) from tokenized sequences.

    Args:
      x: non-empty collection of token sequences. Sequences with fewer than
        three tokens have no interior position and contribute nothing.

    Raises:
      AssertionError: if `x` is empty.
    """
    assert len(x) > 0

    # (token_t-1, token_t+1) -> every token observed between that pair.
    middles_by_context = {}
    for sequence in x:
      for t in range(1, len(sequence) - 1):
        context = (sequence[t-1], sequence[t+1])
        middles_by_context.setdefault(context, []).append(sequence[t])

    self.A2 = {
      context: self._calc_discrete_distribution(middles)
      for context, middles in middles_by_context.items()
    }


  def generate(self, article, change_prob):
    """Return a copy of `article` with some interior tokens re-sampled.

    The first and last tokens are always kept. Every interior token is, with
    probability `change_prob`, replaced by a token sampled from the learned
    distribution for its ORIGINAL neighbours (article[t-1], article[t+1]).
    If that neighbour pair was never seen during fit(), the original token is
    kept instead of failing.

    Args:
      article: sequence of tokens.
      change_prob: probability in [0, 1] of attempting a replacement at each
        interior position.

    Returns:
      A new list of tokens with the same length as `article`.

    Raises:
      RuntimeError: if the article has interior tokens and fit() was never called.
    """
    # Fewer than three tokens means there is nothing between a first and a last
    # token; return a plain copy (also avoids duplicating a lone token and
    # indexing into an empty article).
    if len(article) < 3:
      return list(article)

    if self.A2 is None:
      raise RuntimeError('Model is not fitted: call fit() before generate()')

    new_article = [article[0]]
    for t in range(1, len(article) - 1):
      token = article[t]
      if np.random.random() < change_prob:
        distribution = self.A2.get((article[t-1], article[t+1]))
        if distribution:
          token = self._cdf_inv(distribution)
      new_article.append(token)

    new_article.append(article[-1])
    return new_article


  def _calc_discrete_distribution(self, tokens: list):
    """Map each distinct token to its relative frequency in `tokens`.

    Keys appear in order of first occurrence; an empty list yields an empty dict.
    """
    n = len(tokens)
    counts = {}
    for token in tokens:
      counts[token] = counts.get(token, 0) + 1
    return {token: count / n for token, count in counts.items()}


  def _cdf_inv(self, discrete_distribution: dict, u=None):
    """Sample a token by inverting the cumulative distribution.

    Args:
      discrete_distribution: dict(token -> probability), expected to sum to 1.
      u: optional number in [0, 1) to use as the uniform draw; when omitted a
        fresh value is taken from np.random.random().

    Returns:
      The first token (in dict order) whose cumulative probability exceeds `u`.

    Raises:
      Exception: if the probabilities do not add up to (approximately) 1 and
        `u` falls beyond their total, or if the distribution is empty.
    """
    if u is None:
      u = np.random.random()

    cdf = 0.0
    token = None
    for token, prob in discrete_distribution.items():
      cdf += prob
      if u < cdf:
        return token

    # Floating-point rounding can leave the running total a hair below 1.0
    # (e.g. ten additions of 0.1); a draw landing in that gap belongs to the
    # last token rather than being an error.
    if np.isclose(cdf, 1.0):
      return token
    raise Exception('Unexpected line execution. Probably provided discrete distribution was not correct')

# Use ArticleSpinerLanguageModel to change article 

In [71]:
# Tokenize every review; each element of X_train is the token list of one review.
X_train = [tokenize(r) for r in reviews]

# Preview the first three tokenized reviews. A plain loop is used because the
# calls are made only for their printing side effect.
for tokens in X_train[:3]:
  prity_print(tokens)

i purchased this unit due to frequent blackouts in my area and 2 power supplies going bad . it will run my cable modem , router , pc , and lcd monitor for 5 minutes . this is more than enough time to save work and shut down . equally important , i know that my electronics are receiving clean power . i feel that this investment is minor compared to the loss of valuable data or the failure of equipment due to a power spike or an irregular power supply . as always , amazon had it to me in < 2 business days
i ordered 3 apc back-ups es 500s on the recommendation of an employee of mine who used to work at apc . i 've had them for about a month now without any problems . they 've functioned properly through a few unexpected power interruptions . i 'll gladly order more if the need arises . pros : - large plug spacing , good for power adapters - simple design - long cord cons : - no line conditioning ( usually an expensive option


In [74]:
# Build the (previous token, next token) -> middle-token distributions from
# all tokenized reviews in X_train.
spin_model = ArticleSpinerLanguageModel()
spin_model.fit(X_train)

In [75]:
# Spin one randomly chosen review with test_spinner's default change_prob (0.2).
test_spinner(spin_model)

Original: 
i was looking for an inexpensive, yet high quality, long lasting, and durable way to identify dozens upon dozens of electrical wires behind the instrument panel of my airplane. a fellow pilot made me aware of the family of small electronic labelers available in the marketplace. the brother 1750 fits my needs perfectly. offering two lines of crisp, highly readable print gives a professional appearance to all labeling needs. my experience is quite different than some reviewers who have complained about wasted tape. in fact, i found that little wasted tape is generated if you simply adjust the margins to a particular task and this option is clearly spelled out in the instruction booklet. since i purchased the 1750, i have found many uses for it around the home office and i wonder how i got by without one in the past! my only complaint about the keyboard is that among the characters, it does not offer a dash or hyphen, so when one is  required, i find myself using a star instead

In [76]:
# Another random sample: the review is picked with np.random.choice, so each run differs.
test_spinner(spin_model)

Original: 
seems nowadays everything uses aaa batteries so this 30 pack is perfect for just about anyone that needs them. you know who you are. the ones with the video games, remote controls, clocks, and whatever else you need. expecially good to get during the holidays, you should never be without

Spun:
seems nowadays everything uses aaa batteries so this 30 pack is perfect for just about anyone that needs perfectly. you know who you need. the ones with the new games, remote controls, confortable, and anything else you purchase. expecially good to get out the box, you could n't be without


In [77]:
# One more random review, spun with the default change_prob of 0.2.
test_spinner(spin_model)

Original: 
this mouse works perfectly fast and easy without problems even if you don't install the software.
if you need a cordless mouse this is what you have to bu

Spun:
this mouse works perfectly fast and music without problems even if they do n't install the noises. if you need a cordless mouse this is what you have to bu
