In [1]:
import pandas as pd
import tensorflow_datasets as tfds
import re
from nltk import sent_tokenize, word_tokenize
import nltk
nltk.download('punkt')
from collections import defaultdict
from nltk import bigrams, trigrams
import random

def format_and_clean(text):
    """Turn one raw review into a list of tokenized sentences.

    Args:
        text: the review as UTF-8 encoded bytes.

    Returns:
        A list with one entry per sentence (as split by ``sent_tokenize``),
        each entry being the list of tokens produced by ``word_tokenize``.
    """
    lowered = text.decode('utf-8').lower()

    # Replace HTML-like tags with a space, then drop a fixed set of
    # punctuation marks before tokenizing.
    without_tags = re.sub('<.*?>', ' ', lowered)
    cleaned = re.sub('[,!¡¿?"]', '', without_tags)

    return list(map(word_tokenize, sent_tokenize(cleaned)))

[nltk_data] Downloading package punkt to /home/ddellera/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Working with the full IMDB corpus

# Split the training set into 60% and 40%, so we'll end up with 15,000 examples
# for training, 10,000 examples for validation and 25,000 examples for testing.
# NOTE(review): `Split.subsplit` looks like a legacy TFDS API; newer releases
# appear to expect slicing strings such as "train[:60%]" -- confirm against the
# installed tensorflow_datasets version before upgrading.
train_validation_split = tfds.Split.TRAIN.subsplit([6, 4])

(train_data, validation_data), test_data = tfds.load(
    name="imdb_reviews",
    split=(train_validation_split, tfds.Split.TEST),
    as_supervised=True)

# Materialize each split as a DataFrame with the review text and its label.
df_train = pd.DataFrame(list(tfds.as_numpy(train_data)), columns=['texto', 'clase'])
df_dev = pd.DataFrame(list(tfds.as_numpy(validation_data)), columns=['texto', 'clase'])
df_test = pd.DataFrame(list(tfds.as_numpy(test_data)), columns=['texto', 'clase'])

# Each review becomes a list of tokenized sentences; the corpus is the flat
# list of all sentences. A nested comprehension flattens in a single pass,
# whereas sum(lists, []) re-copies the accumulated list on every addition.
X_train_text = list(df_train.texto.apply(format_and_clean))
corpusIMDB = [sentence for review in X_train_text for sentence in review]

X_dev_text = list(df_dev.texto.apply(format_and_clean))
corpusIMDB_dev = [sentence for review in X_dev_text for sentence in review]


In [3]:
# Working with smaller corpora

nltk.download('brown')
nltk.download('reuters')
nltk.download('inaugural')
from nltk.corpus import brown, inaugural, reuters

# One sentence view per corpus ...
corpusBrown = brown.sents()
corpusReuters = reuters.sents()
corpusInagural = inaugural.sents()
# ... and the three of them concatenated into a single corpus.
corpusNLTK = corpusBrown + corpusReuters + corpusInagural


[nltk_data] Downloading package brown to /home/ddellera/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package reuters to /home/ddellera/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package inaugural to
[nltk_data]     /home/ddellera/nltk_data...
[nltk_data]   Unzipping corpora/inaugural.zip.


In [4]:
def train3gram(corpus):
    """Estimate a trigram model from a corpus of tokenized sentences.

    Args:
        corpus: iterable of sentences, each an iterable of tokens.

    Returns:
        A nested defaultdict mapping ``(w1, w2)`` to ``{w3: probability}``,
        where each probability is the relative frequency of ``w3`` after the
        pair ``(w1, w2)``. Sentences are padded on both sides, so padding
        tokens appear in the keys. Missing entries default to 0.
    """
    table = defaultdict(lambda: defaultdict(lambda: 0))

    # First pass: raw counts of each third word per two-word context.
    for sentence in corpus:
        for first, second, third in trigrams(sentence, pad_right=True, pad_left=True):
            table[(first, second)][third] += 1

    # Second pass: turn the counts of every context into relative frequencies.
    for followers in table.values():
        context_total = float(sum(followers.values()))
        for word in followers:
            followers[word] /= context_total

    return table

def generateSentence(model, start, max_len=15):
    """Sample a sentence from a trigram model and print it.

    Starting from the tokens in ``start``, the last two tokens are looked up
    in ``model`` and the next token is drawn by walking the cumulative
    probabilities of the candidates until they reach a uniform random
    threshold. Generation stops when the text ends with two ``None`` padding
    tokens, when it reaches ``max_len`` tokens, or when the current context
    has no candidate with positive probability.

    Args:
        model: mapping ``(w1, w2) -> {w3: probability}`` as produced by
            ``train3gram``. It is only read (no entries are inserted).
        start: list of seed tokens. It is copied, not modified.
        max_len: generation stops once the token list reaches this length.

    Returns:
        None. The generated tokens, with ``None`` padding and empty tokens
        dropped, are printed joined by single spaces.
    """
    # Work on a copy so the caller's seed list is not extended in place.
    text = list(start)

    while True:
        # .get() instead of indexing: indexing a defaultdict model would
        # insert an empty entry for every unseen context.
        followers = model.get(tuple(text[-2:]))
        if not followers:
            # Unseen context: nothing to sample from, so stop instead of
            # looping forever without ever appending a token.
            break

        # select a random probability threshold
        r = random.random()
        accumulator = .0
        next_word = None
        found = False  # separate flag: None is itself a valid next token

        for w, p in followers.items():
            if p <= 0:
                # Zero-mass entries can never be the sampled token.
                continue
            accumulator += p
            # Remember the latest candidate so that a rounding shortfall
            # (probabilities summing to slightly under r) still yields a token.
            next_word, found = w, True
            # select the first word whose cumulative mass reaches the threshold
            if accumulator >= r:
                break

        if not found:
            # Context exists but carries no probability mass: dead end.
            break
        text.append(next_word)

        # >= rather than ==, so a seed longer than max_len still terminates.
        if text[-2:] == [None, None] or len(text) >= max_len:
            break

    print(' '.join([t for t in text if t]))

In [5]:
# Fit one trigram model per corpus (evaluated left to right, in this order).
modelIMDB, modelNLTK, modelBrown, modelReuters, modelInagural = map(
    train3gram,
    (corpusIMDB, corpusNLTK, corpusBrown, corpusReuters, corpusInagural),
)

# Sample one sentence from each model, always seeded with the same two words.
# A fresh seed list is built for every call.
for trained_model in (modelIMDB, modelNLTK, modelBrown, modelReuters, modelInagural):
    generateSentence(trained_model, ['the', 'man'])


the man and petty thief from the uk the original one but the distributors only
the man , well , what is possibly no more water '' .
the man had `` more monei presente much might be culturally or geographically determined .
the man who likes to watch closely various developments resulting from continued weakness in the
the man you have called by the late war , but powerful , with full


In [6]:
# Compute perplexity
import numpy as np

def perplexity(corpus, model, epsilon=1e-7):
    """Compute the perplexity of a trigram model over a tokenized corpus.

    Every trigram of every sentence (padded on both sides) is scored with the
    model probability of its third word given the first two, plus ``epsilon``
    so that unseen trigrams do not produce ``log2(0)``. The result is
    ``2 ** (-mean(log2(p + epsilon)))`` over all trigrams.

    Args:
        corpus: iterable of sentences, each an iterable of tokens.
        model: mapping ``(w1, w2) -> {w3: probability}`` as produced by
            ``train3gram``. It is only read: unseen contexts and words count
            as probability 0 without being inserted into the model.
        epsilon: constant added to every probability before taking the log.

    Returns:
        The perplexity as a NumPy float; NaN when the corpus yields no
        trigrams.
    """
    unseen_context = {}
    probs = []
    for s in corpus:
        for w1, w2, w3 in trigrams(s, pad_right=True, pad_left=True):
            # .get() instead of indexing: indexing a defaultdict model would
            # insert an entry for every unseen context and word, and a plain
            # dict model would raise KeyError.
            probs.append(model.get((w1, w2), unseen_context).get(w3, 0))

    if not probs:
        # Nothing to average over; return NaN explicitly rather than through
        # a "mean of empty slice" warning.
        return np.float64('nan')

    # Vectorized: one log2 call over all probabilities.
    log_probs = np.log2(np.asarray(probs, dtype=float) + epsilon)
    return np.exp2(-np.mean(log_probs))

# Score the IMDB dev sentences under the IMDB model and under the NLTK model.
for trigram_model in (modelIMDB, modelNLTK):
    print(perplexity(corpusIMDB_dev, trigram_model))

10143.084179100537
335070.5936160355
