In [1]:
%%bash
rm -rf 6864-hw1
git clone https://github.com/lingo-mit/6864-hw1.git

Cloning into '6864-hw1'...


In [2]:
import sys
sys.path.append('/content/6864-hw1')

import numpy as np
import csv
import itertools as it
np.random.seed(0)

import lab_util

# 01.dataset

In [3]:
data = []
num_positive = 0 #data balance
num_display = 0 #for checking

with open('/content/6864-hw1/reviews.csv') as reader:
  datareader = csv.reader(reader)
  next(datareader)

  for id, review, label in datareader:
    label = int(label)

    if label == 1:
      if num_positive == 2000:
        continue
      num_positive += 1

    if len(data) == 4000:
      break

    data.append((review, label))

    #display the first 5 data to check
    if num_display > 5:
      continue
    
    num_display += 1
    print("--------------------")
    print("review:", review)
    print("label:", label)
  

  print("total num of reviews:", len(data))

--------------------
review: I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.
label: 1
--------------------
review: Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".
label: 0
--------------------
review: This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis' "The Lion, The Witch, and The Wardrobe" - this is the treat that 

In [4]:
np.random.shuffle(data)
reviews, labels = zip(*data)

#Q: test data/train data-->balance?
xtr = reviews[:3000]
ytr = labels[:3000]
xval = reviews[3000:3500]
yval = labels[3000:3500]
xte = reviews[3500:]
yte = labels[3500:]

# 02. word representation(method01. matrix factorization)

In [5]:
vectorizer = lab_util.CountVectorizer()
vectorizer.fit(xtr)
td_matrix = vectorizer.transform(xtr).T
print(f"TD matrix is {td_matrix.shape[0]} x {td_matrix.shape[1]}")

TD matrix is 2006 x 3000


In [6]:
from sklearn.decomposition import TruncatedSVD

In [10]:
def learn_reps_lsa(matrix, rep_size):
  # `matrix` is a `|V| x n` matrix, where `|V|` is the number of words in the
  # vocabulary. This function should return a `|V| x rep_size` matrix with each
  # row corresponding to a word representation. The `sklearn.decomposition` 
  # package may be useful.

  # Your code here!
  svd_model = TruncatedSVD(n_components=rep_size)
  lsa = svd_model.fit_transform(matrix)

  return lsa

In [11]:
reps = learn_reps_lsa(td_matrix, 500)
words = ["good", "bad", "cookie", "jelly", "dog", "the", "4"]
show_tokens = [vectorizer.tokenizer.word_to_token[word] for word in words]
lab_util.show_similar_words(vectorizer.tokenizer, reps, show_tokens)

good 47
  . 1.056
  a 1.101
  but 1.121
  , 1.152
  the 1.157
bad 201
  . 1.396
  taste 1.416
  but 1.434
  a 1.435
  i 1.449
cookie 504
  nana's 0.775
  cookies 1.012
  oreos 1.283
  bars 1.359
  bites 1.380
jelly 351
  twist 1.160
  cardboard 1.259
  plastic 1.427
  advertised 1.428
  peanuts 1.461
dog 925
  food 1.049
  pets 1.071
  pet 1.072
  switched 1.206
  foods 1.228
the 36
  . 0.331
  <unk> 0.366
  of 0.395
  and 0.403
  to 0.422
4 292
  1 1.048
  6 1.118
  70 1.132
  stevia 1.196
  concentrated 1.240


In [24]:
from sklearn.feature_extraction.text import TfidfTransformer

In [25]:
def transform_tfidf(matrix):
  # `matrix` is a `|V| x |D|` matrix of raw counts, where `|V|` is the 
  # vocabulary size and `|D|` is the number of documents in the corpus. This
  # function should (nondestructively) return a version of `matrix` with the
  # TF-IDF transform appliied.

  # Your code here!
  tf_idf = TfidfTransformer()
  return tf_idf.fit_transform(matrix)

In [26]:
td_matrix_tfidf = transform_tfidf(td_matrix)
reps_tfidf = learn_reps_lsa(td_matrix_tfidf, 500)
lab_util.show_similar_words(vectorizer.tokenizer, reps_tfidf, show_tokens)

good 47
  . 0.835
  but 0.842
  a 0.856
  is 0.906
  and 0.911
bad 201
  taste 1.190
  like 1.228
  but 1.251
  . 1.267
  a 1.294
cookie 504
  nana's 0.713
  cookies 0.955
  bars 1.276
  oreos 1.458
  gluten 1.467
jelly 351
  cardboard 1.223
  twist 1.239
  softer 1.351
  plum 1.413
  supermarket 1.489
dog 925
  food 0.859
  pet 1.039
  foods 1.055
  dogs 1.100
  pets 1.100
the 36
  . 0.199
  and 0.254
  of 0.261
  <unk> 0.276
  to 0.306
4 292
  1 0.833
  6 0.976
  2 1.079
  stevia 1.091
  3 1.113


In [28]:
print(td_matrix.shape)
print(reps_tfidf.shape)

(2006, 3000)
(2006, 500)


In [38]:
def word_featurizer(xs):
  # normalize
  return xs / np.sqrt((xs ** 2).sum(axis=1, keepdims=True))

def lsa_featurizer(xs):
  # This function takes in a matrix in which each row contains the word counts
  # for the given review. It should return a matrix in which each row contains
  # the learned feature representation of each review (e.g. the sum of LSA 
  # word representations).

  feats = None # Your code here!
  td_matrix_tfidf = transform_tfidf(xs)
  feats = learn_reps_lsa(td_matrix_tfidf, 500)
  # feats = learn_reps_lsa(td_matrix_tfidf, 500).sum(axis = 1)


  # normalize
  return feats / np.sqrt((feats ** 2).sum(axis=1, keepdims=True))


In [39]:

def combo_featurizer(xs):
  return np.concatenate((word_featurizer(xs), lsa_featurizer(xs)), axis=1)

def train_model(featurizer, xs, ys):
  import sklearn.linear_model
  xs_featurized = featurizer(xs)
  model = sklearn.linear_model.LogisticRegression()
  model.fit(xs_featurized, ys)
  return model

def eval_model(model, featurizer, xs, ys):
  xs_featurized = featurizer(xs)
  pred_ys = model.predict(xs_featurized)
  print("test accuracy", np.mean(pred_ys == ys))

def training_experiment(name, featurizer, n_train):
  print(f"{name} features, {n_train} examples")
  train_xs = vectorizer.transform(xtr[:n_train])
  train_ys = ytr[:n_train]
  test_xs = vectorizer.transform(xte)
  test_ys = yte
  model = train_model(featurizer, train_xs, train_ys)
  eval_model(model, featurizer, test_xs, test_ys)
  print()

training_experiment("word", word_featurizer, 10)
training_experiment("lsa", lsa_featurizer, 10)
training_experiment("combo", combo_featurizer, 10)

word features, 10 examples
test accuracy 0.496

lsa features, 10 examples


ValueError: ignored

# method02. via language modeling

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as torch_data

class Word2VecModel(nn.Module):
  # A torch module implementing a word2vec predictor. The `forward` function
  # should take a batch of context word ids as input and predict the word 
  # in the middle of the context as output, as in the CBOW model from lecture.

  def __init__(self, vocab_size, embed_dim):
      super().__init__()

      # Your code here!

  def forward(self, context):
      # Context is an `n_batch x n_context` matrix of integer word ids
      # this function should return a set of scores for predicting the word 
      # in the middle of the context

      # Your code here!

In [None]:
def learn_reps_word2vec(corpus, window_size, rep_size, n_epochs, n_batch):
  # This method takes in a corpus of training sentences. It returns a matrix of
  # word embeddings with the same structure as used in the previous section of 
  # the assignment. (You can extract this matrix from the parameters of the 
  # Word2VecModel.)

  tokenizer = lab_util.Tokenizer()
  tokenizer.fit(corpus)
  tokenized_corpus = tokenizer.tokenize(corpus)

  ngrams = lab_util.get_ngrams(tokenized_corpus, window_size)

  device = torch.device('cuda')  # run on colab gpu
  model = Word2VecModel(tokenizer.vocab_size, rep_size).to(device)
  opt = optim.Adam(model.parameters(), lr=0.001)
  loss_fn = None # Your code here

  loader = torch_data.DataLoader(ngrams, batch_size=n_batch, shuffle=True)

  for epoch in range(n_epochs):
    for context, label in loader:
      # as described above, `context` is a batch of context word ids, and
      # `label` is a batch of predicted word labels
      pass
      # Your code here!

  # reminder: you want to return a `vocab_size x embedding_size` numpy array
  embedding_matrix = None
  # Your code here!

In [None]:
reps_word2vec = learn_reps_word2vec(train_reviews, 2, 500, 10, 100)

In [None]:
lab_util.show_similar_words(vectorizer.tokenizer, reps_word2vec, show_tokens)

In [None]:
from sklearn.cluster import KMeans

indices = KMeans(n_clusters=10).fit_predict(reps_word2vec)
zipped = list(zip(range(vectorizer.tokenizer.vocab_size), indices))
np.random.shuffle(zipped)
zipped = zipped[:100]
zipped = sorted(zipped, key=lambda x: x[1])
for token, cluster_idx in zipped:
  word = vectorizer.tokenizer.token_to_word[token]
  print(f"{word}: {cluster_idx}")

In [None]:
def lsa_featurizer(xs):
  feats = None # Your code here!

  # normalize
  return feats / np.sqrt((feats ** 2).sum(axis=1, keepdims=True))

training_experiment("word2vec", lsa_featurizer, 10)