In [1]:
library(text2vec)
library(glmnet)
library(slam)
library(pROC)

Loading required package: Matrix

Loaded glmnet 4.1-4

Type 'citation("pROC")' for a citation.


Attaching package: 'pROC'


The following objects are masked from 'package:stats':

    cov, smooth, var




We first load the entire dataset to create an initial document-term matrix.

In [None]:
# Stop words excluded when the vocabulary is built.
stop_words <- c(
  "i", "me", "my", "myself",
  "we", "our", "ours", "ourselves",
  "you", "your", "yours",
  "their", "they", "his", "her",
  "she", "he", "a", "an", "and",
  "is", "was", "are", "were",
  "him", "himself", "has", "have",
  "it", "its", "the", "us"
)

# Read the full data set and replace every HTML tag in the reviews with a
# single space.
train <- read.table("alldata.tsv", stringsAsFactors = FALSE, header = TRUE)
train$review <- gsub("<.*?>", " ", train$review)

# Token iterator over the lower-cased reviews, split into words.
it_train <- itoken(
  train$review,
  preprocessor = tolower,
  tokenizer = word_tokenizer
)

# Build the DT (DocumentTerm) matrix: vocabulary of 1- to 4-grams without the
# stop words, pruned by term count and document proportion.
tmp.vocab <- create_vocabulary(
  it_train,
  stopwords = stop_words,
  ngram = c(1L, 4L)
)
tmp.vocab <- prune_vocabulary(
  tmp.vocab,
  term_count_min = 10,
  doc_proportion_max = 0.5,
  doc_proportion_min = 0.001
)
dtm_train <- create_dtm(it_train, vocab_vectorizer(tmp.vocab))

We then use the t-test method to trim our vocabulary to 2K.

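For every term in the vocabulary we compute a two-sample t-statistic comparing its counts in the reviews with sentiment 1 against its counts in the reviews with sentiment 0: $t = (\bar{x}_1 - \bar{x}_0) / \sqrt{s_1^2 / n_1 + s_0^2 / n_0}$, where $\bar{x}_k$, $s_k^2$ and $n_k$ are the mean count, the variance of the count and the number of reviews in class $k$. The 2,000 terms with the largest $|t|$ are kept.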

In [None]:
# Trim the vocabulary to at most 2K terms using a two-sample t-statistic
# computed per DTM column (term).
v.size <- dim(dtm_train)[2]
ytrain <- train$sentiment

# Convert each class's rows to a simple triplet matrix once; the same
# sub-matrix is needed for both the mean and the variance.
dtm_pos <- as.simple_triplet_matrix(dtm_train[ytrain == 1, ])
dtm_neg <- as.simple_triplet_matrix(dtm_train[ytrain == 0, ])

# summ columns: 1 = mean (sentiment 1), 2 = variance (sentiment 1),
#               3 = mean (sentiment 0), 4 = variance (sentiment 0).
# stats::var is named explicitly because pROC masks `var` when attached.
summ <- matrix(0, nrow = v.size, ncol = 4)
summ[, 1] <- colapply_simple_triplet_matrix(dtm_pos, mean)
summ[, 2] <- colapply_simple_triplet_matrix(dtm_pos, stats::var)
summ[, 3] <- colapply_simple_triplet_matrix(dtm_neg, mean)
summ[, 4] <- colapply_simple_triplet_matrix(dtm_neg, stats::var)

# Class sizes (n1 counts the rows with sentiment 1).
n1 <- sum(ytrain)
n <- length(ytrain)
n0 <- n - n1

# Two-sample t-statistic with unpooled variances, one value per term.
myp <- (summ[, 1] - summ[, 3]) /
  sqrt(summ[, 2] / n1 + summ[, 4] / n0)

# Keep the terms with the largest |t|. The count is capped at the vocabulary
# size so that a vocabulary smaller than 2000 does not produce NA indices.
words <- colnames(dtm_train)
n_keep <- min(2000L, v.size)
id <- order(abs(myp), decreasing = TRUE)[seq_len(n_keep)]
#pos.list = words[id[myp[id]>0]]
#neg.list = words[id[myp[id]<0]]
words <- words[id]

Create a new vectorizer from the reduced vocabulary, and then use it to build a new DTM restricted to that vocabulary.

In [None]:
# Build a vocabulary object directly from the selected terms, then the DTM of
# the same reviews restricted to those terms.
# NOTE(review): the initial vocabulary was built with ngram = c(1L, 4L) but
# c(1L, 2L) is used here; confirm whether 3- and 4-gram terms in `words` can
# still be matched by this vectorizer.
reduced_vocab <- create_vocabulary(words, ngram = c(1L, 2L))
vectorizer <- vocab_vectorizer(reduced_vocab)
dtm_reduced <- create_dtm(it_train, vectorizer)

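Fit a penalized logistic regression with glmnet (elastic net with `alpha = 0.05`, i.e. mostly ridge with a small lasso component), and pick out the columns with nonzero coefficients that give a vocab size of at most 1K.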

In [None]:
# Penalized logistic regression on the reduced DTM over glmnet's lambda path.
# alpha = 0.05 is an elastic-net mix (alpha = 1 would be the pure lasso).
set.seed(7568)
tmpfit <- glmnet(
  x = dtm_reduced,
  y = train$sentiment,
  alpha = 0.05,
  family = "binomial"
)

In [None]:
# Find i, the first position on the lambda path whose fit has more than 1000
# nonzero coefficients (tmpfit$df); column i - 1 of the coefficient path is
# then used to build myvocab.
# If df never exceeds 1000, i is set one past the end of the path so that
# i - 1 refers to the last fit. A plain while loop would instead index past
# the end of tmpfit$df and stop with an NA condition.
i <- which(tmpfit$df > 1000)[1]
if (is.na(i)) {
  i <- length(tmpfit$df) + 1
}

In [None]:
# Keep the terms whose coefficient is nonzero in column i - 1 of the
# coefficient path.
nonzero_idx <- which(tmpfit$beta[, i - 1] != 0)
myvocab <- colnames(dtm_reduced)[nonzero_idx]

Let's check this vocab against the training data:

In [None]:
# Rebuild the DTM of the full data set restricted to myvocab.
vocab_check <- create_vocabulary(myvocab, ngram = c(1L, 2L))
vectorizer <- vocab_vectorizer(vocab_check)
dtm_test <- create_dtm(it_train, vectorizer)

# Refit the same penalized logistic regression on the smaller DTM.
testfit <- glmnet(
  x = dtm_test,
  y = train$sentiment,
  alpha = 0.05,
  family = "binomial"
)

# In-sample predictions at lambda = 0.01. No `type` is given, so predict()
# uses its default scale; AUC depends only on the ranking of the scores.
pred <- predict(testfit, newx = dtm_test, s = 0.01)

# AUC of those predictions against the true labels.
roc_obj <- roc(train$sentiment, c(pred))
pROC::auc(roc_obj)

Just to be safe, let's also test the splits. First we create them:

In [None]:
# Read the full data set and the table whose j-th column holds the test row
# indices for split j.
data <- read.table("alldata.tsv", stringsAsFactors = FALSE, header = TRUE)
testIDs <- read.csv("project3_splits.csv", header = TRUE)

# File names written into every split directory, in the same order as the
# tables assembled inside the loop.
split_files <- c("train.tsv", "test.tsv", "test_y.tsv")

for (j in 1:5) {
  split_dir <- paste0("split_", j)
  dir.create(split_dir)

  # Rows listed in column j form the test set; all other rows form the
  # training set. Labels for the test rows are stored separately.
  test_rows <- testIDs[, j]
  train <- data[-test_rows, c("id", "sentiment", "review")]
  test <- data[test_rows, c("id", "review")]
  test.y <- data[test_rows, c("id", "sentiment", "score")]

  split_tables <- list(train, test, test.y)
  for (k in seq_along(split_files)) {
    tmp_file_name <- paste0(split_dir, "/", split_files[k])
    write.table(
      split_tables[[k]],
      file = tmp_file_name,
      quote = TRUE,
      row.names = FALSE,
      sep = "\t"
    )
  }
}

And now we test our vocab on each split

In [None]:
# this loop is for our purposes, we will not submit it
# One AUC per split, preallocated instead of grown from NULL.
scores <- numeric(5)
for (j in 1:5) {
  # Work inside the split directory; the previous directory is restored by
  # the `finally` clause even if this split fails part-way through.
  old_wd <- setwd(paste0("split_", j))
  tryCatch({
    # below this comment is submitted code unless otherwise stated
    train <- read.table("train.tsv",
                        stringsAsFactors = FALSE,
                        header = TRUE)
    train$review <- gsub("<.*?>", " ", train$review)

    # Create matrix corresponding to vocab
    it_train <- itoken(train$review,
                       preprocessor = tolower,
                       tokenizer = word_tokenizer)
    vectorizer <- vocab_vectorizer(create_vocabulary(myvocab,
                                                     ngram = c(1L, 2L)))
    dtm_train <- create_dtm(it_train, vectorizer)
    set.seed(7568)
    fit <- glmnet(x = dtm_train,
                  y = train$sentiment,
                  alpha = 0.05,
                  family = "binomial")

    #####################################
    # Load test data, and
    # Compute prediction
    #####################################
    test <- read.table("test.tsv",
                       stringsAsFactors = FALSE,
                       header = TRUE)
    test$review <- gsub("<.*?>", " ", test$review)

    # Create matrix corresponding to vocab (same vectorizer as for training)
    it_test <- itoken(test$review,
                      preprocessor = tolower,
                      tokenizer = word_tokenizer)
    dtm_test <- create_dtm(it_test, vectorizer)

    # type = "response" returns fitted probabilities; the default ("link")
    # would put log-odds into the "prob" column of the submission file.
    prob <- predict(fit, newx = dtm_test, s = 0.05, type = "response")
    output <- data.frame(id = c(test$id),
                         prob = c(prob))
    #####################################
    # Store your prediction for test data in a data frame
    # "output": col 1 is test$id
    #           col 2 is the predicted probs
    #####################################
    write.table(output, file = "mysubmission.txt",
                row.names = FALSE, sep = "\t")

    # Below this is not submitted code
    # move "test_y.tsv" to this directory
    test.y <- read.table("test_y.tsv", header = TRUE)
    pred <- read.table("mysubmission.txt", header = TRUE)
    pred <- merge(pred, test.y, by = "id")
    roc_obj <- roc(pred$sentiment, pred$prob)
    score <- pROC::auc(roc_obj)
    scores[j] <- as.numeric(score)
  }, finally = setwd(old_wd))
}
scores

It looks good, so we write it to our file!

In [None]:
# Save the selected vocabulary to a text file, one term per line.
writeLines(c(myvocab), con = "myvocab.txt")