In [1]:
library(text2vec)
library(glmnet)
library(slam)

Loading required package: Matrix

Loaded glmnet 4.1-4



### Split generation

In [1]:
# Build the five train/test splits from the full review data set.
# Column j of project3_splits.csv is used as the row index of the test set
# for split j; every remaining row goes to the training set.
data <- read.table("alldata.tsv", header = TRUE, stringsAsFactors = FALSE)
testIDs <- read.csv("project3_splits.csv", header = TRUE)

for (j in 1:5) {
  dir.create(paste0("split_", j))

  test_rows <- testIDs[, j]
  train <- data[-test_rows, c("id", "sentiment", "review")]
  test <- data[test_rows, c("id", "review")]
  # Labels (and scores) of the test rows are stored separately from the text.
  test.y <- data[test_rows, c("id", "sentiment", "score")]

  # One tab-separated, quoted file per table inside the split directory.
  outputs <- list("train.tsv" = train,
                  "test.tsv" = test,
                  "test_y.tsv" = test.y)
  for (out_name in names(outputs)) {
    tmp_file_name <- paste0("split_", j, "/", out_name)
    write.table(outputs[[out_name]],
                file = tmp_file_name,
                quote = TRUE,
                row.names = FALSE,
                sep = "\t")
  }
}

### Generate vocabulary

In [2]:
# Read the complete data set (alldata.tsv) and replace every HTML tag in the
# review text with a single space.
# To work on one split instead, run from inside split_<j> and read
# "train.tsv" rather than "alldata.tsv".
train <- read.table("alldata.tsv", header = TRUE, stringsAsFactors = FALSE)
train$review <- gsub("<.*?>", " ", train$review)

In [3]:
# Build the document-term matrix over 1- to 4-grams.
# Words listed here are excluded when the vocabulary is created.
stop_words <- c(
  "i", "me", "my", "myself",
  "we", "our", "ours", "ourselves",
  "you", "your", "yours",
  "their", "they", "his", "her",
  "she", "he", "a", "an", "and",
  "is", "was", "are", "were",
  "him", "himself", "has", "have",
  "it", "its", "the", "us"
)

# Lower-case each review and split it into word tokens.
it_train <- itoken(train$review,
                   preprocessor = tolower,
                   tokenizer = word_tokenizer)

tmp.vocab <- create_vocabulary(it_train,
                               stopwords = stop_words,
                               ngram = c(1L, 4L))

# Keep terms that occur at least 10 times overall and in between 0.1% and 50%
# of the documents.
tmp.vocab <- prune_vocabulary(tmp.vocab,
                              term_count_min = 10,
                              doc_proportion_max = 0.5,
                              doc_proportion_min = 0.001)

dtm_train <- create_dtm(it_train, vocab_vectorizer(tmp.vocab))

In [4]:
# Fit an L1-penalised (lasso, alpha = 1) logistic regression of sentiment on
# the document-term matrix; the fitted path is used in the next step to trim
# the vocabulary to fewer than 2K terms.
set.seed(7568)
tmpfit <- glmnet(x = dtm_train,
                 y = train$sentiment,
                 family = "binomial",
                 alpha = 1)

# Number of non-zero coefficients at each lambda along the path.
tmpfit$df

In [5]:
# Pick the path column whose df is the largest value still below 2K (found by
# inspecting tmpfit$df: it is the 44th column) and keep the terms with a
# non-zero coefficient there as myvocab.
# NOTE(review): the index 44 is tied to this data set and fit - re-check
# tmpfit$df if either changes.
lasso_col <- 44
nonzero_terms <- which(tmpfit$beta[, lasso_col] != 0)
myvocab <- colnames(dtm_train)[nonzero_terms]

In [6]:
# Re-read the full data set, strip HTML tags, and rebuild the document-term
# matrix restricted to the terms in myvocab.
# To work on one split instead, read "train.tsv" rather than "alldata.tsv".
# NOTE(review): myvocab was selected from 1- to 4-gram terms, but this
# vectorizer is configured with ngram = c(1L, 2L) - confirm that 3- and
# 4-gram terms are meant to stay unmatched (all-zero columns).
train <- read.table("alldata.tsv", header = TRUE, stringsAsFactors = FALSE)
train$review <- gsub("<.*?>", " ", train$review)

it_train <- itoken(train$review,
                   preprocessor = tolower,
                   tokenizer = word_tokenizer)
vectorizer <- vocab_vectorizer(
  create_vocabulary(myvocab, ngram = c(1L, 2L))
)
dtm_train <- create_dtm(it_train, vectorizer)

In [7]:
# Two-sample t-statistic for every term: compare its mean count in the
# reviews with sentiment == 1 against its mean count in the reviews with
# sentiment == 0.
v.size <- ncol(dtm_train)
ytrain <- train$sentiment

# Convert each class's rows once and reuse them for both mean and variance.
pos_docs <- as.simple_triplet_matrix(dtm_train[ytrain == 1, ])
neg_docs <- as.simple_triplet_matrix(dtm_train[ytrain == 0, ])

# Columns of summ: 1 = mean (sentiment 1), 2 = variance (sentiment 1),
#                  3 = mean (sentiment 0), 4 = variance (sentiment 0).
summ <- matrix(0, nrow = v.size, ncol = 4)
summ[, 1] <- colapply_simple_triplet_matrix(pos_docs, mean)
summ[, 2] <- colapply_simple_triplet_matrix(pos_docs, var)
summ[, 3] <- colapply_simple_triplet_matrix(neg_docs, mean)
summ[, 4] <- colapply_simple_triplet_matrix(neg_docs, var)

# Class sizes; n1 is the sum of the labels, n0 is the remainder.
n <- length(ytrain)
n1 <- sum(ytrain)
n0 <- n - n1

# Difference of class means over its standard error (unequal variances).
std_err <- sqrt(summ[, 2] / n1 + summ[, 4] / n0)
myp <- (summ[, 1] - summ[, 3]) / std_err

In [8]:
# Order terms by the magnitude of their t-statistics, keep at most the top
# 2000, and divide them into positive terms (t > 0) and negative terms
# (t < 0).
#
# Terms whose t-statistic is NA/NaN (e.g. zero variance in both classes) are
# dropped via na.last = NA, and head() caps the selection at the number of
# terms actually available.  Indexing with [1:2000] instead would pad `id`
# with NA whenever there are fewer than 2000 usable terms, which in turn puts
# NA entries into pos.list and neg.list.
words <- colnames(dtm_train)
id <- head(order(abs(myp), decreasing = TRUE, na.last = NA), 2000)
top_stats <- myp[id]
pos.list <- words[id[top_stats > 0]]
neg.list <- words[id[top_stats < 0]]

In [22]:
# Show up to the first 50 positive terms; head() avoids the NA padding that
# [1:50] produces when the list has fewer than 50 entries.
head(pos.list, 50)

In [21]:
# Show up to the first 50 negative terms; head() avoids the NA padding that
# [1:50] produces when the list has fewer than 50 entries.
head(neg.list, 50)

In [15]:
# Terms that never occur in the positive reviews (id1) and terms that never
# occur in the negative reviews (id0).
# Counts are non-negative, so a class mean of zero (summ columns 1 and 3)
# means the term is absent from every review of that class.  The class
# variance is not used for this test because it is also zero for a term that
# occurs the same number of times in every review of the class.
id1 <- which(summ[, 1] == 0)
id0 <- which(summ[, 3] == 0)
words[id1]
words[id0]

In [16]:
# Of the terms flagged in id0 / id1, list those that are not among the
# top-ranked terms selected in `id`.
words[setdiff(id0, id)]
words[setdiff(id1, id)]

In [30]:
# Save the vocabulary terms (the column names of dtm_train held in `words`)
# to myvocab.txt, one term per line, unquoted.
some.strs <- words
writeLines(some.strs, con = "myvocab.txt", sep = "\n")

### Submitted code

In [69]:
# Training step of the submitted code: read train.tsv from the current
# directory, strip HTML tags, and build the document-term matrix over myvocab.
# If myvocab is not in the workspace it can be loaded first with
#   myvocab <- scan(file = "myvocab.txt", what = character())
# and a split directory can be selected with setwd(paste("split_", j, sep="")).
train <- read.table("train.tsv", header = TRUE, stringsAsFactors = FALSE)
train$review <- gsub("<.*?>", " ", train$review)

# Document-term matrix whose columns are the myvocab terms.
it_train <- itoken(train$review,
                   preprocessor = tolower,
                   tokenizer = word_tokenizer)
vectorizer <- vocab_vectorizer(
  create_vocabulary(myvocab, ngram = c(1L, 2L))
)
dtm_train <- create_dtm(it_train, vectorizer)

In [70]:
# Fit a binomial elastic-net model (alpha = 0.01, i.e. close to ridge) of
# sentiment on the training document-term matrix.
set.seed(7568)
tmpfit <- glmnet(x = dtm_train,
                 y = train$sentiment,
                 family = "binomial",
                 alpha = 0.01)

# Number of non-zero coefficients at each lambda along the path.
tmpfit$df

In [71]:
# ---------------------------------------------------------------------------
# Load the test reviews and build their document-term matrix with the same
# vocabulary (myvocab) and n-gram setting as the training matrix.
# ---------------------------------------------------------------------------
test <- read.table("test.tsv", header = TRUE, stringsAsFactors = FALSE)
test$review <- gsub("<.*?>", " ", test$review)

# Document-term matrix whose columns are the myvocab terms.
it_test <- itoken(test$review,
                  preprocessor = tolower,
                  tokenizer = word_tokenizer)
vectorizer <- vocab_vectorizer(
  create_vocabulary(myvocab, ngram = c(1L, 2L))
)
dtm_test <- create_dtm(it_test, vectorizer)

In [72]:
# Score the test reviews at lambda = 0.01.  type = "link" is the default and
# returns the linear predictor (log-odds), not probabilities.
predict(tmpfit, newx = dtm_test, s = 0.01, type = "link")

Unnamed: 0,s1
1,1.5828108
2,-3.6165582
3,0.1407497
4,3.5482136
5,-11.3678062
6,-6.1666739
7,2.7572158
8,-1.4156575
9,4.2640076
10,-16.9599254
