In [19]:
# This R environment comes with many helpful analytics packages installed
# It is defined by the kaggle/rstats Docker image: https://github.com/kaggle/docker-rstats
# For example, here's a helpful package to load

library(tidyverse) # metapackage of all tidyverse packages

# Input data files are available in the read-only "../input/" directory.
# Load the TMDB movie metadata CSV into a data frame.
# stringsAsFactors = FALSE keeps text columns as character vectors regardless
# of the R version's default, so later string processing sees plain strings.
movies <- read.csv(
  "../input/tmdb-movie-metadata/tmdb_5000_movies.csv",
  stringsAsFactors = FALSE
)


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [20]:
library(keras)
library(tensorflow)
library(tm)
library(spacyr)
library(utf8)
library(tidyverse)
# Keep only the plot text (overview) and the target (vote_average).
data <- select(movies, overview, vote_average)
plots <- data[["overview"]]
ratings <- data[["vote_average"]]

# Show any plot strings that are not valid UTF-8.
plots[!utf8_valid(plots)]

# Count the plots whose text changes under Unicode normalization.
plots_NFC <- utf8_normalize(plots)
sum(plots != plots_NFC)

# Collapse every run of punctuation and/or spaces into a single space.
plots <- gsub("[[:punct:] ]+", " ", plots)

data[["overview"]] <- plots

# Peek at the first cleaned plot.
head(plots, 1)


In [21]:
# Wrap the plot overviews in a tm corpus.
corpus <- VCorpus(VectorSource(data$overview))
# First plot overview before cleaning.
as.character(corpus[[1]])

# Cleaning pipeline: lower-case, drop digits and punctuation, drop English
# stop words, stem, then squeeze repeated whitespace.
corpus <- corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(stemDocument) %>%
  tm_map(stripWhitespace)
# First plot overview after cleaning.
as.character(corpus[[1]])

In [22]:
# Build the document-term matrix of term frequencies (the model's features).
dtm <- DocumentTermMatrix(corpus)
dtm
# Number of documents and number of distinct terms.
c(nDocs(dtm), nTerms(dtm))

In [23]:
# Dense data frame of term counts: one row per document, one column per term.
dataset <- as.data.frame(as.matrix(dtm))

head(dataset, n = 6L)
dim(dataset)

# Attach the regression target.
dataset[["Class"]] <- ratings

# Random ~75/25 assignment of rows to training (1) and test (2).
set.seed(222)
split <- sample(
  x = 2,
  size = nrow(dataset),
  replace = TRUE,
  prob = c(0.75, 0.25)
)
train_set <- dataset[split == 1, , drop = FALSE]
test_set <- dataset[split == 2, , drop = FALSE]


In [24]:
# Bidirectional LSTM regressor for the rating stored in `dataset$Class`.
#
# NOTE(review): the inputs are document-term counts, so the embedding layer
# looks up each count value as if it were a token id. Feeding padded token
# sequences would be the conventional input for an embedding + LSTM stack;
# confirm this design is intended.
model <- keras_model_sequential()
model %>%
  # Creates dense embedding layer; outputs 3D tensor
  # with shape (batch_size, sequence_length, output_dim).
  # input_length is taken from the feature table (all columns except Class)
  # instead of a hard-coded 1000, so the declared input shape matches the
  # data passed to fit(). input_dim must stay above the largest value fed in.
  layer_embedding(input_dim = 2000,
                  output_dim = 128,
                  input_length = ncol(dataset) - 1L) %>%
  bidirectional(layer_lstm(units = 64)) %>%
  layer_dropout(rate = 0.5) %>%
  # Linear output for regression: a ReLU here can get stuck at zero, leaving
  # the only output unit with no gradient.
  layer_dense(units = 1)


In [25]:
# Choose the loss function and the reported metric.
# Mean absolute error is reported alongside the MSE loss. Cosine similarity is
# uninformative for a single-unit output: with one value per sample it only
# reflects whether prediction and target share a sign.
model %>% compile(
  loss = 'mean_squared_error',
  optimizer = 'adam',
  metrics = c('mean_absolute_error')
)

In [26]:
# Train model
cat('Train...\n')

# Feature matrices: every column except the target. keras expects a
# matrix/array as input; a data frame is a list of columns and would be
# treated as one separate model input per column.
train_X <- train_set %>% select(-Class) %>% as.matrix()
test_X <- test_set %>% select(-Class) %>% as.matrix()

# Fit for 5 epochs, using the held-out rows as validation data.
model %>% fit(
  train_X, train_set$Class,
  batch_size = 100,
  epochs = 5,
  validation_data = list(test_X, test_set$Class)
)

In [27]:
# Predict ratings for the held-out rows.
# as.matrix() guarantees the model receives a matrix even when test_X is a
# data frame; it is a no-op when test_X is already a matrix.
text.pred <- predict(model, as.matrix(test_X))

In [45]:
# Display the model object (auto-printed at top level).
model

In [29]:
# Mean squared error of the predictions on the held-out rows.
mean((test_set$Class - text.pred)^2)

In [48]:
# Prepare a new piece of text for scoring. It must go through the same
# cleaning as the training corpus and be laid out on the training vocabulary.
test_plot <- "A terrible movie as everyone has said. What made me laugh was the cameo appearance by Scott McNealy"
test_corpus <- VCorpus(VectorSource(test_plot))
# Raw text before cleaning.
as.character(test_corpus[[1]])

# Same steps, in the same order, as for the training corpus. Lower-casing
# comes first because removeWords() is case-sensitive: without it "A" and
# "What" would survive stop-word removal.
test_corpus <- tm_map(test_corpus, content_transformer(tolower))
test_corpus <- tm_map(test_corpus, removeNumbers)
test_corpus <- tm_map(test_corpus, removePunctuation)
test_corpus <- tm_map(test_corpus, removeWords, stopwords("english"))
test_corpus <- tm_map(test_corpus, stemDocument)
test_corpus <- tm_map(test_corpus, stripWhitespace)

test_dtm <- DocumentTermMatrix(test_corpus)

# Re-express the counts on the training vocabulary: one column per training
# term, in training order, zero where the term is absent. Terms never seen in
# training are dropped, since the model has no feature for them.
train_terms <- Terms(dtm)
test_counts <- as.matrix(test_dtm)
aligned_counts <- matrix(
  0,
  nrow = nrow(test_counts),
  ncol = length(train_terms),
  dimnames = list(rownames(test_counts), train_terms)
)
shared_terms <- intersect(colnames(test_counts), train_terms)
if (length(shared_terms) > 0) {
  aligned_counts[, shared_terms] <- test_counts[, shared_terms, drop = FALSE]
}

test_dataset <- as.data.frame(aligned_counts)
# Show only the terms that actually occur in the new text.
test_dataset[, shared_terms, drop = FALSE]


In [44]:
# Predicted score for the new plot.
# The model expects a matrix, so the data frame is converted first.
predict(model, as.matrix(test_dataset))