In [None]:
install.packages("keras")
install.packages("tensorflow")
install.packages("caret")

In [None]:
# Load required libraries
library(keras)
library(tidyverse)
library(caret)

In [None]:
# Load the dataset; later cells read its `embedding` and `label` columns.
data <- read.csv(
  file = "/kaggle/input/create-embedding-word2vec-from-ggnewsvector/data_with_embeddings.csv"
)

In [None]:
#' Parse a stringified embedding into a numeric vector.
#'
#' Extracts every numeric token from `embedding_str` and converts the tokens
#' to numbers, but only when exactly `expected_length` tokens are found.
#'
#' Accepted token forms: optional sign, integer or decimal mantissa (including
#' a trailing-dot form such as `0.` and a leading-dot form such as `.5`), and
#' an optional exponent with either sign and either case (`1.5e+01`, `2E-3`).
#' Digits glued to letters (e.g. the `32` in `float32`) are not tokens.
#'
#' @param embedding_str A single string (or value coercible to one) holding
#'   the serialized embedding.
#' @param expected_length Number of components a valid embedding must have.
#'   Defaults to 300, the dimension used throughout this notebook.
#' @return A numeric vector of length `expected_length`, or `NULL` when the
#'   input is missing, is not a single value, has a different number of
#'   numeric tokens, or parsing raised an error (a warning is emitted then).
process_embedding <- function(embedding_str, expected_length = 300L) {
  # A missing or non-scalar value can never be a valid embedding.
  if (length(embedding_str) != 1L || is.na(embedding_str)) {
    return(NULL)
  }

  # The look-arounds keep the match from starting or ending inside a word or
  # another number, so partial tokens are never returned.
  number_pattern <- paste0(
    "(?<![\\w.])",
    "[-+]?(?:\\d+\\.?\\d*|\\.\\d+)",
    "(?:[eE][-+]?\\d+)?",
    "(?![\\w.])"
  )

  tryCatch(
    {
      text <- as.character(embedding_str)
      tokens <- regmatches(
        text,
        gregexpr(number_pattern, text, perl = TRUE)
      )[[1]]

      if (length(tokens) == expected_length) {
        as.numeric(tokens)
      } else {
        NULL
      }
    },
    error = function(e) {
      # Keep the best-effort contract (NULL), but do not hide the cause.
      warning(
        "Failed to parse embedding: ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
}

# Parse every embedding string; rows that cannot be parsed come back as NULL.
embeddings_list <- lapply(data$embedding, process_embedding)

# vapply() always returns a logical vector, even when there are zero rows
# (sapply() would return an empty list there and `!` would fail on it).
valid_indices <- which(!vapply(embeddings_list, is.null, logical(1)))
print(paste("Number of valid embeddings:", length(valid_indices)))

# Fail here with a clear message instead of later with an obscure one:
# rbind() over an empty list yields NULL, which has no dimensions.
if (length(valid_indices) == 0L) {
  stop(
    "No valid embeddings were parsed; cannot build the embeddings matrix.",
    call. = FALSE
  )
}

# One row per valid embedding, one column per embedding component.
embeddings_matrix <- do.call(rbind, embeddings_list[valid_indices])
# One-hot encode the labels of the retained rows.
# NOTE(review): assumes `label` holds integer class ids in 0..5 - confirm.
labels <- to_categorical(data$label[valid_indices], num_classes = 6)

# Print information after filtering
print(paste("Final embeddings matrix dimension:", paste(dim(embeddings_matrix), collapse = " x ")))
print(paste("Final labels dimension:", paste(dim(labels), collapse = " x ")))

In [None]:
# Fix the RNG seed so the partition below is reproducible.
set.seed(123)

# Stratify the 90/10 split on the class label. createDataPartition() samples
# within each level of a factor `y`; passing the row positions instead (as
# before) only balanced the split across position quantiles, not classes.
# The returned values index into `y`, i.e. into the rows of
# `embeddings_matrix` and `labels`, which are aligned with `valid_indices`.
train_indices <- createDataPartition(
  y = factor(data$label[valid_indices]),
  p = 0.9,
  list = FALSE
)

# drop = FALSE keeps a matrix even if a subset ends up with a single row.
X_train <- embeddings_matrix[train_indices, , drop = FALSE]
X_test <- embeddings_matrix[-train_indices, , drop = FALSE]
y_train <- labels[train_indices, , drop = FALSE]
y_test <- labels[-train_indices, , drop = FALSE]

# Print the train/test set dimensions
print("Training set dimensions:")
print(dim(X_train))
print(dim(y_train))
print("Test set dimensions:")
print(dim(X_test))
print(dim(y_test))

In [None]:
# Build one data frame per split: the feature columns followed by a `label`
# column. The label is recovered from each one-hot row as the position of
# the 1, shifted to be 0-based.
X_train_df <- cbind(
  as.data.frame(X_train),
  label = apply(y_train, MARGIN = 1, FUN = function(row) which(row == 1) - 1)
)
X_test_df <- cbind(
  as.data.frame(X_test),
  label = apply(y_test, MARGIN = 1, FUN = function(row) which(row == 1) - 1)
)

# Persist both splits as CSV, without a row-name column.
write.csv(x = X_train_df, file = "train_data.csv", row.names = FALSE)
write.csv(x = X_test_df, file = "test_data.csv", row.names = FALSE)


In [None]:
# Reload the saved train/test splits from the working directory.
test_data <- read.csv(file = "test_data.csv")
train_data <- read.csv(file = "train_data.csv")

In [None]:
# Features: every column except the last one, as a matrix.
X_train <- as.matrix(train_data[, seq_len(ncol(train_data) - 1)])
X_test <- as.matrix(test_data[, seq_len(ncol(test_data) - 1)])

# Targets: the `label` column, one-hot encoded into 6 classes.
y_train <- to_categorical(train_data$label, num_classes = 6)
y_test <- to_categorical(test_data$label, num_classes = 6)

In [None]:
# Feed-forward classifier: 300 inputs -> 256 -> 128 -> 64 -> 6-way softmax,
# with dropout after each hidden layer.
model <- keras_model_sequential() %>%
  layer_dense(input_shape = c(300), units = 256, activation = "relu") %>%
  layer_dropout(rate = 0.2) %>%
  layer_dense(units = 128, activation = "relu") %>%
  layer_dropout(rate = 0.2) %>%
  layer_dense(units = 64, activation = "relu") %>%
  layer_dropout(rate = 0.1) %>%
  layer_dense(units = 6, activation = "softmax")

# Configure training: Adam optimizer, categorical cross-entropy loss, and
# accuracy as the reported metric.
compile(
  model,
  optimizer = optimizer_adam(learning_rate = 0.001),
  loss = "categorical_crossentropy",
  metrics = c("accuracy")
)

# Stop once validation loss has not improved for 15 epochs, and restore the
# weights from the best epoch.
early_stopping <- callback_early_stopping(
  monitor = "val_loss",
  patience = 15,
  restore_best_weights = TRUE
)

# Train for up to 250 epochs, holding out 20% of the training data for
# validation.
history <- fit(
  model,
  x = X_train,
  y = y_train,
  batch_size = 64,
  epochs = 250,
  validation_split = 0.2,
  callbacks = list(early_stopping)
)

# Score the trained model on the held-out test set.
results <- evaluate(model, x = X_test, y = y_test)
print(results)

In [None]:
# Write the trained model to an HDF5 file in the working directory.
model %>% save_model_hdf5(filepath = "sentiment_model.h5")