In [None]:
library(tidyverse)
library(tidytext)
library(tm)
library(SnowballC)
library(dplyr)
library(stringr)
library(text2vec)
library(nnet)
library(caret)
library(FSelector)
library(textTinyR)

In [None]:
# Load the raw dataset; later steps read its `text` and `label` columns.
data <- readr::read_csv(file = "~/final raw data/raw_data.csv")

In [None]:
# Extra stop words removed during cleaning, in addition to the standard
# stop-word table. Every entry is lowercase and apostrophe-free (e.g. "im",
# "ive"). Each word is listed exactly once.
custom_stop_words <- c(
  # Articles and forms of "to be"
  "a", "an", "the", "am", "is", "are", "was", "were", "be", "being", "been",
  # Pronouns and possessives
  "i", "im", "me", "my", "mine", "myself",
  "you", "your", "yours", "yourself",
  "he", "him", "his", "himself",
  "she", "her", "hers", "herself",
  "it", "its", "itself",
  "we", "us", "our", "ours", "ourselves",
  "they", "them", "their", "theirs", "themselves",
  # Auxiliaries and modals (plus the corpus-specific "feel" and "ive")
  "do", "does", "did",
  "have", "has", "had",
  "will", "would",
  "can", "could",
  "shall", "should", "feel", "ive",
  "may", "might",
  "must",
  # Prepositions
  "in", "on", "at", "to",
  "of", "from", "with",
  "by", "for", "about",
  "under", "over", "between",
  "through", "during", "within",
  "without", "throughout", "into",
  "onto", "upon",
  # Conjunctions
  "and", "but", "or",
  "nor", "yet", "so",
  "because", "although",
  "unless", "whereas",
  "while", "if",
  # Adverbs of time, place and frequency
  "just", "now", "then",
  "here", "there", "where",
  "how", "why", "when",
  "again", "often",
  "sometimes", "usually",
  # Demonstratives, interrogatives and quantifiers
  "this", "that", "these", "those",
  "what", "which", "who", "whom",
  "whose",
  "all", "any",
  "both", "each", "few", "more",
  "most", "other", "some", "such",
  "own", "same", "than",
  "too", "up", "down"
)

# Build one cleaned, stemmed text string per document.
#
# Steps: tag each row with a document id, lowercase `text`, split it into one
# word per row, drop stop words (the attached `stop_words` table, presumably
# tidytext's, plus `custom_stop_words`), drop tokens that still contain
# punctuation, stem the remainder, and paste each document's stems back
# together in their original order.
#
# Note: a document whose tokens are all filtered out contributes no rows to
# the final summary, so it is absent from `clean_text`.
clean_text <- data %>%
  mutate(
    doc_id = row_number(),
    text = str_to_lower(text)
  ) %>%
  unnest_tokens(word, text) %>%
  # Name the join key explicitly: an implicit natural join prints a message
  # and would silently start matching on any other column the two tables
  # happen to share.
  anti_join(stop_words, by = "word") %>%
  filter(
    !word %in% custom_stop_words,
    !str_detect(word, "[:punct:]")
  ) %>%
  mutate(word = wordStem(word, language = "en")) %>%
  group_by(doc_id, label) %>%
  summarise(text = paste(word, collapse = " "), .groups = "drop")


In [None]:
# Put the documents back in their original row order, then discard the
# helper id column so only `label` and `text` remain.
clean_text <- select(arrange(clean_text, doc_id), -doc_id)

In [None]:
# Persist the cleaned corpus so the vectorisation steps can start from disk.
readr::write_csv(clean_text, "~/processed_text.csv")

In [None]:
# Reload the processed corpus; from here on `data` holds the cleaned text
# rather than the raw input.
data <- readr::read_csv("~/processed_text.csv")
# Optional subsample, left disabled:
# data <- data %>% slice(1:5000)

# Quick sanity checks: column structure and the number of missing texts.
str(data)
sum(is.na(data[["text"]]))

In [None]:
# Helpers handed to itoken(): how to split a document into tokens, and how to
# normalise a document before splitting.
tok_fun <- word_tokenizer
prep_fun <- tolower

In [None]:
# Token iterator over the processed documents.
it <- itoken(
  data$text,
  preprocessor = prep_fun,
  tokenizer = tok_fun,
  progressbar = FALSE
)

# Vocabulary with English stop words excluded, then pruned with
# term_count_min = 3 and doc_proportion_max = 0.95.
vocab <- it %>%
  create_vocabulary(stopwords = stopwords("en")) %>%
  prune_vocabulary(term_count_min = 3, doc_proportion_max = 0.95)

# Cache the vocabulary in the current working directory.
saveRDS(vocab, file = "vocabulary.rds")

In [None]:
# Restore the cached vocabulary and use it to build the document-term matrix
# from the same token iterator.
vocab <- readRDS(file = "vocabulary.rds")

vectorizer <- vocab_vectorizer(vocab)
dtm <- it %>% create_dtm(vectorizer)

In [None]:
# Fit a TF-IDF model on the document-term matrix and apply it in one step.
tfidf <- TfIdf$new()
dtm_tfidf <- fit_transform(dtm, tfidf)

In [None]:
# Convert the TF-IDF matrix to a data frame with one column per term.
# NOTE(review): this goes through a dense matrix, which presumably costs
# documents x terms cells of memory - confirm it fits for the full corpus.
dtm_tfidf_df <- dtm_tfidf %>%
  as.matrix() %>%
  as.data.frame()

In [None]:
# Assemble the modelling table: one TF-IDF column per term plus the target
# column `label`.
input_data <- dtm_tfidf_df
output_data <- data$label

# A vocabulary term spelled "label" would give the table two columns named
# `label`, making the target ambiguous once the CSV is read back. Reserve the
# name for the target and let make.unique() rename the colliding term column.
if ("label" %in% names(input_data)) {
  warning(
    "A feature column is named 'label'; renaming it so the target column ",
    "keeps that name.",
    call. = FALSE
  )
  names(input_data) <- make.unique(c("label", names(input_data)))[-1]
}

dtm_tfidf_selected <- cbind(input_data, label = output_data)

In [None]:
# Export the feature table (TF-IDF columns plus `label`) without row names.
utils::write.csv(
  x = dtm_tfidf_selected,
  file = "~/dtm_tfidf_selected.csv",
  row.names = FALSE
)