#  <span style='color:indianred'>  Sentiment Analysis of Trump-related Tweets </span>

##  <span style='color:tomato'>  1. Load required packages </span>

In [1]:
library(lattice)
library(Matrix)
library(caret)
library(dplyr)
library(magrittr)
library(tidyr)
library(tidytext)
library(rtweet)

Loading required package: ggplot2
Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
“package ‘dplyr’ was built under R version 3.6.3”
Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

“package ‘tidyr’ was built under R version 3.6.3”
Attaching package: ‘tidyr’

The following object is masked from ‘package:magrittr’:

    extract

The following objects are masked from ‘package:Matrix’:

    expand, pack, unpack



##  <span style='color:tomato'>  2. Load dataset  </span>
**rtweet library documentation:** https://www.rdocumentation.org/packages/rtweet/versions/0.7.0 

In [2]:
# Ask rtweet's search endpoint for up to 2000 tweets matching
# "@realdonaldtrump", with retweets included in the result.
tweets_df <- rtweet::search_tweets(
  q = "@realdonaldtrump",
  n = 2000,
  include_rts = TRUE
)

Registered S3 method overwritten by 'openssl':
  method      from
  print.bytes Rcpp
“Rate limit exceeded”

In [3]:
# Display the fetched tweets to eyeball what the search returned.
tweets_df

##  <span style='color:tomato'>  3. Prepare Dataframe and Check for NA </span>

In [4]:
# Make sure the tweet text is a plain character vector before tokenizing.
tweets_df$text <- as.character(tweets_df$text)

# Count missing values per column. vapply() (instead of sapply()) pins the
# result to a named integer vector regardless of the shape of tweets_df.
vapply(tweets_df, function(column) sum(is.na(column)), integer(1))

##  <span style='color:tomato'>  4. Investigate Variables </span>

In [None]:
# Print skimr's per-variable summary of tweets_df to inspect its columns.
skimr::skim(tweets_df)

##  <span style='color:tomato'>  5. Tokenize by Word </span>

In [None]:
# Split the `text` column into one row per word token (column `word`).
tweets_df.tidy <- tweets_df %>%
  tidytext::unnest_tokens(word, text)

# Ten most frequent tokens, before any stopword filtering.
tweets_df.tidy %>%
  dplyr::count(word, sort = TRUE) %>%
  head(10)

##  <span style='color:tomato'>  6. Remove Stopwords </span>

In [None]:
# Corpus-specific tokens to discard in addition to the standard stopword list
# (URL fragments, HTML entity residue, and names/topics that dominate this
# particular search).
custom_stopwords <- c(
  "http", "https", "trump", "trump's", "donald", "amp", "justice",
  "michael", "flynn", "general", "yeah", "realdonaldtrump", "president"
)

# Drop standard stopwords. The join key is stated explicitly so the result
# does not depend on which column names the two tables happen to share.
tweets_df.clean <- dplyr::anti_join(
  tweets_df.tidy,
  tidytext::get_stopwords(),
  by = "word"
)

# Keep tokens longer than two characters that are not custom stopwords.
# which() discards rows where the condition evaluates to NA.
tweets_df.clean <- tweets_df.clean[
  which(
    nchar(tweets_df.clean$word) > 2 &
      !(tweets_df.clean$word %in% custom_stopwords)
  ),
]

# Word frequencies after cleaning, most frequent first.
tweets_df.count <- dplyr::count(tweets_df.clean, word, sort = TRUE)
head(tweets_df.count, 10)

##  <span style='color:tomato'>  7. Visualize most common words </span>

In [None]:
# Turn `word` into a factor ordered by frequency so the bars are sorted.
tweets_df.count$word <- with(tweets_df.count, reorder(word, n))

# Horizontal bar chart of the first 20 rows of the frequency table.
ggplot2::ggplot(
  data = head(tweets_df.count, 20),
  mapping = ggplot2::aes(x = word, y = n)
) +
  ggplot2::geom_col() +
  ggplot2::coord_flip() +
  ggpubr::theme_pubclean()

##  <span style='color:tomato'>  8. Visualize as Word Cloud </span>

In [None]:
# Word cloud of the frequency table: at most 100 words, each occurring at
# least 10 times, placed in decreasing order of frequency.
wordcloud::wordcloud(
  words = tweets_df.count$word,
  freq = tweets_df.count$n,
  min.freq = 10,
  max.words = 100,
  random.order = FALSE
)

##  <span style='color:tomato'>  9. Compute TF-IDF </span>
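We will use `screen_name` as the document in this case. You could also treat each individual tweet as the document instead (use a per-tweet identifier column, since `text` itself is consumed by the tokenization in step 5).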

In [None]:
# Re-count words per author; note this replaces the overall frequency table
# held in tweets_df.count with a (screen_name, word, n) table.
tweets_df.count <- tweets_df.clean %>%
  dplyr::count(screen_name, word, sort = TRUE)

# Attach tf, idf and tf-idf, treating each screen_name as one document, and
# show the first ten rows.
head(
  tidytext::bind_tf_idf(
    tbl = tweets_df.count,
    term = word,
    document = screen_name,
    n = n
  ),
  10
)

##  <span style='color:tomato'>  10. Join Sentiment Dictionaries and Visualize Sentiment Counts </span>

In [None]:
# Join the NRC and then the AFINN lexicon on `word`. Both are inner joins, so
# only words present in both lexicons survive.
tweets_df.sen <- tweets_df.clean %>%
  dplyr::inner_join(tidytext::get_sentiments("nrc"), by = "word") %>%
  dplyr::inner_join(tidytext::get_sentiments("afinn"), by = "word")

# Frequency of each (sentiment, word) pair, with `word` ordered by count.
tweets_df.sen_count <- tweets_df.sen %>%
  dplyr::count(sentiment, word, sort = TRUE)
tweets_df.sen_count$word <- with(tweets_df.sen_count, reorder(word, n))

# Keep the first five rows of every sentiment group and stack the groups back
# into a single table.
tweets_df.sen_count <- Reduce(
  rbind,
  lapply(
    split(tweets_df.sen_count, tweets_df.sen_count$sentiment),
    head,
    n = 5
  )
)

# Preview the word-to-sentiment assignments.
head(tweets_df.sen[, c("word", "sentiment")], 10)

In [None]:
# One panel per sentiment, showing the retained words and their counts as
# horizontal bars; each panel gets its own axis ranges.
ggplot2::ggplot(
  data = tweets_df.sen_count,
  mapping = ggplot2::aes(x = word, y = n, fill = sentiment)
) +
  ggplot2::geom_col(show.legend = FALSE) +
  ggplot2::facet_wrap(facets = ~sentiment, scales = "free") +
  ggplot2::labs(x = NULL, y = "Contribution to sentiment") +
  ggplot2::coord_flip() +
  ggpubr::theme_pubclean()

##  <span style='color:tomato'>  11. Visualize Sentiment Analysis </span>

In [None]:
# Total number of matched tokens per sentiment.
# The per-word table tweets_df.sen_count was cut down to the top five words of
# each sentiment for the facet plot, so summing it would under-count every
# sentiment. Tally directly from the full joined table tweets_df.sen instead.
tweets_df.sen_count <- dplyr::count(tweets_df.sen, sentiment)

# Order the sentiment factor by its total so the bars are sorted.
tweets_df.sen_count$sentiment <- reorder(
  tweets_df.sen_count$sentiment,
  tweets_df.sen_count$n
)

ggplot2::ggplot(
  tweets_df.sen_count,
  ggplot2::aes(x = sentiment, y = n, fill = sentiment)
) +
  ggplot2::geom_col(show.legend = FALSE) +
  ggplot2::coord_flip() +
  ggpubr::theme_pubclean()

##  <span style='color:tomato'>  12. Compute Word Pairs and Correlations </span>

In [None]:
# Count how often two words co-occur within the same screen_name, most
# frequent pairs first.
word_pair <- widyr::pairwise_count(
  tbl = tweets_df.clean,
  item = word,
  feature = screen_name,
  sort = TRUE
)
head(x = word_pair, n = 10)

In [None]:
# Correlate words on a random subsample of token rows. The sample size is
# capped at the number of available rows: asking sample() for 1000 rows from a
# smaller table (e.g. after a rate-limited download) is an error.
n_tokens <- nrow(tweets_df.clean)
sampled_rows <- sample.int(n_tokens, min(1000L, n_tokens))

word_cor <- widyr::pairwise_cor(
  tweets_df.clean[sampled_rows, ],
  word,
  screen_name,
  sort = TRUE
)

# Discard pairs whose correlation is exactly 1; which() also drops NA rows.
correlations <- word_cor[which(word_cor$correlation != 1), ]
head(correlations, 10)

In [None]:
# Correlations of every word paired with "apologize". filter() is namespaced
# like the other dplyr calls in this notebook so it cannot resolve to
# stats::filter() if the search path changes. The result can be empty when
# "apologize" is not part of the random sample drawn above.
result <- dplyr::filter(correlations, item1 == "apologize")
head(result, 5)