# 1. Get tweets from the BTW17 collection

**In this step, a list of userids is used to get each user's tweets from the BTW17 mongoDB collection. Each dataframe is stored in a directory.**

## Libraries

In [None]:
library(tidyverse)
library(mongolite)
library(jsonlite)

# mongolite option: return 64-bit integers as character strings instead of
# doubles (presumably so that large ids keep full precision -- TODO confirm
# against the mongolite documentation)
mongo_options(bigint_as_char = TRUE)

## Parameters

### Userids, Destination folder, Time period of interest

In [None]:
# Userids to download (numeric type!), stored as a list with one id per element.
# Ids without any tweets in the collection (e.g. 999) are handled gracefully.
userlist <- as.list(c(36327895, 25816024, 21788339, 136266976, 999))

# Start and end of the period of interest, as strings in the format %Y-%m-%d
start_time <- "2017-07-06" # first tweet in database
end_time <- "2018-03-28"   # 6 months after election

### MongoDB

In [None]:
# Connection settings for the MongoDB instance. They are read as global
# variables by save_tweets_of_userid() when it builds the connection URL
# mongodb://<username>:<password>@<ip>/<db>.
collection <- "politicians_sample" # collection that is queried
username <- "btw17_public_user"
password <- "btw17_public"
ip <- "10.6.13.55"                 # host part of the connection URL
db <- "twitter_public"             # database part of the connection URL


## Functions

### save_tweets_of_userid()
- This function pulls all tweets of a Twitter user that were posted within the given period of time (start_time, end_time) from MongoDB
- Each user's tweets are collected in a dataframe
- The dataframe is stored in the destination folder; the function returns "SUCCESS", or "NO_TWEETS_FOUND" if there was nothing to store
- If a destination folder is specified (relative path!), the dataframe gets stored there. Otherwise a default folder called "data/MongoDB_dataframes/" is created

In [None]:
#' Download one user's tweets from MongoDB and save them as an .rds file
#'
#' The connection settings (`collection`, `username`, `password`, `ip`, `db`)
#' are not arguments; they are looked up as global variables when the function
#' runs.
#'
#' @param userid Twitter user id (numeric), matched against the `user.id` field.
#' @param start_time,end_time Bounds of the period of interest as "YYYY-MM-DD"
#'   strings. Both are expanded to midnight UTC ("T00:00:00Z") and compared
#'   against `createdAt` with `$gte` / `$lte`, so tweets posted later during
#'   the `end_time` day are not matched.
#' @param destination_folder Directory the file is written to; it is created
#'   if it does not exist. `NULL` (default) means "data/MongoDB_dataframes/".
#'   A trailing slash is optional.
#' @return "SUCCESS" if `<userid>.rds` was written, "NO_TWEETS_FOUND" if the
#'   query returned no rows (nothing is written in that case).
save_tweets_of_userid <- function(userid, start_time, end_time, destination_folder = NULL) {

    # fall back to the default directory if no destination was specified
    if (is.null(destination_folder)) {
        destination_folder <- "data/MongoDB_dataframes/"
    }

    if (!dir.exists(destination_folder)) {
        dir.create(destination_folder, recursive = TRUE)
    }

    # connect to the database; the connection is closed again when the
    # function exits, also if the query below fails
    m <- mongo(collection = collection,
               url = paste0("mongodb://", username, ":", password, "@", ip, "/", db))
    on.exit(m$disconnect(), add = TRUE)

    # create the query. First, a list with the search parameters is created
    # and then it is transformed to json format
    qry_json <- list(createdAt = list("$gte" = list("$date" = paste0(start_time, "T00:00:00Z")),
                                      "$lte" = list("$date" = paste0(end_time, "T00:00:00Z"))),
                     user.id = userid
                ) %>%
                toJSON(pretty = FALSE,
                       auto_unbox = TRUE,
                       Date = "ISO8601")

    # specify the fields that are requested from the database
    flds_json <- list("_id" = 1,
                      user.id = 1,
                      user.name = 1,
                      user.screenName = 1,
                      isRetweet = 1,
                      createdAt = 1,
                      text = 1,
                      source = 1,
                      possiblySensitive = 1
                     ) %>%
                toJSON(pretty = FALSE,
                       auto_unbox = TRUE,
                       Date = "ISO8601")

    # get the result
    mongo_res <- m$find(
        query = qry_json,
        fields = flds_json
    )

    # results are transformed to a tidy dataframe (tibble)
    tweets_tbl <- as_tibble(mongo_res)

    # report (as a status string, not as an R error) that nothing was found
    if (nrow(tweets_tbl) == 0) {
        return("NO_TWEETS_FOUND")
    }

    # format() keeps numeric ids in fixed notation; plain string coercion
    # would turn e.g. 100000 into "1e+05" and shorten very large ids
    file_name <- paste0(format(userid, scientific = FALSE, trim = TRUE), ".rds")

    # drop trailing slashes so that file.path() yields a clean path whether or
    # not the caller supplied one
    target_dir <- sub("[/\\\\]+$", "", destination_folder)

    saveRDS(tweets_tbl, file = file.path(target_dir, file_name))
    "SUCCESS"
}

### save_tweets_of_userlist()
- This function saves all the tweets for each user in a userlist using save_tweets_of_userid()
    - If there are no tweets in the database, nothing is saved
- The tweets of each user are stored in separate dataframes (one per user) in the destination folder
- Each file is named after the user: `<userid>.rds`

In [None]:
#' Save the tweets of every user in a list of userids
#'
#' Calls save_tweets_of_userid() once per element of `userlist` and prints how
#' many users had no tweets in the database for the given period.
#'
#' @param userlist List (or vector) of userids.
#' @param start_time,end_time Period of interest, passed on unchanged to
#'   save_tweets_of_userid().
#' @param destination_folder Folder the per-user files are written to, passed
#'   on unchanged. `NULL` (default) lets save_tweets_of_userid() pick its
#'   default folder.
#' @return The printed summary string, invisibly (the value of `print()`).
save_tweets_of_userlist <- function(userlist, start_time, end_time, destination_folder = NULL) {

    # one status string per user: "SUCCESS" or "NO_TWEETS_FOUND"
    statuses <- vapply(
        userlist,
        function(userid) {
            save_tweets_of_userid(userid, start_time, end_time, destination_folder)
        },
        character(1)
    )

    # number of users for which nothing was found (and nothing was saved)
    notweetsfoundcount <- sum(statuses == "NO_TWEETS_FOUND")

    # print counter
    print(paste(notweetsfoundcount, "of", length(userlist), "users did not tweet/have no tweets in the database for the observed time period"))
}

## Run

In [None]:
# Announce the output location, then download and store the tweets of every
# user in `userlist` for the configured period.
print("Dataframes are saved in data/MongoDB_dataframes/")
save_tweets_of_userlist(
    userlist,
    start_time,
    end_time,
    destination_folder = "data/MongoDB_dataframes/"
)