# 3. Merge raw dataframes

**In this step, the dataframes from step 1 are combined with the error column of the rehydrated dataframes from step 2. As a result, each tweet is annotated as missing or not.**

## Libraries

In [1]:
library(tidyverse)

── Attaching packages ───────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✔ ggplot2 3.3.5     ✔ purrr   0.3.4
✔ tibble  3.1.6     ✔ dplyr   1.0.8
✔ tidyr   1.2.0     ✔ stringr 1.4.0
✔ readr   2.1.2     ✔ forcats 0.5.1
── Conflicts ──────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


## Parameters

In [2]:
# User ids to process. Keep them numeric (not character strings)!
userlist <- list(
    36327895,
    25816024,
    21788339,
    136266976,
    999
)

# Folder containing the dataframes exported from MongoDB.
# Adjust only if the default location was changed in step 1.
mongo_folder <- "data/MongoDB_dataframes/"

# Folder containing the rehydrated dataframes.
# Adjust only if the default location was changed in step 2.
twitter_folder <- "data/MongoDB_rehydrated_dataframes/"

## Functions

### merge_error_column
- This function merges the error column and the reply column (`in_reply_to_user_id`) of the rehydrated dataframe into the MongoDB dataframe.

In [3]:
# Merge the `error` and `in_reply_to_user_id` columns of a user's rehydrated
# dataframe into that user's MongoDB dataframe.
#
# Arguments:
#   userid         - id of the user; both input files are named `<userid>.rds`
#   mongo_folder   - directory holding the MongoDB dataframes
#   twitter_folder - directory holding the rehydrated dataframes
#
# Returns the MongoDB dataframe left-joined (by `tweet_id`) with the `error`
# and `in_reply_to_user_id` columns, or NULL when either input file is missing.
merge_error_column <- function(userid, mongo_folder, twitter_folder){
    
    # Build "<folder><userid>.rds". A separator is appended when the folder was
    # given without one, so "data/dir" and "data/dir/" address the same file.
    rds_path <- function(folder){
        if(nzchar(folder) && !grepl("[/\\\\]$", folder)){
            folder <- paste0(folder, "/")
        }
        paste0(folder, userid, ".rds")
    }
    
    mongo_file <- rds_path(mongo_folder)
    twitter_file <- rds_path(twitter_folder)
    
    # users without a MongoDB dataframe are skipped silently
    if(!file.exists(mongo_file)){
        return(NULL)
    }
    
    # a missing rehydrated dataframe must not abort the run for all other users
    if(!file.exists(twitter_file)){
        warning("No rehydrated dataframe found for user ", userid,
                " (", twitter_file, "); skipping.", call. = FALSE)
        return(NULL)
    }
    
    # flatten the nested `user` column and give the id columns explicit names;
    # tweet ids are compared as character strings
    mtype <- readRDS(mongo_file) %>% 
        tidyr::unnest(user, names_repair = "unique") %>%
        rename(tweet_id = `_id`, user_id = id) %>% 
        mutate(tweet_id = as.character(tweet_id))
    
    rawtwitter <- readRDS(twitter_file) %>% 
        rename(tweet_id = id)
    
    # the reply column may be absent from the rehydrated data; fill it with NA
    # then, so the result always has the same columns
    if("in_reply_to_user_id" %in% colnames(rawtwitter)){
        etype <- rawtwitter %>% 
            select(tweet_id, error, in_reply_to_user_id)
    }else{
        etype <- rawtwitter %>% 
            select(tweet_id, error) %>% 
            mutate(in_reply_to_user_id = NA)
    }
    
    # left join: keep every MongoDB tweet, even without a rehydration record
    merge(mtype, etype, by = "tweet_id", all.x = TRUE)
}

### save_dfs()
- This function creates the final dataframes for all users and stores them in `destination_folder` if it is specified; otherwise they are stored in the default directory `data/Merged_dataframes/`.

In [4]:
# Create the merged dataframe for every user in `userlist` and store each one
# as `<userid>.rds` in the destination folder.
#
# Arguments:
#   userlist           - user ids to process
#   mongo_folder       - directory holding the MongoDB dataframes
#   twitter_folder     - directory holding the rehydrated dataframes
#   destination_folder - output directory; NULL selects the default
#                        "data/Merged_dataframes/"
#
# Users for which merge_error_column() returns NULL are skipped.
# Returns the string "SUCCESS" once all users have been processed.
save_dfs <- function(userlist, mongo_folder, twitter_folder, destination_folder = NULL){
    
    # fall back to the default directory when no destination is specified
    if(is.null(destination_folder)){
        destination_folder <- "data/Merged_dataframes/"
    }
    
    # make sure the folder ends with a separator; otherwise paste0() below
    # would write "<folder><userid>.rds" next to the folder instead of into it
    if(nzchar(destination_folder) && !grepl("[/\\\\]$", destination_folder)){
        destination_folder <- paste0(destination_folder, "/")
    }
    
    # an empty destination means the working directory: nothing to create
    if(nzchar(destination_folder) && !dir.exists(destination_folder)){
        dir.create(destination_folder, recursive = TRUE)
    }
    
    for(userid in userlist){
        
        df <- merge_error_column(userid, mongo_folder, twitter_folder)
        
        if(is.null(df)){
            # no input data for this user; report it and continue
            message("Skipping user ", userid, ": no dataframe to merge.")
            next
        }
        
        saveRDS(df, file = paste0(destination_folder, userid, ".rds"))
    }
    
    return("SUCCESS")
}

## Run

In [5]:
# merge and store the dataframes of all users listed in the parameters
save_dfs(
    userlist,
    mongo_folder,
    twitter_folder,
    destination_folder = "data/Merged_dataframes/"
)