# 4. Clean and annotate dataframes

**In this step, the merged dataframes are annotated with a set of flags (e.g. typos, replies, tweet properties).**

## Libraries

In [None]:
library(tidyverse)
library(httr)
library(jsonlite)
library(stringdist)

## Parameters

In [None]:
# Twitter bearer token
# The token is read from the TWITTER_BEARER_TOKEN environment variable so the
# secret does not have to be stored in the notebook; alternatively, replace the
# call below with the token as a string.
# The right-hand side must never be left empty: R would continue parsing on the
# next code line and silently assign the merged-folder path to twitter_token.
twitter_token <- Sys.getenv("TWITTER_BEARER_TOKEN")

# directory in which the merged dataframes are stored
# only change if the default was changed in step 3
merged_folder <- "Merged dataframes/"

# disable scientific notation for long integers
options(scipen = 999)

## Typo Detection

### add_distance_columns_save()
- Here, the files stored in the merged folder are listed; each file is read and arranged by creation date
- Shifted text columns are created (depth 1-3) and the Levenshtein distance is calculated for each text/shifted text pair
- The file is stored in a new folder (`Merged with distance/`)

In [None]:
#' Add Levenshtein distance columns to each merged dataframe and save the result
#'
#' Every file in `merged_folder` is read with readRDS(), sorted by `createdAt`,
#' and extended with the Levenshtein distance between each row's `text` and the
#' `text` of the next, second-next and third-next row (`lead1_dist`,
#' `lead2_dist`, `lead3_dist`). Rows without such a following row get NA.
#'
#' @param merged_folder Directory holding the merged dataframes (files readable
#'   by readRDS()).
#' @param destination_folder Directory the annotated dataframes are written to;
#'   it is created if it does not exist. Each file keeps its original name.
#' @return The string "SUCCESS" once all files have been processed.
add_distance_columns_save <- function(merged_folder,
                                      destination_folder = "Merged with distance/") {

    if (!dir.exists(destination_folder)) {
        dir.create(destination_folder, recursive = TRUE)
    }

    # a non-recursive list.files() also returns sub-directories;
    # readRDS() would fail on them, so only regular files are kept
    file_names <- list.files(path = merged_folder)
    file_names <- file_names[!dir.exists(file.path(merged_folder, file_names))]

    for (file_name in file_names) {

        df <- readRDS(file = file.path(merged_folder, file_name)) %>%
            # arrange by date, so that lead() refers to the tweets posted next
            arrange(createdAt) %>%
            mutate(
                lead1_dist = stringdist(text, lead(text, n = 1), method = "lv"),
                lead2_dist = stringdist(text, lead(text, n = 2), method = "lv"),
                lead3_dist = stringdist(text, lead(text, n = 3), method = "lv")
            )

        # file is saved under its original name in the destination folder
        saveRDS(df, file = file.path(destination_folder, file_name))
    }

    return("SUCCESS")
}

# Run the function on the merged dataframes; the results are written to "Merged with distance/"
add_distance_columns_save(merged_folder)

## Finalize dataframe

### Create long dataframe
- Read each user's dataframe and bind them into one long dataframe

In [None]:
# names of all files that carry the distance columns
df_list <- list.files(path = "Merged with distance/", include.dirs = TRUE)
length(df_list)

# read every file and stack the dataframes row-wise into one long dataframe
adata <- df_list %>%
    map(~ readRDS(file = paste0("Merged with distance/", .x))) %>%
    bind_rows()

glimpse(adata)

### Create missing column
- Simple binary column to flag missing tweets

In [None]:
# missing = 1 when an error is recorded for the tweet (`error` is not NA), else 0
bdata <- adata %>%
    mutate(missing = as.integer(!is.na(error)))
glimpse(bdata)

### Create isreply column
- Simple binary column to flag reply tweets

In [None]:
# isreply = 1 when the tweet has a reply target (`in_reply_to_user_id` is not NA), else 0
cdata <- bdata %>%
    mutate(isreply = as.integer(!is.na(in_reply_to_user_id)))
glimpse(cdata)

### Create typo column
- Almuhimedi et al. (2013) propose a maximum edit distance of 5

In [None]:
# A missing tweet is flagged as a typo when the text of one of the three
# following rows is within a Levenshtein distance of 5.
# The distances are NA where no such following row exists (e.g. the last rows
# of a file). Without coalesce() the comparison would then be NA and is_typo
# would become NA instead of 0 for those missing tweets.
ddata <- cdata %>%
    mutate(is_typo = ifelse(
        missing == 1 &
            coalesce(lead1_dist <= 5 | lead2_dist <= 5 | lead3_dist <= 5, FALSE),
        1, 0
    ))

glimpse(ddata)

### Categorization of usage behaviour

In [None]:
### Usage: one label per user, based on the number of rows (tweets) of that user
usage <- ddata %>%
    group_by(user_id) %>%
    summarise(tweet_count = n()) %>%
    mutate(usage = case_when(tweet_count >= 1000 ~ "heavy_user",
                             tweet_count >= 100 ~ "frequent_user",
                             tweet_count >= 10 ~ "light_user",
                             tweet_count < 10 ~ "inactive_user")) %>%
    select(user_id, usage)

### Deletions: one label per user, based on the share of tweets flagged as missing
deletions <- ddata %>%
    group_by(user_id) %>%
    summarise(missing_share = sum(missing == 1) / n()) %>%
    # the exact cases (all / none) are checked first, then the thresholds
    mutate(deletion_behaviour = case_when(missing_share == 1 ~ "full_deleter",
                                          missing_share == 0 ~ "non_deleter",
                                          missing_share >= 0.75 ~ "heavy_deleter",
                                          missing_share >= 0.5 ~ "selective_keeper",
                                          missing_share < 0.5 ~ "selective_deleter")) %>%
    select(user_id, deletion_behaviour)

### Attach both user-level labels to every tweet of the respective user
tempdata <- merge(ddata, usage, by = "user_id")
edata <- merge(tempdata, deletions, by = "user_id")
glimpse(edata)

### Categorization of tweets
- Typos, hashtags, mentions, links, retweets
- A tweet is a typo if it was deleted and the Levenshtein distance to the next, second-next or third-next tweet is <= 5

In [None]:
# Tweet-level flags (1 = yes, 0 = no)
# - is_typo: missing tweet whose text is within a Levenshtein distance of 5 of
#   one of the three following rows. The distances are NA where no such row
#   exists (e.g. the last rows of a file); coalesce() makes those tweets 0
#   instead of NA.
# - has_hashtag / has_mention: text contains "#..." / "@..."
# - has_link: text contains "http:" or "https:" (plain http links count too)
# - is_retweet: text contains "RT @"
fdata <- edata %>%
    mutate(
        is_typo = ifelse(
            missing == 1 &
                coalesce(lead1_dist <= 5 | lead2_dist <= 5 | lead3_dist <= 5, FALSE),
            1, 0
        ),
        has_hashtag = ifelse(str_detect(text, "#[a-z0-9_A-Z-]+"), 1, 0),
        has_mention = ifelse(str_detect(text, "@([a-zA-Z0-9_-]+)"), 1, 0),
        has_link = ifelse(str_detect(text, "https?:"), 1, 0),
        is_retweet = ifelse(str_detect(text, "RT @"), 1, 0)
    )
glimpse(fdata)

In [None]:
# Save fdata, the final annotated dataframe, as "result_df.rds" (path is relative to the working directory)
saveRDS(fdata, "result_df.rds")