# 4. Clean and annotate dataframes

**In this step, the merged dataframes are annotated with a set of flags (e.g. typos, replies, tweet properties).**

## Libraries

In [1]:
library(tidyverse)
library(httr)
library(jsonlite)
library(stringdist)

── Attaching packages ───────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✔ ggplot2 3.3.5     ✔ purrr   0.3.4
✔ tibble  3.1.6     ✔ dplyr   1.0.8
✔ tidyr   1.2.0     ✔ stringr 1.4.0
✔ readr   2.1.2     ✔ forcats 0.5.1
── Conflicts ──────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

Attaching package: ‘jsonlite’

The following object is masked from ‘package:purrr’:

    flatten


Attaching package: ‘stringdist’

The following object is masked from ‘package:tidyr’:

    extract



## Parameters

In [2]:

# print long integers in full instead of scientific notation
options(scipen = 999)

# folder in which the merged dataframes are stored
# (only change this if the default was changed in step 3)
merged_folder <- "data/Merged_dataframes/"

# folder into which the annotated dataframes will be written
destination_folder <- "data/Merged_with_distance/"

## Typo Detection

### add_distance_columns_save()
- Here, a list of the dataframes stored in the merged folder is created; each file is read and arranged by creation date
- Shifted text columns are created (depth 1-3) and the Levenshtein distance is calculated for each text/shifted-text pair
- Each file is stored in a new folder (`destination_folder`, by default `data/Merged_with_distance/`)

In [3]:
# Add Levenshtein distances to the following rows and save each dataframe.
#
# Every file found directly inside `merged_folder` is read with readRDS(),
# sorted by `createdAt`, and extended with the columns `lead1_dist`,
# `lead2_dist` and `lead3_dist`: the Levenshtein distance between `text` and
# the `text` 1, 2 and 3 rows further down (NA where no such row exists).
# The result is written under the same file name into `destination_folder`.
#
# Args:
#   merged_folder:      folder containing the merged dataframes
#                       (with or without a trailing path separator).
#   destination_folder: folder the annotated dataframes are written to;
#                       created recursively if it does not exist.
#
# Returns:
#   The string "SUCCESS" once all files have been processed.
add_distance_columns_save <- function(merged_folder, destination_folder){

    if (!dir.exists(destination_folder)) {
        dir.create(destination_folder, recursive = TRUE)
    }

    # A non-recursive listing always contains subdirectories, which readRDS()
    # cannot read, so they are dropped here.
    entries <- list.files(path = merged_folder)
    df_list <- entries[!dir.exists(file.path(merged_folder, entries))]

    for (file in df_list){

        # file.path() builds a valid path whether or not the folder argument
        # ends with a separator
        df <- readRDS(file = file.path(merged_folder, file)) %>%
            arrange(createdAt) %>% # distances are only meaningful in chronological order
            mutate(lead1_dist = stringdist(text, lead(text, n = 1), method = "lv"),
                   lead2_dist = stringdist(text, lead(text, n = 2), method = "lv"),
                   lead3_dist = stringdist(text, lead(text, n = 3), method = "lv"))

        # file is saved under its original name
        saveRDS(df, file = file.path(destination_folder, file))

    }

    "SUCCESS"
}

# Annotate every merged dataframe and write the results to the destination folder
add_distance_columns_save(
    merged_folder = merged_folder,
    destination_folder = destination_folder
)

## Finalize dataframe

### Create long dataframe
- Read each user's dataframe and bind them all into one long dataframe

In [4]:
# names of all annotated files, and how many there are
df_list <- list.files(path = destination_folder, include.dirs = TRUE)
length(df_list)

# read every file and stack the rows into one long dataframe
adata <- df_list %>%
    map_df(function(rds_name) readRDS(file = paste0(destination_folder, rds_name)))

glimpse(adata)

Rows: 10
Columns: 14
$ tweet_id            <chr> "911716024331980800", "911716036184989696", "91170…
$ createdAt           <dttm> 2017-09-24 00:16:53, 2017-09-24 00:16:56, 2017-09…
$ possiblySensitive   <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, F…
$ source              <chr> "<a href=\"http://twitter.com/download/android\" r…
$ text                <chr> "RT @linke_sh: Ab morgen, 8 Uhr, gilt es: @dieLink…
$ isRetweet           <lgl> TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE…
$ user_id             <dbl> 136266976, 136266976, 21788339, 21788339, 25816024…
$ name                <chr> "Lorenz Gösta Beutin", "Lorenz Gösta Beutin", "Chr…
$ screenName          <chr> "lgbeutin", "lgbeutin", "crmkoch", "crmkoch", "Gab…
$ error               <chr> "Not Found Error", "Not Found Error", "Not Found E…
$ in_reply_to_user_id <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
$ lead1_dist          <dbl> 58, NA, 91, NA, NA, 112, 122, 121, 117, NA
$ lead2_dist          <dbl> NA, NA, NA, N

### Create missing column
- Simple binary column to flag missing tweets

In [5]:
# missing = 1 if an error was recorded for the tweet, 0 otherwise
bdata <- adata %>%
    mutate(missing = as.integer(!is.na(error)))
glimpse(bdata)

Rows: 10
Columns: 15
$ tweet_id            <chr> "911716024331980800", "911716036184989696", "91170…
$ createdAt           <dttm> 2017-09-24 00:16:53, 2017-09-24 00:16:56, 2017-09…
$ possiblySensitive   <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, F…
$ source              <chr> "<a href=\"http://twitter.com/download/android\" r…
$ text                <chr> "RT @linke_sh: Ab morgen, 8 Uhr, gilt es: @dieLink…
$ isRetweet           <lgl> TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE…
$ user_id             <dbl> 136266976, 136266976, 21788339, 21788339, 25816024…
$ name                <chr> "Lorenz Gösta Beutin", "Lorenz Gösta Beutin", "Chr…
$ screenName          <chr> "lgbeutin", "lgbeutin", "crmkoch", "crmkoch", "Gab…
$ error               <chr> "Not Found Error", "Not Found Error", "Not Found E…
$ in_reply_to_user_id <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
$ lead1_dist          <dbl> 58, NA, 91, NA, NA, 112, 122, 121, 117, NA
$ lead2_dist          <dbl> NA, NA, NA, N

### Create isreply column
- Simple binary column to flag reply tweets

In [6]:
# isreply = 1 if the tweet has an in_reply_to_user_id, 0 otherwise
cdata <- bdata %>%
    mutate(isreply = as.integer(!is.na(in_reply_to_user_id)))
glimpse(cdata)

Rows: 10
Columns: 16
$ tweet_id            <chr> "911716024331980800", "911716036184989696", "91170…
$ createdAt           <dttm> 2017-09-24 00:16:53, 2017-09-24 00:16:56, 2017-09…
$ possiblySensitive   <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, F…
$ source              <chr> "<a href=\"http://twitter.com/download/android\" r…
$ text                <chr> "RT @linke_sh: Ab morgen, 8 Uhr, gilt es: @dieLink…
$ isRetweet           <lgl> TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE…
$ user_id             <dbl> 136266976, 136266976, 21788339, 21788339, 25816024…
$ name                <chr> "Lorenz Gösta Beutin", "Lorenz Gösta Beutin", "Chr…
$ screenName          <chr> "lgbeutin", "lgbeutin", "crmkoch", "crmkoch", "Gab…
$ error               <chr> "Not Found Error", "Not Found Error", "Not Found E…
$ in_reply_to_user_id <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
$ lead1_dist          <dbl> 58, NA, 91, NA, NA, 112, 122, 121, 117, NA
$ lead2_dist          <dbl> NA, NA, NA, N

### Create typo column
- Almuhimedi et al. (2013) propose an edit distance of 5 as the typo threshold

In [7]:
# is_typo = 1 if the tweet is missing and at least one of the three following
# tweets has a Levenshtein distance of 5 or less, otherwise 0.
# The distances are NA where no following tweet exists. Without coalesce() an
# NA comparison would make the whole condition NA whenever no distance is <= 5,
# so the flag would be NA instead of 0.
ddata <- cdata %>%
    mutate(is_typo = ifelse(missing == 1 &
                                (coalesce(lead1_dist <= 5, FALSE) |
                                 coalesce(lead2_dist <= 5, FALSE) |
                                 coalesce(lead3_dist <= 5, FALSE)),
                            1, 0))

glimpse(ddata)

Rows: 10
Columns: 17
$ tweet_id            <chr> "911716024331980800", "911716036184989696", "91170…
$ createdAt           <dttm> 2017-09-24 00:16:53, 2017-09-24 00:16:56, 2017-09…
$ possiblySensitive   <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, F…
$ source              <chr> "<a href=\"http://twitter.com/download/android\" r…
$ text                <chr> "RT @linke_sh: Ab morgen, 8 Uhr, gilt es: @dieLink…
$ isRetweet           <lgl> TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE…
$ user_id             <dbl> 136266976, 136266976, 21788339, 21788339, 25816024…
$ name                <chr> "Lorenz Gösta Beutin", "Lorenz Gösta Beutin", "Chr…
$ screenName          <chr> "lgbeutin", "lgbeutin", "crmkoch", "crmkoch", "Gab…
$ error               <chr> "Not Found Error", "Not Found Error", "Not Found E…
$ in_reply_to_user_id <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
$ lead1_dist          <dbl> 58, NA, 91, NA, NA, 112, 122, 121, 117, NA
$ lead2_dist          <dbl> NA, NA, NA, N

### Categorization of usage behaviour

In [8]:
### Usage
# classify every user by the number of tweets in ddata
usage <- ddata %>%
    group_by(user_id) %>%
    summarise(n_tweets = n()) %>%
    mutate(usage = case_when(n_tweets >= 1000 ~ "heavy_user",
                             n_tweets >= 100 ~ "frequent_user",
                             n_tweets >= 10 ~ "light_user",
                             n_tweets < 10 ~ "inactive_user")) %>%
    select(-n_tweets)

### Deletions
# classify every user by the share of their tweets flagged as missing
deletions <- ddata %>%
    group_by(user_id) %>%
    summarise(deleted_share = sum(missing == 1) / n()) %>%
    mutate(deletion_behaviour = case_when(deleted_share == 1 ~ "full_deleter",
                                          deleted_share == 0 ~ "non_deleter",
                                          deleted_share >= 0.75 ~ "heavy_deleter",
                                          deleted_share >= 0.5 ~ "selective_keeper",
                                          deleted_share < 0.5 ~ "selective_deleter")) %>%
    select(-deleted_share)

### Merge with ddata
# attach both per-user categories to every tweet
tempdata <- merge(ddata, usage, by = "user_id")
edata <- merge(tempdata, deletions, by = "user_id")
glimpse(edata)

Rows: 10
Columns: 19
$ user_id             <dbl> 21788339, 21788339, 25816024, 36327895, 36327895, …
$ tweet_id            <chr> "911702953324457984", "911715368904691712", "91166…
$ createdAt           <dttm> 2017-09-23 23:24:57, 2017-09-24 00:14:17, 2017-09…
$ possiblySensitive   <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, F…
$ source              <chr> "<a href=\"http://instagram.com\" rel=\"nofollow\"…
$ text                <chr> "Letzter Infostand in #Flensburg vor der #btw2017 …
$ isRetweet           <lgl> FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE,…
$ name                <chr> "Christian R.M. Lucks", "Christian R.M. Lucks", "G…
$ screenName          <chr> "crmkoch", "crmkoch", "GabiHillerOhm", "SoenkeRix"…
$ error               <chr> "Not Found Error", NA, "Not Found Error", "Not Fou…
$ in_reply_to_user_id <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
$ lead1_dist          <dbl> 91, NA, NA, 112, 122, 121, 117, NA, 58, NA
$ lead2_dist          <dbl> NA, NA, NA, 1

### Categorization of tweets
- Flags for typos, hashtags, mentions, links, and retweets
- A tweet counts as a typo if it was deleted and the Levenshtein distance to the next, second-next, or third-next tweet is <= 5

In [9]:
# Tweet-level flags (1 = yes, 0 = no):
#   is_typo     - tweet is missing and one of the three following tweets has a
#                 Levenshtein distance of 5 or less
#   has_hashtag - text contains "#" followed by letters, digits, "_" or "-"
#   has_mention - text contains "@" followed by letters, digits, "_" or "-"
#   has_link    - text contains "https:"
#   is_retweet  - text contains "RT @"
# The distances are NA where no following tweet exists. Without coalesce() an
# NA comparison would make the whole condition NA whenever no distance is <= 5,
# so is_typo would be NA instead of 0.
fdata <- edata %>%
    mutate(is_typo = ifelse(missing == 1 &
                                (coalesce(lead1_dist <= 5, FALSE) |
                                 coalesce(lead2_dist <= 5, FALSE) |
                                 coalesce(lead3_dist <= 5, FALSE)),
                            1, 0),
           has_hashtag = ifelse(str_detect(text, "#[a-z0-9_A-Z-]+"), 1, 0),
           has_mention = ifelse(str_detect(text, "@([a-zA-Z0-9_-]+)"), 1, 0),
           has_link = ifelse(str_detect(text, "https:"), 1, 0),
           is_retweet = ifelse(str_detect(text, "RT @"), 1, 0))
glimpse(fdata)

Rows: 10
Columns: 23
$ user_id             <dbl> 21788339, 21788339, 25816024, 36327895, 36327895, …
$ tweet_id            <chr> "911702953324457984", "911715368904691712", "91166…
$ createdAt           <dttm> 2017-09-23 23:24:57, 2017-09-24 00:14:17, 2017-09…
$ possiblySensitive   <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, F…
$ source              <chr> "<a href=\"http://instagram.com\" rel=\"nofollow\"…
$ text                <chr> "Letzter Infostand in #Flensburg vor der #btw2017 …
$ isRetweet           <lgl> FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE,…
$ name                <chr> "Christian R.M. Lucks", "Christian R.M. Lucks", "G…
$ screenName          <chr> "crmkoch", "crmkoch", "GabiHillerOhm", "SoenkeRix"…
$ error               <chr> "Not Found Error", NA, "Not Found Error", "Not Fou…
$ in_reply_to_user_id <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
$ lead1_dist          <dbl> 91, NA, NA, 112, 122, 121, 117, NA, 58, NA
$ lead2_dist          <dbl> NA, NA, NA, 1

In [10]:
# write the final annotated dataframe (fdata) to disk
saveRDS(object = fdata, file = "data/result_df.rds")