_Does size matter? The effect of Instagram influencer account size on post sentiment and resulting marketing outcomes_

_Master's thesis by Thomas A. Frost_

# Part 2: Hashtag Cleaning

This file performs the data cleaning that is necessary **before** the sentiment analysis with SiEBERT is run.

## 02 - Setup

In [None]:
library(tidyverse)
library(rlist)

## 03 - Data Import

In [None]:
# Read the tab-separated post export.
# quote = "" switches quote handling off, so quotation marks inside a post are
# read as ordinary characters.
# col_types holds one code per column: c = character, n = number,
# T = date-time.
posts <- read_delim(
  file = "../data/Instagram__Posts_corrected.tsv",
  delim = "\t",
  quote = "",
  col_types = "ccnTccnnccccc"
)

## 04 - Split posts into words

In [None]:
# Tokenise every post text on single spaces. strsplit() returns a list, so
# Words becomes a list column with one character vector of words per post.
posts$Words <- strsplit(x = posts$Text, split = " ")

## 05 - Hashtag detection


In [None]:
# Flag which words are hashtags.
#
# strng: character vector of words.
# Returns a logical vector of the same length: TRUE where the first character
# of the word is "#", FALSE otherwise (NA words stay NA).
hashtag_detection <- function(strng) {
  first_char <- substr(strng, 1, 1)
  first_char == "#"
}

# For every post, flag word by word whether the word starts with "#".
posts$Hashtags <- lapply(X = posts$Words, FUN = hashtag_detection)

## 06 - Count hashtags

In [None]:
# Count the number of consecutive ("following") hashtags.
#
# For every word position the function returns the length of the uninterrupted
# run of hashtags that the position belongs to; words that are not hashtags
# get 0. Example: c(FALSE, TRUE, TRUE, FALSE, TRUE) -> 0, 2, 2, 0, 1.
#
# liste: logical vector (or list of logicals) with one hashtag flag per word,
#        as returned by hashtag_detection() for a single post.
# Returns a list with one numeric count per word. An empty list is returned
# when the input is empty or starts with NA (a post without text).

count_hashtags <- function(liste) {
  flags <- unlist(liste)
  if (length(flags) == 0L || is.na(flags[1])) {
    return(list())
  }
  # A missing flag cannot be a hashtag; mapping it to FALSE keeps one count
  # per word instead of failing halfway through the post.
  flags <- flags %in% TRUE
  # rle() yields the runs of equal flags; a run of TRUEs contributes its
  # length to each of its positions, a run of FALSEs contributes 0.
  runs <- rle(flags)
  run_sizes <- as.numeric(runs$lengths) * runs$values
  as.list(rep(run_sizes, times = runs$lengths))
}

# Per post: for every word, the length of the hashtag run it belongs to.
posts$NumHashtags <- lapply(X = posts$Hashtags, FUN = count_hashtags)

# Count all hashtags of a single post, regardless of their position.
#
# liste: logical vector (or list of logicals) with one hashtag flag per word.
# Returns the number of TRUE flags; NA if any flag is NA.

count_all_hashtags <- function(liste) {
  flags <- unlist(liste)
  sum(flags)
}

# Total number of hashtags per post as a plain integer column.
# vapply() guarantees exactly one integer per post and still yields an
# (empty) integer vector for a table without rows, whereas unlist() on an
# empty list gives NULL and would remove the column again.
posts$TotalNumHashtags <- vapply(
  posts$Hashtags,
  count_all_hashtags,
  integer(1)
)

## 07 - decide on hashtag deletion

In [None]:
# Decide word by word whether it is marked for deletion.
#
# liste: list (or vector) of run lengths, one per word.
# Returns a logical vector: TRUE where the run length is at least 2, i.e. the
# word sits in a run of two or more consecutive hashtags.
deletebin <- function(liste) {
  run_lengths <- unlist(liste)
  run_lengths >= 2
}

# Per post: one deletion flag per word.
posts$delete <- lapply(X = posts$NumHashtags, FUN = deletebin)

# Small side table for eyeballing the result, selected by column position.
# NOTE(review): with the 13 imported columns, 14 and 18 should be Words and
# delete; column 6 is presumably the post text - confirm against the data.
check <- posts[c(6, 14, 18)]

### 07.05 - plausibility check

In [None]:
# Plausibility check: render every post as one HTML paragraph in which the
# words marked for deletion are wrapped in <span class="del"> ... </span>.
# The column is built in one pass over Words/delete, so posts without words
# and a table without rows are handled without 1:0 index loops.
posts$html <- map2_chr(
  posts$Words,
  posts$delete,
  function(words, flagged) {
    # A post without text consists of a single NA "word"; skip missing words.
    present <- !is.na(words)
    pieces <- words[present]
    marked <- flagged[present]
    pieces[marked] <- paste('<span class="del">', pieces[marked], '</span> ')
    # Single-space joins reproduce the spacing of repeated paste() calls,
    # including the leading blank in front of <p>.
    paste(c(" <p>", pieces, "</p>"), collapse = " ")
  }
)

# NOTE(review): write_csv() emits a header line and applies CSV quoting, so
# test.html is CSV-wrapped markup rather than a clean HTML file.
output <- as.data.frame(posts$html)
write_csv(output, 'test.html')

## 08 - Rebuild Post Texts

In [None]:
# Rebuild every post text from the words that are NOT marked for deletion.
# The whole column is created at once, which avoids writing element-wise into
# a column that does not exist yet and is safe for posts without words and
# for a table without rows.
posts$finaltext <- map2_chr(
  posts$Words,
  posts$delete,
  function(words, flagged) {
    # A post without text consists of a single NA "word"; skip missing words.
    present <- !is.na(words)
    kept <- words[present][!flagged[present]]
    # Every kept word is preceded by one blank (so a non-empty text starts
    # with a blank); a post without kept words becomes "".
    paste(c("", kept), collapse = " ")
  }
)

## 09 - Export to final dataset file

In [None]:
# Assemble the export table: the cleaned text replaces the original Text
# column and the intermediate helper columns are dropped.
# NOTE(review): html and TotalNumHashtags are not in the drop list and are
# therefore exported - confirm that this is intended for html.
export <- posts %>%
  mutate(Text = finaltext) %>%
  select(-all_of(c("Words", "Hashtags", "NumHashtags", "delete", "finaltext")))

In [None]:
# Write the cleaned data set as TSV; missing values become empty fields.
write_tsv(
  export,
  "../data/Instagram__Posts_corrected_v5.tsv",
  na = ""
)