_Does size matter? The effect of Instagram influencer account size on post sentiment and resulting marketing outcomes_

_Master's thesis by Thomas A. Frost_

# Part 0: SocialBlade Scraper

It is necessary to download the SocialBlade pages as HTML files and save them in the corresponding folder (`socialblade_data/`). The GitHub folder contains a set of downloaded profiles.

This file serves two purposes:

1. Determine suitable accounts for study 2 by manually reviewing all graphs of total media count per user over time (section 5 "profile selection")
1. Data collection for the study 2 datasets (sections 6-8)

## 01 - Library Installs

In [None]:
install.packages('rvest')
install.packages('textclean')
install.packages('xml2')
install.packages('tidyverse')
install.packages('jsonlite')

## 02 - Loading

In [None]:
library(xml2)
library(tidyverse)
library(rvest)
library(textclean)
library(jsonlite)

## 03 - Functions

In [None]:
# Extract the data series of one Highcharts diagram from a saved SocialBlade
# page.
#
# Arguments:
#   file      - HTML input handed to read_html(); the callers in this file
#               pass a path below 'socialblade_data/'.
#   diagrname - name of the wanted diagram, i.e. the quoted first argument of
#               its Highcharts.chart('...') call.
#
# Returns: the diagram's `data: [[...]]` array, parsed with fromJSON() and
#   converted with as.data.frame() (the callers in this file read the
#   resulting columns V1 and V2).
#
# Stops with an error when the page does not contain exactly one script tag
# mentioning Highcharts, when no diagram definition is found, when diagram
# names and definitions cannot be paired, or when `diagrname` does not
# identify exactly one diagram that has a data series.
getSBFromHTML <- function(file, diagrname) {
  # read HTML file and collect the text of every script tag in the body
  html_read <- read_html(file)
  scripts <- html_elements(x = html_read, css = 'body script')
  script_text <- html_text(scripts)

  # keep the script tag with Highcharts data and clean it with replace_white()
  script <- replace_white(script_text[grepl('Highcharts', script_text)])
  if (length(script) != 1) {
    stop(
      "expected exactly one script tag mentioning Highcharts, found ",
      length(script),
      call. = FALSE
    )
  }

  # locate the opener "Highcharts.chart(... {" of every diagram definition
  chart_pos <- gregexec('Highcharts\\.chart\\(.*?\\{', script)[[1]]
  chart_start <- as.vector(chart_pos)
  if (chart_start[1] == -1) {
    stop("no Highcharts.chart() call found in the script", call. = FALSE)
  }
  chart_len <- as.vector(attr(chart_pos, "match.length"))

  # a definition runs from its opening brace to 4 characters before the next
  # Highcharts.chart( call; the last one ends 7 characters before the end of
  # the script
  function_calls <- substring(
    text = script,
    first = chart_start + chart_len - 1,
    last = c(chart_start[-1] - 4, nchar(script) - 7)
  )

  # diagram names: the text between the quotes of Highcharts.chart('...'
  # (18 = nchar("Highcharts.chart('"), -2 drops the closing quote)
  name_pos <- gregexec("Highcharts\\.chart\\('.*?'", script)[[1]]
  name_start <- as.vector(name_pos)
  name_len <- as.vector(attr(name_pos, "match.length"))
  chart_names <- substring(
    text = script,
    first = name_start + 18,
    last = name_start + name_len - 2
  )
  if (length(chart_names) != length(function_calls)) {
    stop(
      "found ", length(function_calls), " diagram definitions but ",
      length(chart_names), " diagram names",
      call. = FALSE
    )
  }

  # first "data: [[...]]" series of each definition; regexpr() yields exactly
  # one position per definition (-1 if none), so data and names stay aligned
  data_pos <- regexpr('data: \\[\\[.*?\\]\\]', function_calls)
  data_start <- as.vector(data_pos)
  # the match covers data_start .. data_start + match.length - 1; the original
  # code read one character past the closing "]]"
  data_end <- data_start + attr(data_pos, "match.length") - 1
  chart_data <- substring(
    text = function_calls,
    first = data_start + 6, # skip the 6 characters of "data: "
    last = data_end
  )

  # pick the requested diagram
  selected <- which(chart_names == diagrname)
  if (length(selected) != 1) {
    stop(
      "expected exactly one diagram named '", diagrname, "', found ",
      length(selected),
      call. = FALSE
    )
  }
  if (data_start[selected] == -1) {
    stop(
      "diagram '", diagrname, "' has no 'data: [[...]]' series",
      call. = FALSE
    )
  }

  # return diagram data as R data frame
  as.data.frame(fromJSON(chart_data[selected]))
}

# Classify follower counts into influencer size tiers.
#
# Arguments:
#   followers - numeric vector of follower counts (a single value works too).
#
# Returns: a character vector with one entry per element of `followers`:
#   NA         for counts up to 1,000 and for missing counts
#   "nano"     for counts above 1,000     up to 10,000
#   "mikro"    for counts above 10,000    up to 50,000
#   "midtier"  for counts above 50,000    up to 500,000
#   "makro"    for counts above 500,000   up to 1,000,000
#   "mega"     for counts above 1,000,000
getsize <- function(followers) {
  # cut() is vectorised, unlike the scalar `&&` comparisons used before;
  # right = TRUE makes every interval (lower, upper], so a count equal to a
  # boundary belongs to the lower tier and 1,000 itself maps to NA
  tiers <- cut(
    as.numeric(followers),
    breaks = c(1000, 10000, 50000, 500000, 1000000, Inf),
    labels = c("nano", "mikro", "midtier", "makro", "mega"),
    right = TRUE
  )
  as.character(tiers)
}


# Plot the number of media items in a user's profile over time.
# Reads 'socialblade_data/<user>.htm' through getSBFromHTML() and returns a
# ggplot object with one point per entry of the media diagram, joined by a
# line.
getMediaPlot <- function(user) {
  source_file <- paste0('socialblade_data/', user, '.htm')
  media <- getSBFromHTML(source_file, 'graph-instagram-monthly-media-container')

  # V1 is divided by 1000 and read as seconds since 1970-01-01
  media$V1 <- as.Date(as.POSIXct(media$V1 / 1000, origin = "1970-01-01"))

  ggplot(data = media, aes(x = V1, y = V2)) +
    geom_point() +
    geom_line() +
    labs(
      x = "Date",
      y = "Number of posts online",
      title = paste("graph for user", user)
    ) +
    theme_light()
}


# Function to return the follower history of a specific user.
# Reads 'socialblade_data/<user>.htm' through getSBFromHTML() and returns the
# followers diagram as a data frame with
#   V1       - date of the entry
#   V2       - value of the followers diagram at that date
#   size     - size tier of V2 as classified by getsize()
#   Username - the `user` argument
getFollowers <- function(user) {
  df <- getSBFromHTML(
    paste0('socialblade_data/', user, '.htm'),
    'graph-instagram-monthly-followers-container'
  )
  # V1 is divided by 1000 and read as seconds since 1970-01-01
  df$V1 <- as.Date(as.POSIXct(df$V1 / 1000, origin = "1970-01-01"))
  # vapply() always yields a character vector; sapply() returned a list for an
  # empty series, which silently turned `size` into a list column
  df$size <- vapply(df$V2, getsize, character(1))
  df$Username <- user
  df
}

# Followee ("following") history of a single user.
# Reads 'socialblade_data/<user>.htm' through getSBFromHTML() and returns the
# following diagram as a data frame: V1 converted to a date, V2 as delivered
# by the diagram, plus a Username column holding `user`.
getFollowees <- function(user) {
  source_file <- paste0('socialblade_data/', user, '.htm')
  followees <- getSBFromHTML(
    source_file,
    'graph-instagram-monthly-following-container'
  )

  # V1 is divided by 1000 and read as seconds since 1970-01-01
  epoch_seconds <- followees$V1 / 1000
  followees$V1 <- as.Date(as.POSIXct(epoch_seconds, origin = "1970-01-01"))
  followees$Username <- user
  followees
}

# Media count history of a single user (number of media items in the profile
# over time).
# Reads 'socialblade_data/<user>.htm' through getSBFromHTML() and returns the
# media diagram as a data frame: V1 converted to a date, V2 as delivered by
# the diagram, plus a Username column holding `user`.
getMediaCount <- function(user) {
  source_file <- paste0('socialblade_data/', user, '.htm')
  media <- getSBFromHTML(
    source_file,
    'graph-instagram-monthly-media-container'
  )

  # V1 is divided by 1000 and read as seconds since 1970-01-01
  epoch_seconds <- media$V1 / 1000
  media$V1 <- as.Date(as.POSIXct(epoch_seconds, origin = "1970-01-01"))
  media$Username <- user
  media
}

## 04 - Import
