In [1]:
library(dplyr)
library(tidyr)
library(ggplot2)
library(data.table)
library(lme4)
library(lmerTest)
library(MatchIt)
#library(optmatch)
library(effects)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last


Loading required package: Matrix


Attaching package: ‘Matrix’


The following objects are masked from ‘package:tidyr’:

    expand, pack, unpack


Loading required package: carData

lattice theme set by effectsTheme()
See ?effectsTheme for details.



In [2]:
setwd('/shared/3/projects/style-influence')

In [3]:
remove_outlier <- function(x, thr=10){
    iqr = IQR(x, na.rm=T)
    lo = quantile(x, 0.25, na.rm=T) - thr*iqr
    hi = quantile(x, 0.75, na.rm=T) + thr*iqr
    x[x < lo] = lo
    x[x > hi] = hi
    x
}

In [4]:
offensive = lapply(2019:2022, 
                   function(y) fread(paste0('data/final_samples/offensive_',y,'.tsv'), sep='\t'))
offensive = rbindlist(offensive)                   
                   
offensive$num_tokens <- unlist(lapply(offensive$tokens, function(x) length(strsplit(x,',')[[1]])))
offensive$parent_num_tokens <- unlist(lapply(offensive$tokens_parent, function(x) length(strsplit(x,',')[[1]])))

In [None]:
scores = fread('working-dir/all_scores.offensive.csv')
s = scores %>% dplyr::select(comment_id = column_id, offensive = score)
setkey(s, comment_id)
setkey(offensive, comment_id)
offensive <- inner_join(offensive, s, by='comment_id')

s = scores %>% dplyr::select(parent_id = column_id, parent_offensive = score)
setkey(s, parent_id)
setkey(offensive, parent_id)
offensive = inner_join(offensive, s, by='parent_id')

In [None]:
offensive <- offensive %>%
    mutate(#offensive = scale(offensive),
           #parent_offensive = scale(parent_offensive),
           num_tokens = scale(remove_outlier(as.numeric(num_tokens))),
           parent_num_tokens = scale(remove_outlier(as.numeric(parent_num_tokens))),
           score = remove_outlier(as.numeric(score)),
           parent_score = remove_outlier(as.numeric(parent_score)),
           depth = remove_outlier(as.numeric(depth)),
           max_depth = remove_outlier(as.numeric(max_depth)),
           controversiality = as.integer(controversiality),
           parent_controversiality = as.integer(parent_controversiality)
          )

In [None]:
setkey(offensive, comment_id)
offensive$style = offensive$offensive
offensive$parent_style = offensive$parent_offensive

In [625]:
mod <- lm(style ~ parent_style + 
          num_tokens + parent_num_tokens + depth, offensive)
summary(mod)


Call:
lm(formula = style ~ parent_style + num_tokens + parent_num_tokens + 
    depth, data = offensive)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.34778 -0.03609 -0.03015 -0.01246  0.93231 

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)        3.349e-02  9.766e-05  342.96   <2e-16 ***
parent_style       3.214e-01  3.706e-04  867.19   <2e-16 ***
num_tokens         6.089e-03  5.304e-05  114.79   <2e-16 ***
parent_num_tokens -3.883e-03  5.302e-05  -73.24   <2e-16 ***
depth              8.247e-04  2.742e-05   30.08   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.1142 on 5663240 degrees of freedom
Multiple R-squared:  0.1191,	Adjusted R-squared:  0.1191 
F-statistic: 1.914e+05 on 4 and 5663240 DF,  p-value: < 2.2e-16


In [4]:
formality = fread('data/final_samples/formality_allmetrics.tsv', sep='\t')

In [5]:
formality <- formality[controversiality %in% c('0','0.0','1','1.0'),]
formality <- formality %>%
    mutate(#formality = scale(formality),
           #parent_formality = scale(parent_formality),
           num_parent_markers = remove_outlier(num_parent_markers),
           num_markers = remove_outlier(num_markers),
           parent_num_markers = remove_outlier(parent_num_markers),
           num_tokens = scale(remove_outlier(as.numeric(num_tokens))),
           parent_num_tokens = scale(remove_outlier(as.numeric(parent_num_tokens))),
           score = remove_outlier(as.numeric(score)),
           parent_score = remove_outlier(as.numeric(parent_score)),
           depth = remove_outlier(as.numeric(depth)),
           max_depth = remove_outlier(as.numeric(max_depth)),
           controversiality = as.integer(controversiality),
           parent_controversiality = as.integer(parent_controversiality),
           banned = ifelse(banned, 1, 0),
           wait_time = remove_outlier(created_utc - parent_created_utc)/60/24
          )
nrow(formality)


In [6]:
#formality$style <- formality$num_parent_markers
formality$style <- formality$num_markers
formality$parent_style <- formality$parent_num_markers

#formality$style <- formality$formality
#formality$parent_style <- formality$parent_formality

In [7]:
mod <- lm(style ~ parent_style + 
          num_tokens + parent_num_tokens + depth, formality)
summary(mod)
#hist(mod$residuals)


Call:
lm(formula = style ~ parent_style + num_tokens + parent_num_tokens + 
    depth, data = formality)

Residuals:
     Min       1Q   Median       3Q      Max 
-195.327   -1.161    0.004    1.310   91.134 

Coefficients:
                    Estimate Std. Error  t value Pr(>|t|)    
(Intercept)       14.5602056  0.0043291  3363.36   <2e-16 ***
parent_style       0.1755719  0.0002263   775.72   <2e-16 ***
num_tokens        25.0905566  0.0011035 22736.79   <2e-16 ***
parent_num_tokens -4.2619573  0.0058987  -722.53   <2e-16 ***
depth              0.0016327  0.0001351    12.09   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.912 on 16893008 degrees of freedom
Multiple R-squared:  0.9766,	Adjusted R-squared:  0.9766 
F-statistic: 1.762e+08 on 4 and 16893008 DF,  p-value: < 2.2e-16


In [None]:
con_ids = unique(paste(formality$author,formality$subreddit)[formality$controversiality==1])
con_ids = intersect(con_ids, unique(paste(formality$author,formality$subreddit)[formality$controversiality==0]))
ban_ids = unique(formality$author[formality$banned==1])
sum(paste(formality$author,formality$subreddit) %in% con_ids)

In [None]:
library(optmatch)
library(MatchIt)
#https://static1.squarespace.com/static/5d54a19a5a1edf0001ea677a/t/627b1c2fe12ee57a626635a7/1652235311535/Matching_Guide_pt1.pdf
em_region <- exactMatch(x = controversiality ~ author + subreddit, 
                        data = formality %>% filter(paste(author,subreddit) %in% con_ids))


In [None]:
euc_dist_polity <- match_on(x = controversiality ~ num_tokens + depth + created_utc + parent_controversiality,
                            data = formality %>% filter(paste(author,subreddit) %in% con_ids),
                            standardization.scale = NULL,
                            method = "euclidean")
euc_dist_polity_cal_2 <- euc_dist_polity + caliper(x = euc_dist_polity, width = 2)

In [8]:
#formality$style <- formality$num_parent_markers
#formality$style <- formality$num_markers
#formality$parent_style <- formality$parent_num_markers

formality$style <- formality$formality
formality$parent_style <- formality$parent_formality

In [9]:
ids = unique(formality$root_id[formality$controversiality==1 & formality$depth>=3 & 
                               formality$max_depth>=formality$depth+2])
length(ids)

In [11]:
cont_threads <- formality[root_id %in% ids,] %>%
    group_by(root_id) %>%
    mutate(cont_id = min(depth[controversiality==1 & depth>=3])) %>% 
    filter(depth <= cont_id+2 & depth >= cont_id-2) %>%
    mutate(depth_relative = factor(depth - cont_id, levels = c(0, -2, -1, 1, 2))) %>%
    select(root_id, author, controversiality, depth_relative,
           style, parent_style, num_markers, parent_num_markers, formality, parent_formality,
           num_tokens, parent_num_tokens, depth, parent_controversiality, cont_id)
head(cont_threads)

root_id,author,controversiality,depth_relative,style,parent_style,num_markers,parent_num_markers,formality,parent_formality,num_tokens,parent_num_tokens,depth,parent_controversiality,cont_id
<chr>,<chr>,<int>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>
eq42v6h,redhatmodteam,0,2,0.527,0.381,12,23,0.527,0.381,-0.3361925,0.13180661,8,0,6
eq42v6h,ShinningPeadIsAnti,0,1,0.381,0.647,23,3,0.381,0.647,0.1608381,-0.5711979,7,1,6
eq42v6h,redhatmodteam,1,0,0.647,0.593,3,21,0.647,0.593,-0.5639982,0.07154908,6,0,6
eq42v6h,ShinningPeadIsAnti,0,-1,0.593,0.378,21,7,0.593,0.378,0.0987093,-0.47076868,5,0,6
eq42v6h,redhatmodteam,0,-2,0.378,0.717,7,13,0.378,0.717,-0.4604501,-0.02888013,4,0,6
eq4mu5b,neihofft,0,2,0.829,0.516,41,34,0.829,0.516,0.906384,0.67412438,6,1,4


In [12]:
cont_ids = cont_threads %>% filter(depth == cont_id)
nrow(cont_ids)

In [13]:
setkey(formality,author)
Sys.time()
rows <- lapply(1:nrow(cont_ids),
function(i){
    s <- formality[cont_ids$author[i],][depth >= 3 & max_depth >= depth+2 & controversiality == 0,]
    if(nrow(s)>0) s <- s %>% filter(abs(depth - cont_ids$cont_id[i]) == min(abs(depth - cont_ids$cont_id[i]))) 
    if(nrow(s)>1) s <- s %>% sample_n(1)
    s
    })
Sys.time()


[1] "2023-04-23 16:42:31 EDT"

[1] "2023-04-23 18:03:40 EDT"

In [14]:
setkey(formality, root_id)
Sys.time()
rows <- lapply(rows,
function(s){
    if(nrow(s)==1){
        s <- formality[s$root_id,][depth>=s$depth-2 & depth<=s$depth+2,] 
        s <- s %>% 
                group_by(root_id) %>%
                mutate(cont_id = median(depth),
                       depth_relative = depth - cont_id) %>%
                select(root_id, author, controversiality, depth_relative,
                       style, parent_style, num_markers, parent_num_markers, formality, parent_formality,
                       num_tokens, parent_num_tokens, depth, parent_controversiality, cont_id)
    }else{
        s <- data.table()
    }
    s
})
Sys.time()


[1] "2023-04-23 18:03:49 EDT"

[1] "2023-04-23 18:22:07 EDT"

In [15]:
df = rbindlist(rows) %>% 
        mutate(depth_relative = factor(depth_relative, levels=c(0,-2,-1,1,2)),
               type = 'control')

df <- rbind(df,
            cont_threads %>% 
                filter(root_id %in% cont_ids$root_id[author %in% df$author]) %>% 
                mutate(type = 'treat')
            )

In [16]:
summary(df$depth_relative)

In [50]:
df <- df %>% 
left_join(unique(formality %>% select(root_id, subreddit)), by='root_id')

In [67]:
#df$style <- df$num_parent_markers
df$style <- df$num_markers
df$parent_style <- df$parent_num_markers

df$style <- df$formality
df$parent_style <- df$parent_formality

In [72]:
mod <- lm(style ~ parent_style * type * depth_relative + 
          num_tokens + parent_num_tokens + depth,
            df %>% filter(depth_relative %in% c(-1,1)) #%>% mutate(depth_relative = relevel(depth_relative, '-2'))
         )
summary(mod)


Call:
lm(formula = style ~ parent_style * type * depth_relative + num_tokens + 
    parent_num_tokens + depth, data = df %>% filter(depth_relative %in% 
    c(-1, 1)))

Residuals:
     Min       1Q   Median       3Q      Max 
-0.62523 -0.24921 -0.05756  0.24312  0.60599 

Coefficients:
                                         Estimate Std. Error t value Pr(>|t|)
(Intercept)                             0.4043695  0.0028420 142.283  < 2e-16
parent_style                            0.1344238  0.0042131  31.906  < 2e-16
typetreat                               0.0137290  0.0035398   3.878 0.000105
depth_relative1                        -0.0036206  0.0035006  -1.034 0.300997
num_tokens                              0.0265824  0.0006825  38.947  < 2e-16
parent_num_tokens                       0.0053012  0.0007515   7.054 1.74e-12
depth                                   0.0009279  0.0004538   2.045 0.040902
parent_style:typetreat                 -0.0152653  0.0063287  -2.412 0.015863
parent_sty

In [73]:
mod <- lmer(style ~ parent_style * type * depth_relative + 
          num_tokens + parent_num_tokens + depth + 
            (1|author) + (1|subreddit),
            df %>% filter(depth_relative %in% c(-1,1)) #%>% mutate(depth_relative = relevel(depth_relative, '-2'))
         )
summary(mod)

Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: 
style ~ parent_style * type * depth_relative + num_tokens + parent_num_tokens +  
    depth + (1 | author) + (1 | subreddit)
   Data: df %>% filter(depth_relative %in% c(-1, 1))

REML criterion at convergence: 74082.3

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.7026 -0.7040 -0.1599  0.6938  2.8961 

Random effects:
 Groups    Name        Variance Std.Dev.
 author    (Intercept) 0.021006 0.14493 
 subreddit (Intercept) 0.004259 0.06526 
 Residual              0.067706 0.26020 
Number of obs: 198941, groups:  author, 68717; subreddit, 9534

Fixed effects:
                                         Estimate Std. Error         df t value
(Intercept)                             4.175e-01  3.048e-03  6.660e+04 136.973
parent_style                            8.939e-02  3.999e-03  1.897e+05  22.353
typetreat                               1.018e-02  3.303e-03  1.848e+05   3.083
dept

In [27]:
nrow(df)