In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import scipy.stats as stats


In [2]:
# Load the comment-level and submission-level tables from the working directory.
# lineterminator='\n' tells the parser to break rows on '\n' only
# (presumably so stray carriage returns inside text fields do not split rows -- confirm).
comments_df, submissions_df = (
    pd.read_csv(csv_path, lineterminator='\n')
    for csv_path in ("comments.csv", "submissions.csv")
)


In [3]:

# Split the scaled popularity score into one Series per sentiment label
# (1 -> positive, 0 -> neutral, -1 -> negative).
score_column = 'scaled_popularity_score'
sentiment_labels = submissions_df['sentiment']

positive_popularity = submissions_df.loc[sentiment_labels == 1, score_column]
neutral_popularity = submissions_df.loc[sentiment_labels == 0, score_column]
negative_popularity = submissions_df.loc[sentiment_labels == -1, score_column]

# The analysis treats ANOVA's normality assumption as acceptable because each
# group is large; the group sizes (non-null counts) are printed as a check.
for message, scores in (
    ("Number of positive samples:", positive_popularity),
    ("Number of neutral samples:", neutral_popularity),
    ("Number of negative samples:", negative_popularity),
):
    print(message, scores.count())

print("\n\n")

print("Result of ANOVA test: \n\n The p value is: ")

# One-way ANOVA across the three sentiment groups; only the p-value is reported.
anova_result = stats.f_oneway(positive_popularity, neutral_popularity, negative_popularity)
print(anova_result.pvalue)





Number of positive samples: 22136
Number of neutral samples: 58305
Number of negative samples: 11990



Result of ANOVA test: 

 The p value is: 
1.774831259377419e-58


In [4]:
# The ANOVA p-value is very small, so follow up with Tukey's HSD to see which
# pairs of sentiment groups have different mean popularity (family-wise alpha 0.05).
tukey = pairwise_tukeyhsd(
    endog=submissions_df['scaled_popularity_score'],
    groups=submissions_df['sentiment'],
    alpha=0.05,
)

print(tukey)

# Conclusion: in the recorded output every pair has reject=True, i.e. all three
# group means differ from one another at the 5% family-wise level.
# NOTE(review): the original note read "no significant results"; presumably it
# meant the mean differences (|meandiff| <= 0.0029) are too small to matter in
# practice -- confirm the intended interpretation.

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower   upper  reject
---------------------------------------------------
    -1      0  -0.0029   0.0 -0.0033 -0.0024   True
    -1      1  -0.0015   0.0  -0.002  -0.001   True
     0      1   0.0014   0.0  0.0011  0.0018   True
---------------------------------------------------


In [5]:
# Compare positive vs. negative submissions directly with Welch's t-test
# (equal_var=False, so the two groups are not assumed to share a variance).
welch_result = stats.ttest_ind(positive_popularity, negative_popularity, equal_var=False)
t_statistic = welch_result.statistic
p_value = welch_result.pvalue
print(p_value)

2.723084834479269e-07


In [6]:
# Comment-based popularity score: the number of non-neutral comments on each
# submission (positive_count + negative_count), on the assumption that neutral
# comments do not add much to the popularity of a post.
#
# NOTE: the score is a raw count; it is NOT divided by num_comments.
# Normalising would first require deciding how to treat submissions with
# num_comments == 0, where the division would produce NaN/inf values.
#
# A previous bare `submissions_df[submissions_df['num_comments'] > 0]` statement
# was removed: its result was never assigned, so it did not filter anything.
submissions_df['comment_based_popularity_score'] = (
    submissions_df['positive_count'] + submissions_df['negative_count']
)



In [7]:

# Split the comment-based popularity score into one Series per sentiment label
# (1 -> positive, 0 -> neutral, -1 -> negative).
comment_score_column = 'comment_based_popularity_score'
sentiment_labels = submissions_df['sentiment']

positive_popularity = submissions_df.loc[sentiment_labels == 1, comment_score_column]
neutral_popularity = submissions_df.loc[sentiment_labels == 0, comment_score_column]
negative_popularity = submissions_df.loc[sentiment_labels == -1, comment_score_column]

# The analysis treats ANOVA's normality assumption as acceptable because each
# group is large; the group sizes (non-null counts) are printed as a check.
for message, scores in (
    ("Number of positive samples:", positive_popularity),
    ("Number of neutral samples:", neutral_popularity),
    ("Number of negative samples:", negative_popularity),
):
    print(message, scores.count())

print("\n\n")

print("Result of ANOVA test: \n\n The p value is: ")

# One-way ANOVA across the three sentiment groups; only the p-value is reported.
anova_result = stats.f_oneway(positive_popularity, neutral_popularity, negative_popularity)
print(anova_result.pvalue)
# In the recorded run this p-value (~2.09e-58) is essentially the same as the
# one obtained for the upvotes-downvotes popularity score (~1.77e-58; marginally
# larger), so the comment-based score separates the groups no better.




Number of positive samples: 22136
Number of neutral samples: 58305
Number of negative samples: 11990



Result of ANOVA test: 

 The p value is: 
2.0882114867422816e-58
