In [1]:
import pandas as pd

In [2]:
def add_comment_sentiment_counts(comments_df, submissions_df):
    """Attach per-submission comment sentiment counts to ``submissions_df``.

    Comments are grouped by ``link_id`` (the id of the submission they belong
    to) and, for each group, the number of positive (``sentiment == 1``),
    negative (``sentiment == -1``) and neutral (``sentiment == 0``) comments
    is counted. The counts are then left-merged onto ``submissions_df``.

    Parameters
    ----------
    comments_df : pandas.DataFrame
        Needs a string ``link_id`` column and a ``sentiment`` column.
        NOTE: ``link_id`` is modified in place -- every "t3_" is removed --
        so the caller's frame holds the stripped ids afterwards.
    submissions_df : pandas.DataFrame
        Needs an ``id`` column (assumed to hold the submission id without
        the "t3_" prefix -- verify against the data). Not modified.

    Returns
    -------
    pandas.DataFrame
        ``submissions_df`` plus ``positive_count``, ``negative_count`` and
        ``neutral_count``; submissions with no matching comments get 0.
    """
    # Strip the "t3_" prefix BEFORE grouping. Doing it afterwards (as the
    # previous version did) leaves the prefix on the grouped keys, so no key
    # equals a bare submission id and every count ends up as 0.
    comments_df['link_id'] = comments_df['link_id'].str.replace('t3_', '', regex=False)

    # Count positive / negative / neutral comments for each submission id and
    # expose the key as "id" so it lines up with submissions_df.
    comment_counts_df = (
        comments_df.groupby("link_id")
        .agg(
            positive_count=('sentiment', lambda x: (x == 1).sum()),
            negative_count=('sentiment', lambda x: (x == -1).sum()),
            neutral_count=('sentiment', lambda x: (x == 0).sum()),
        )
        .reset_index()
        .rename(columns={"link_id": "id"})
    )

    # Left merge keeps every submission; those without comments get NaN
    # counts, which are replaced by 0 below.
    result_df = submissions_df.merge(comment_counts_df, on="id", how="left")

    return result_df.fillna({'positive_count': 0, 'negative_count': 0, 'neutral_count': 0})


In [3]:

# Pivots the sentiment counts into separate columns (one per sentiment), filling missing values with 0.
def aggregate_sentiment_counts(comments_df):
    """Count comments per ``link_id`` for each sentiment class.

    Parameters
    ----------
    comments_df : pandas.DataFrame
        Needs ``link_id`` and ``sentiment`` columns, with ``sentiment``
        taking the values -1, 0 or 1.

    Returns
    -------
    pandas.DataFrame
        One row per ``link_id`` with the columns ``link_id``,
        ``negative_count``, ``neutral_count`` and ``positive_count``.
        A class with no comments for a given ``link_id`` is reported as 0.
    """
    sentiment_counts = (
        comments_df.groupby(['link_id', 'sentiment'])
        .size()
        .unstack(fill_value=0)
        # Force all three sentiment columns to exist, in a fixed order.
        # Without this, a class that never occurs in the data is missing
        # after unstack() and the positional renaming below raises a
        # length-mismatch ValueError.
        .reindex(columns=[-1, 0, 1], fill_value=0)
    )
    sentiment_counts.columns = ['negative_count', 'neutral_count', 'positive_count']
    return sentiment_counts.reset_index()


In [4]:
#adjust these boundaries as needed 

def round_sentiment(score, threshold=0.25):
    """Map a continuous sentiment score to a discrete label.

    Parameters
    ----------
    score : float
        Continuous sentiment score.
    threshold : float, optional
        Cut-off for the neutral band (default 0.25). Scores strictly above
        ``threshold`` are positive, scores strictly below ``-threshold`` are
        negative, everything else is neutral.

    Returns
    -------
    int
        1 (positive), -1 (negative) or 0 (neutral). A NaN score fails both
        comparisons and is therefore labelled 0.
    """
    if score > threshold:
        return 1
    if score < -threshold:
        return -1
    return 0


In [5]:
# Load the raw comment and submission exports.
comments_df = pd.read_csv("comments.csv", lineterminator='\n')
submissions_df = pd.read_csv("submissions.csv", lineterminator='\n')

# Collapse the continuous NLTK score into a discrete label (-1, 0 or 1).
comments_df = comments_df.assign(
    sentiment=comments_df['sentiment_nltk'].apply(round_sentiment)
)

# First pass: attach the per-submission sentiment counts to submissions_df.
# This call must stay ahead of aggregate_sentiment_counts below, because it
# strips the "t3_" prefix from comments_df['link_id'] in place and the second
# pass relies on the stripped ids to match submissions_df['id'].
submissions_df = add_comment_sentiment_counts(comments_df, submissions_df)

# Second pass: count each sentiment per link_id, expose the key as "id" and
# left-join onto submissions_df. Both sides carry positive/neutral/negative
# count columns, so the merge suffixes them with "_x" (left) and "_y" (right).
submissions = submissions_df.merge(
    aggregate_sentiment_counts(comments_df).rename(columns={'link_id': 'id'}),
    how="left",
    on="id",
)


In [6]:
#drop unnecessary columns added by the merge 
# Discard the "_x" count columns (left side of the merge) and keep the "_y"
# ones (right side) under descriptive names.
submissions = (
    submissions
    .drop(columns=['positive_count_x', 'neutral_count_x', 'negative_count_x'])
    .rename(columns={
        'positive_count_y': 'positive_comment_count',
        'neutral_count_y': 'neutral_comment_count',
        'negative_count_y': 'negative_comment_count',
    })
    # Rows with no match on the right side of the left merge have NaN counts.
    # Fill ONLY the count columns: a blanket fillna(0) would also overwrite
    # missing values in every other submission column with the number 0.
    .fillna({
        'positive_comment_count': 0,
        'neutral_comment_count': 0,
        'negative_comment_count': 0,
    })
)