In [8]:
import pandas as pd
import os

In [28]:
def add_comment_sentiment_counts(comments_df, submissions_df):
    """Attach per-submission comment sentiment counts to the submissions frame.

    Comments are grouped by ``link_id`` (the id of the submission they belong
    to) and, for each group, the number of ``sentiment_rounded`` values equal
    to 1 (positive), -1 (negative) and 0 (neutral) is counted.

    Parameters
    ----------
    comments_df : pandas.DataFrame
        Needs a string ``link_id`` column and a ``sentiment_rounded`` column.
        Side effect: ``link_id`` is rewritten in place with every ``"t3_"``
        occurrence removed, so the caller's frame ends up with bare ids.
    submissions_df : pandas.DataFrame
        Needs an ``id`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``submissions_df`` left-merged with ``positive_count``,
        ``negative_count`` and ``neutral_count``. Submissions that have no
        comments get 0 in all three columns.
    """
    # Strip the "t3_" prefix BEFORE grouping. Grouping first would leave the
    # prefix on the group keys, so (with un-prefixed submission ids) the merge
    # on "id" below would match nothing and every count would be filled with 0.
    comments_df['link_id'] = comments_df['link_id'].str.replace('t3_', '', regex=False)

    # Count the positive/negative/neutral comments for each unique link_id.
    comment_counts_df = comments_df.groupby("link_id").agg(
        positive_count=('sentiment_rounded', lambda x: (x == 1).sum()),
        negative_count=('sentiment_rounded', lambda x: (x == -1).sum()),
        neutral_count=('sentiment_rounded', lambda x: (x == 0).sum())
    ).reset_index()

    # Rename "link_id" to "id" so both frames share the merge key.
    comment_counts_df = comment_counts_df.rename(columns={"link_id": "id"})

    # Left merge keeps every submission, including those without comments.
    result_df = submissions_df.merge(comment_counts_df, on="id", how="left")

    # Submissions without any comment have NaN counts after the merge.
    result_df = result_df.fillna(
        {'positive_count': 0, 'negative_count': 0, 'neutral_count': 0}
    )

    return result_df


In [29]:

# Pivot the sentiment counts into separate columns, one per sentiment label,
# filling missing combinations with 0.
def aggregate_sentiment_counts(comments_df):
    """Count comments per ``link_id`` and sentiment label, one column per label.

    Parameters
    ----------
    comments_df : pandas.DataFrame
        Needs ``link_id`` and ``sentiment_rounded`` columns, where
        ``sentiment_rounded`` holds the labels -1, 0 and 1.

    Returns
    -------
    pandas.DataFrame
        One row per ``link_id`` with the columns ``link_id``,
        ``negative_count``, ``neutral_count`` and ``positive_count``.
        A label that never occurs in the data yields a column of zeros.
        Labels other than -1, 0 and 1 are not counted.
    """
    sentiment_counts = (
        comments_df.groupby(['link_id', 'sentiment_rounded'])
        .size()
        .unstack(fill_value=0)
        # Force exactly the three expected label columns in a fixed order.
        # Without this, data that lacks one label would produce fewer than
        # three columns and the positional renaming below would raise.
        .reindex(columns=[-1, 0, 1], fill_value=0)
    )
    sentiment_counts.columns = ['negative_count', 'neutral_count', 'positive_count']
    return sentiment_counts.reset_index()


In [30]:
#adjust these boundaries as needed 

def round_sentiment(score, positive_threshold=0.25, negative_threshold=-0.25):
    """Bucket a continuous sentiment score into -1, 0 or 1.

    Parameters
    ----------
    score : float
        The sentiment score to classify.
    positive_threshold : float, optional
        Scores strictly greater than this are positive. Defaults to 0.25.
    negative_threshold : float, optional
        Scores strictly less than this are negative. Defaults to -0.25.

    Returns
    -------
    int
        1 for positive, -1 for negative, 0 otherwise. A score exactly on a
        threshold is neutral. NaN compares false against both thresholds, so
        a NaN score is also reported as 0.
    """
    if score > positive_threshold:
        return 1
    if score < negative_threshold:
        return -1
    return 0
# Read every CSV file in a directory and load them into one single dataframe.
def load_csv_files(directory):
    """Concatenate all ``*.csv`` files found directly inside ``directory``.

    Files are read in sorted filename order so the row order of the result
    is reproducible between runs. Malformed lines are skipped by the parser.
    Files that cannot be parsed, or that are completely empty, are reported
    on stdout and left out.

    Parameters
    ----------
    directory : str
        Path of the directory to scan (not recursive).

    Returns
    -------
    pandas.DataFrame
        All readable files stacked on top of each other with a fresh index.

    Raises
    ------
    ValueError
        If the directory yields no readable CSV file.
    """
    data_frames = []
    # os.listdir gives no ordering guarantee, so sort the names explicitly.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".csv"):
            continue
        file_path = os.path.join(directory, filename)
        try:
            # on_bad_lines='skip' drops malformed rows instead of failing.
            data_frames.append(pd.read_csv(file_path, on_bad_lines='skip'))
        except (pd.errors.ParserError, pd.errors.EmptyDataError) as e:
            print(f"Error reading {file_path}: {e}")
    if not data_frames:
        # pd.concat([]) would raise a ValueError too, but without saying which
        # directory was empty.
        raise ValueError(f"No readable CSV files found in {directory!r}")
    return pd.concat(data_frames, ignore_index=True)




In [32]:
# Previous single-file loading, currently disabled:
#comments_df = pd.read_csv("../project/cmpt353-project/data/comments.csv", lineterminator='\n')
#submissions_df = pd.read_csv("../project/cmpt353-project/data/submissions.csv", lineterminator='\n')
submissions_df = load_csv_files('submissions_cleaned')
comments_df = load_csv_files('comments_cleaned')

# Comments: coerce the score to a number (unparseable values become NaN),
# then bucket it into -1, 0 or 1.
comments_df['sentiment_score'] = pd.to_numeric(comments_df['sentiment_score'], errors='coerce')
comments_df['sentiment_rounded'] = comments_df['sentiment_score'].apply(round_sentiment)

# Submissions: same treatment.
submissions_df['sentiment_score'] = pd.to_numeric(submissions_df['sentiment_score'], errors='coerce')
submissions_df['sentiment_rounded'] = submissions_df['sentiment_score'].apply(round_sentiment)

# Add the comment sentiment counts to the submissions dataframe.
# This call also strips "t3_" from comments_df['link_id'] in place.
submissions_df = add_comment_sentiment_counts(comments_df, submissions_df)

# Count each sentiment per link_id, rename the key to "id" and left-join the
# result onto submissions_df. Both sides carry *_count columns, so the merged
# frame gets "_x" (left) and "_y" (right) suffixed versions of them.
submissions = submissions_df.merge(
    aggregate_sentiment_counts(comments_df).rename(columns={'link_id': 'id'}),
    how="left",
    on="id",
)


In [33]:
# The merge that built `submissions` produced each count column twice:
# "*_x" from the left frame and "*_y" from the right frame. Drop the "_x"
# copies and give the "_y" copies their final "*_comment_count" names.
submissions = (
    submissions
    .drop(columns=['positive_count_x', 'neutral_count_x', 'negative_count_x'])
    .rename(columns={
        'positive_count_y': 'positive_comment_count',
        'neutral_count_y': 'neutral_comment_count',
        'negative_count_y': 'negative_comment_count',
    })
    # NOTE(review): this replaces NaN with 0 in EVERY column of the frame,
    # not only in the three count columns -- confirm that is intended.
    # The count columns are covered by this fill, so no separate per-column
    # fill is needed afterwards.
    .fillna(0)
)

In [34]:
# Function to add a popularity score column.
def add_popularity_score(submissions_df):
    """Return a copy of ``submissions_df`` with a ``popularity_score`` column.

    A post's total popularity is ``num_comments + score``. Its popularity
    score is that total divided by the mean total of all posts in the same
    subreddit, so 1.0 means "as popular as the subreddit's average post".

    Parameters
    ----------
    submissions_df : pandas.DataFrame
        Needs ``num_comments``, ``score`` and ``subreddit`` columns. The
        frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The input columns followed by ``popularity_score``, rows in the
        input order, with a fresh 0..n-1 index. A subreddit whose average
        total is 0 produces NaN or infinite scores (division by zero).
    """
    # Work on local Series so no helper column is written into the caller's
    # frame.
    total_popularity = submissions_df['num_comments'] + submissions_df['score']

    # Per-row subreddit mean, aligned with the input rows (replaces the
    # groupby-then-merge round trip).
    avg_popularity = total_popularity.groupby(submissions_df['subreddit']).transform('mean')

    result_df = submissions_df.copy()
    result_df['popularity_score'] = total_popularity / avg_popularity

    # Fresh 0..n-1 index, as a column-keyed merge would produce.
    return result_df.reset_index(drop=True)

def scale_popularity_scores(submissions_df):
    """Min-max scale ``popularity_score`` into ``scaled_popularity_score``.

    The new column is written into ``submissions_df`` itself and the same
    frame is returned. Scores are mapped linearly so the smallest becomes 0
    and the largest becomes 1. When every score is identical the range is
    zero, so the scaled value is set to 0.0 instead of dividing by zero.
    """
    scores = submissions_df['popularity_score']
    lowest, highest = scores.min(), scores.max()

    if lowest != highest:
        # (score - min) / (max - min)
        scaled = (scores - lowest) / (highest - lowest)
    else:
        # Zero range: avoid the division by zero.
        scaled = 0.0

    submissions_df['scaled_popularity_score'] = scaled
    return submissions_df



In [37]:

# Copy the comment sentiment counts from `submissions` onto submissions_df.
# All three counts are copied so the saved file is consistent; copying only
# positive and negative would leave neutral_count with its earlier value.
submissions_df['positive_count'] = submissions['positive_comment_count']
submissions_df['negative_count'] = submissions['negative_comment_count']
submissions_df['neutral_count'] = submissions['neutral_comment_count']

# Save to CSV.
submissions_df.to_csv('submissions_cleaned_linked.csv', index=False)
comments_df.to_csv('comments_cleaned_linked.csv', index=False)