In [108]:
import json
import pandas as pd

In [109]:
# Load the 2023 granular-opinions table from the gold data layer.
opinions_path = "data/gold/granular-opinions-2023.parquet"
df = pd.read_parquet(opinions_path)

In [110]:
# Keep only the most frequently mentioned aspects: those whose mention count
# lies strictly above the 80th percentile of all per-aspect counts.
aspect_frequency = df['aspect'].value_counts()
threshold_aspect_frequency = aspect_frequency.quantile(0.80)
frequent_aspects = aspect_frequency[aspect_frequency > threshold_aspect_frequency].index

# .copy() makes the filtered frame independent of the one it was sliced from,
# so adding a column below is not a write into a slice (which would raise
# pandas' SettingWithCopyWarning).
df = df[df['aspect'].isin(frequent_aspects)].copy()

# Numeric encoding of the sentiment label; any label that is not a key of
# this mapping becomes NaN.
df["sentiment_score"] = df["sentiment"].map(
    {
        "positive": 1,
        "negative": -1,
        "neutral": 0
    }
)

In [111]:
# Show the columns that feed the statistics computed further down.
preview_columns = [
    "quarter",
    "votes",
    "agreement",
    "aspect",
    "opinion",
    "sentiment",
    "category",
    "sentiment_score",
]
df[preview_columns]

Unnamed: 0,quarter,votes,agreement,aspect,opinion,sentiment,category,sentiment_score
3,2023Q1,3,1.000000,product,great,positive,product quality,1
4,2023Q1,3,1.000000,delivery,late,negative,delivery,-1
5,2023Q2,2,0.666667,delivery,fast,positive,delivery,1
6,2023Q2,2,0.666667,product,good,positive,product quality,1
7,2023Q3,3,1.000000,product,great,positive,product quality,1
...,...,...,...,...,...,...,...,...
1307,2023Q1,3,1.000000,delivery,late,negative,delivery,-1
1310,2023Q3,2,0.666667,delivery,fast,positive,delivery,1
1311,2023Q3,2,0.666667,product,good,positive,product quality,1
1314,2023Q1,2,0.666667,delivery,fast,positive,delivery,1


In [132]:
def calculate_sentiment_metrics(group):
    """Summarise the sentiment labels and scores of one group of rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to summarise; must provide the ``sentiment`` and
        ``sentiment_score`` columns.

    Returns
    -------
    dict
        The row count, the share of rows labelled positive / negative /
        neutral, and the mean of ``sentiment_score``. Shares and mean are
        rounded to two decimals. The values stored under the
        ``percentage ...`` keys are fractions of the row count, not values
        scaled to 0-100.
    """
    row_count = len(group)
    labels = group['sentiment']

    def share_of(label):
        # Fraction of the group's rows whose sentiment equals `label`.
        return round(labels.eq(label).sum() / row_count, 2)

    mean_score = group['sentiment_score'].mean()

    summary = {'number of opinions': row_count}
    summary['percentage positive sentiment'] = share_of('positive')
    summary['percentage negative sentiment'] = share_of('negative')
    summary['percentage neutral sentiment'] = share_of('neutral')
    summary['average sentiment score'] = round(mean_score, 2)
    return summary

def calculate_opinion_metrics(group):
    """Summarise which opinion values occur most often in one group of rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to summarise; must provide the ``opinion`` column.

    Returns
    -------
    dict
        ``'number of distinct opinions'`` plus, for each of the up to three
        most frequent opinions, its value, its count, and its count divided
        by the group's row count (rounded to two decimals). Groups with fewer
        than three distinct opinions simply have fewer ``topN ...`` keys.
    """
    row_count = len(group)
    frequencies = group['opinion'].value_counts()

    summary = {'number of distinct opinions': frequencies.count()}

    # value_counts() is ordered by descending count, so the first three
    # entries are the three most frequent opinions (when that many exist).
    rank = 0
    for opinion, count in frequencies.iloc[:3].items():
        if pd.isnull(opinion) or pd.isnull(count):
            break
        rank += 1
        summary[f'top{rank} opinion'] = opinion
        summary[f'top{rank} opinion count'] = count
        summary[f'top{rank} opinion percentage'] = round(count / row_count, 2)
    return summary

def _json_safe(records):
    """Return a copy of ``records`` with every missing value (NaN/None) set to None.

    ``json.dump`` writes float NaN as the bare token ``NaN``, which is not
    valid JSON; ``None`` is written as ``null`` instead.
    """
    return [
        {name: (None if pd.isna(value) else value) for name, value in record.items()}
        for record in records
    ]


def _grouped_records(frame, keys, metric_fn):
    """Apply ``metric_fn`` to every ``keys`` group of ``frame``.

    ``metric_fn`` must return a dict per group. The result is one record per
    group holding the group-key columns followed by the metric entries.
    Metric keys that some groups do not produce (e.g. ``top3 opinion`` for a
    group with fewer than three distinct opinions) are present with ``None``.
    """
    per_group = (
        frame.groupby(keys)
        .apply(metric_fn, include_groups=False)
        .reset_index(drop=False)
    )
    # After reset_index the per-group dicts sit in a column named 0; expand
    # them into one column per metric next to the group-key columns.
    table = pd.concat(
        [per_group.drop(columns=0), pd.json_normalize(per_group[0])],
        axis=1,
    )
    return _json_safe(table.to_dict(orient='records'))


# calculate statistics
stats = {}

# Sentiment statistics over the whole frame, treated as a single group.
overall = pd.DataFrame(calculate_sentiment_metrics(df), index=[0])
stats['statistics all'] = _json_safe(overall.to_dict(orient='records'))

# Sentiment statistics per grouping.
for key in [
        ['quarter'],
        ['category'],
        ['quarter', 'category'],
        ['category', 'aspect'],
        ['quarter', 'category', 'aspect']
    ]:
    stats[f'statistics {" ".join(key)}'] = _grouped_records(
        df, key, calculate_sentiment_metrics
    )

# Most frequent opinions per grouping.
for key in [
        ['quarter', 'category'],
        ['category', 'aspect', 'sentiment'],
        ['quarter', 'category', 'aspect', 'sentiment']
    ]:
    stats[f'opinion statistics {" ".join(key)}'] = _grouped_records(
        df, key, calculate_opinion_metrics
    )


In [133]:
# Serialise the collected statistics (4-space indent for readability) and
# write them to the gold data layer.
stats_json = json.dumps(stats, indent=4)
with open("data/gold/stats-2023.json", "w") as out_handle:
    out_handle.write(stats_json)