In [1]:
import duckdb as ddb
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.gridspec as gridspec

# Open the Steam reviews parquet file through DuckDB. Later cells inspect it
# via `data.columns` / `data.dtypes` and query it from SQL as `FROM data`.
reviews_parquet_path = "all_reviews/all_reviews.parquet"
data = ddb.read_parquet(reviews_parquet_path)

In [None]:
# List every column of the dataset next to its DuckDB type, one per line.
# Column names and types are walked in lockstep instead of indexing dtypes.
for column_name, column_type in zip(data.columns, data.dtypes):
    print(f"{column_name!s}:  {column_type!s}")

game:  VARCHAR
author_steamid:  BIGINT
author_num_games_owned:  INTEGER
author_num_reviews:  INTEGER
author_playtime_forever:  INTEGER
author_playtime_last_two_weeks:  INTEGER
author_playtime_at_review:  BIGINT
author_last_played:  INTEGER
language:  VARCHAR
review:  VARCHAR
timestamp_created:  INTEGER
timestamp_updated:  INTEGER
voted_up:  INTEGER
votes_up:  INTEGER
votes_funny:  INTEGER
weighted_vote_score:  FLOAT
comment_count:  INTEGER
steam_purchase:  INTEGER
received_for_free:  INTEGER
written_during_early_access:  INTEGER


In [None]:
# summary statistics for every column (currently disabled; uncomment to run)
# print(data.describe())


['game', 'author_steamid', 'author_num_games_owned', 'author_num_reviews', 'author_playtime_forever', 'author_playtime_last_two_weeks', 'author_playtime_at_review', 'author_last_played', 'language', 'review', 'timestamp_created', 'timestamp_updated', 'voted_up', 'votes_up', 'votes_funny', 'weighted_vote_score', 'comment_count', 'steam_purchase', 'received_for_free', 'written_during_early_access']
┌─────────┬─────────────────────────────┬───────────────────────┬────────────────────────┬────────────────────┬─────────────────────────┬────────────────────────────────┬───────────────────────────┬────────────────────┬────────────┬───────────────────────────────────────────────┬────────────────────┬────────────────────┬───────────────────┬────────────────────┬────────────────────┬─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────────────┐
│  aggr   │            game             │    author_steamid     │ author_num_games_owned │ author

In [29]:
# IDEA:
# compare review scores from players who received the game for free vs. those who paid for it
# do free / non-purchased copies get a higher average rating? for which games?

# plan: 
#   gather the top 3 games with the most reviews
#   find the percentage of positive vs. negative reviews for each game
#   then find those percentages for reviewers who purchased vs. did not purchase the game (including those who got the game for free)
#   analyze which games had which results, and do some further research into why this may be
#       this could mean finding the reviews with the highest helpfulness score or the most upvotes
#       this could also include doing some outside research on Reddit / the internet

# gather the top 3 games with the most reviews
# most_reviewed_games = ddb.sql(f"""
#     SELECT game, COUNT(*) as review_count
#     FROM data
#     GROUP BY game
#     ORDER BY review_count DESC
#     LIMIT 3
# """).fetchdf()

# Find games with middle-of-the-road reception and keep the 10 most-reviewed.
#
# Selection rules (all applied per game):
#   - between 40% and 60% of its reviews are positive (voted_up = 1)
#   - more than 100 reviews in total (presumably so the percentage does not
#     rest on a tiny sample -- TODO confirm the intended threshold)
#   - ranked by review count, highest first; the game name is a secondary sort
#     key so that ties in review count cannot change which 10 games the LIMIT
#     keeps from one run to the next
#
# `FROM data` refers to the Python variable `data` holding the loaded reviews;
# DuckDB looks the name up in the notebook's scope.
# The query is a plain string on purpose: nothing is interpolated into it.
mid_reviewed_games_query = """
    SELECT
        game,
        COUNT(*) AS review_count,
        AVG(CASE WHEN voted_up = 1 THEN 1.0 ELSE 0 END) * 100 AS average_positive_percentage
    FROM data
    GROUP BY game
    HAVING
        average_positive_percentage BETWEEN 40 AND 60
        AND review_count > 100
    ORDER BY review_count DESC, game
    LIMIT 10;
"""

# Materialise as a pandas DataFrame with columns game, review_count and
# average_positive_percentage.
mid_reviewed_games = ddb.sql(mid_reviewed_games_query).fetchdf()

print(mid_reviewed_games)

                         game  review_count  average_positive_percentage
0               Call of Duty®        451506                    59.627779
1           Battlefield™ 2042        227879                    40.113832
2       鬼谷八荒 Tale of Immortal        219306                    51.709028
3          EA SPORTS™ FIFA 23        141619                    55.113368
4  Warhammer 40,000: Darktide         77394                    59.741065
5           雀魂麻将(MahjongSoul)         47821                    54.977939
6                    PAYDAY 3         42514                    40.342005
7           theHunter Classic         40053                    56.530098
8                    NBA 2K22         38302                    58.109237
9             Bless Unleashed         30108                    52.105753


In [None]:
# Positive-review percentage per game, counting only reviewers who purchased
# the game on Steam and did not receive it for free.
# Only games present in mid_reviewed_games are considered.
# Columns used by the filter:
#   steam_purchase:  INTEGER
#   received_for_free:  INTEGER
#
# NOTE(review): this computes only the "purchased" side of the purchased vs.
# not-purchased comparison; the other side is not computed in this cell.
# NOTE(review): a game with no matching reviews is simply absent from the
# result (the recorded output lists 9 games), and `received_for_free != 1`
# also drops rows where that column is NULL, if any exist -- confirm intended.
purchased_reviews_query = """
    SELECT
        game,
        SUM(CASE WHEN voted_up = 1 THEN 1 ELSE 0 END) * 100.0 / COUNT(*) AS positive_review_percentage
    FROM data
    WHERE 
        game IN (SELECT game FROM mid_reviewed_games)
        AND steam_purchase = 1
        AND received_for_free != 1
    GROUP BY game
    ORDER BY positive_review_percentage DESC;
"""

# Run the query and pull the result into a pandas DataFrame.
purchased_review_percentages = ddb.sql(purchased_reviews_query).fetchdf()

print(purchased_review_percentages)

                         game  positive_review_percentage
0             Bless Unleashed                   62.376238
1  Warhammer 40,000: Darktide                   60.306816
2                    NBA 2K22                   57.797093
3               Call of Duty®                   56.404705
4          EA SPORTS™ FIFA 23                   54.869121
5       鬼谷八荒 Tale of Immortal                   52.974246
6           theHunter Classic                   47.643979
7           Battlefield™ 2042                   42.269388
8                    PAYDAY 3                   40.008711


In [25]:
# find out the percentage of positive vs negative reviews
# some columns to consider:
#   voted_up:  INTEGER
# top_game_review_percentages = ddb.sql(f"""
#     SELECT
#         game,
#         SUM(CASE WHEN voted_up = 1 THEN 1 ELSE 0 END) * 100.0 / COUNT(*) AS positive_review_percentage
#     FROM data
#     WHERE game IN (SELECT game FROM most_reviewed_games)
#     GROUP BY game
#     ORDER BY positive_review_percentage DESC;
# """).fetchdf()

# print(top_game_review_percentages)

                    game  positive_review_percentage
0          Call of Duty®                   59.627779
1  鬼谷八荒 Tale of Immortal                   51.709028
2      Battlefield™ 2042                   40.113832
