In [1]:
import numpy as np
import pandas as pd
import types
import zipfile
import io
from tqdm.notebook import tqdm_notebook
import json
def imports():
    for name, val in globals().items():
        if isinstance(val, types.ModuleType):
            try:
                yield val.__name__, val.__version__
                pass
            except:
                yield val.__name__
                pass
            pass
        pass
    pass

list(imports())
pd.set_option('display.float_format', lambda x: '%.3f' % x)


In [2]:
# reading the CSV files
df=pd.read_csv("../data/raw_data/FPS_reviews.csv.zip",low_memory=False)
df_final=(df.drop_duplicates(["recommendation_id","game_id"])).reset_index(drop=True)

In [3]:
# getting the number of reviews with duplicates
game_length=len(set(list(df["game_id"])))
print(f"{len(df)} reviews across {game_length} games")

3637833 reviews across 2304 games


In [4]:
print("Total duplicates are",len(df)-len(df_final))

Total duplicates are 7117


In [5]:
df=df_final

In [6]:
# review length after duplicates
game_length=len(set(list(df["game_id"])))
print(f"{len(df)} reviews across {game_length} games")

3630716 reviews across 2304 games


In [7]:
# getting some characteristic data
df["review"]=df["review"].astype(str)
df["review_words_length"]=df["review"].apply(lambda x: len(x.split(" ")))
df["review_words_vocab"]=df["review"].apply(lambda x: len(set(x.split(" "))))

In [8]:
df[["review_words_length","review_words_vocab","votes_up"]].describe()

Unnamed: 0,review_words_length,review_words_vocab,votes_up
count,3630716.0,3630716.0,3630716.0
mean,50.725,35.57,5.277
std,117.802,64.786,72.539
min,1.0,1.0,0.0
25%,4.0,4.0,0.0
50%,13.0,12.0,1.0
75%,44.0,38.0,2.0
max,6947.0,1333.0,36237.0


In [9]:
df[["review_words_length","review_words_vocab","votes_up"]].median()

review_words_length   13.000
review_words_vocab    12.000
votes_up               1.000
dtype: float64

In [10]:
df["voted_up"].value_counts()

True     2996761
False     633955
Name: voted_up, dtype: int64

In [11]:
# Counting NA values in the 'voted_up' column
na_count = df['voted_up'].isna().sum()
print(f"Number of NA values in 'voted_up': {na_count}")

Number of NA values in 'voted_up': 0


In [12]:
# getting other characteristic data not in the dataframe
raw_review_location = "../data/raw_data/" + 'reviews.txt.zip'
with zipfile.ZipFile(raw_review_location) as zf:
    with io.TextIOWrapper(zf.open('reviews.txt'), encoding='utf-8') as f:
        all_reviews_raw = f.readlines()
        pass
    pass
all_reviews = [json.loads(all_reviews_raw[xxx]) for xxx in tqdm_notebook(range(len(all_reviews_raw)))]

  0%|          | 0/6498 [00:00<?, ?it/s]

In [13]:

game_id, player_recommendtion_id_list, written_during_early_access_list = [], [], []
playtime_at_review_list, received_for_free_list,hidden_in_steam_china = [], [], []
for i in tqdm_notebook(range(len(all_reviews))):
    game_reviews = all_reviews[i]
    if len(game_reviews) > 0:
        for j in ((game_reviews)):
            review = j
            player_recommendtion_id_list.append(review["recommendationid"])
            hidden_in_steam_china.append(review["hidden_in_steam_china"])
            written_during_early_access_list.append(review["written_during_early_access"])
            received_for_free_list.append(review["received_for_free"])
            try:
                playtime_at_review_list.append(review["playtime_at_review"])
            except:
                playtime_at_review_list.append(0)
            game_id.append(review['game_id'])
        pass
    pass

df_other_characteristics = pd.DataFrame(
    list(zip(player_recommendtion_id_list, game_id, playtime_at_review_list,
             hidden_in_steam_china, written_during_early_access_list,
             received_for_free_list)),
    columns=['recommendation_id', 'game_id', 'playtime_at_review',
             'hidden_in_steam_china', 'written_during_early_access', 'received_for_free']
)
data_location="../data/supplemental_data/"
df_other_characteristics=(df_other_characteristics.drop_duplicates(["recommendation_id","game_id"])).reset_index(drop=True)
print(len(df_other_characteristics)==len(df))
df_other_characteristics.to_csv(f"{data_location}data_other_characteristics.csv.zip", index=False)

  0%|          | 0/6498 [00:00<?, ?it/s]

True


In [14]:
df_other_characteristics[["playtime_at_review"]].describe()/60

Unnamed: 0,playtime_at_review
count,60511.933
mean,60.476
std,297.394
min,0.0
25%,4.433
50%,11.917
75%,32.65
max,61516.583


In [16]:
if df_other_characteristics["hidden_in_steam_china"].value_counts().sum()==len(df_other_characteristics):
    print(df_other_characteristics["hidden_in_steam_china"].value_counts())
    print(df_other_characteristics["hidden_in_steam_china"].value_counts()/len(df_other_characteristics))
else:
    print("values missing")
#     print(df_other_characteristics["hidden_in_steam_china"].value_counts()/len(df_other_characteristics))
#     print((len(df_other_characteristics)-df_other_characteristics["hidden_in_steam_china"].value_counts().sum)/len(df_other_characteristics))

False    2599618
True     1031098
Name: hidden_in_steam_china, dtype: int64
False   0.716
True    0.284
Name: hidden_in_steam_china, dtype: float64


In [17]:
if df_other_characteristics["received_for_free"].value_counts().sum()==len(df_other_characteristics):
    print(df_other_characteristics["received_for_free"].value_counts())
    print(df_other_characteristics["received_for_free"].value_counts()/len(df_other_characteristics))

False    3524758
True      105958
Name: received_for_free, dtype: int64
False   0.971
True    0.029
Name: received_for_free, dtype: float64


In [18]:
if df_other_characteristics["written_during_early_access"].value_counts().sum()==len(df_other_characteristics):
    print(df_other_characteristics["written_during_early_access"].value_counts())
    print(df_other_characteristics["written_during_early_access"].value_counts()/len(df_other_characteristics))

False    3177973
True      452743
Name: written_during_early_access, dtype: int64
False   0.875
True    0.125
Name: written_during_early_access, dtype: float64


In [19]:
row_counts = df.groupby('game_id').size()
average = row_counts.mean()
std_dev = row_counts.std()
median = row_counts.median()
print(f"Mean: {average}, Mdn:{median}, sd: {std_dev}")

Mean: 1575.8315972222222, Mdn:24.0, sd: 6290.845429738996
