In [1]:
import pandas as pd
import numpy as np

import datetime


In [2]:
INPUT_FILE = "../data/raw/products.parquet"

In [3]:
in_data = pd.read_parquet(INPUT_FILE)

data = in_data[
    [   
        "name",
        "id",
        "release_date",
        "platforms",
        "subscribers_count",
        "trophies.platinum",
        "float_price",
        "last_update_date",
        "last_update.end_date",
        "last_update.discount_percent",
        "scores.metacritic.score",
        "scores.opencritic.score",
    ]
].copy()


In [4]:
data.head()

Unnamed: 0,name,id,release_date,platforms,subscribers_count,trophies.platinum,float_price,last_update_date,last_update.end_date,last_update.discount_percent,scores.metacritic.score,scores.opencritic.score
0,Moss: Libro II,5154444,,['PS4'],16,False,39.99,2022-03-31T14:02:08+03:00,,0,,
1,Elden Ring PS4 & PS5,4928471,,"['PS4', 'PS5']",51,True,69.99,2021-11-04T21:40:31+03:00,,0,,
2,Chrono Cross: The Radical Dreamers Edition,5198377,2022-04-07T03:00:00+03:00,['PS4'],12,False,19.99,2022-04-07T05:00:53+03:00,,0,,
3,Lego Star Wars: La Saga Skywalker PS4 & PS5,5048054,,"['PS4', 'PS5']",13,True,59.99,2022-01-20T19:46:16+03:00,,0,,
4,Ghostwire: Tokyo,5065097,,['PS5'],13,True,69.99,2022-03-25T04:59:18+03:00,,0,,


In [5]:
missing_mc_percentage = data["scores.metacritic.score"].isna().sum() / data.shape[0] * 100
missing_oc_percentage = data["scores.opencritic.score"].isna().sum() / data.shape[0] * 100

print(f"{missing_mc_percentage:.5f}% of metacritic scores missing")
print(f"{missing_oc_percentage:.5f}% of opencritic scores missing")

99.56728% of metacritic scores missing
94.52820% of opencritic scores missing


Steps to transform the dataset and make it suitable for clustering algorithms:
- release_date as int or timestamp
- dummify platforms
- encode trophies.platinum
- last_update_date as time from current_date
- update_duration = last_update.end_date - last_update_date

In [6]:
# Convert to date dtypes
date_columns = data.columns[data.columns.str.contains("_date")]

for c in date_columns:
    data[c] = pd.to_datetime(data[c], utc=True).dt.date

In [7]:
def date_to_days_since(x):
    if x is pd.NaT:
        return np.nan
    else:
        return (datetime.date.today() - x).days

In [8]:
print(f'{data["release_date"].isna().sum() / data.shape[0] * 100}% missing release_date')
print(f'{data["last_update_date"].isna().sum() / data.shape[0] * 100}% missing last_update_date')

10.231714126186487% missing release_date
0.5583472920156337% missing last_update_date


In [9]:
data["days_since_release"] = [date_to_days_since(x) for x in data["release_date"]]
data["days_since_last_update"] = [date_to_days_since(x) for x in data["last_update_date"]]

In [10]:
data["platforms"].unique()

array(["['PS4']", "['PS4', 'PS5']", "['PS5']", "['PS4', 'PSVita']",
       "['PS5', 'PS4']", "['PS4', 'PS3']", "['PS4', 'PS3', 'PSVita']",
       "['PS4', 'PS3', 'PSVita', 'PSP']"], dtype=object)

In [11]:
data["is_ps5"] = [1 if "PS5" in x else 0 for x in data["platforms"]]
data["is_ps4"] = [1 if "PS4" in x else 0 for x in data["platforms"]]
data["is_ps3"] = [1 if "PS3" in x else 0 for x in data["platforms"]]
data["is_psv"] = [1 if "PSVita" in x else 0 for x in data["platforms"]]

data["platforms_count"] = [len(x) for x in data["platforms"]]

In [12]:
data["has_platinum"] = data["trophies.platinum"].astype(int)

In [13]:
# Prepare the final dataset
outfile = data[[
    "id",
    "subscribers_count",
    "float_price",
    "days_since_release",
    "days_since_last_update",
    "is_ps5",
    "is_ps4",
    "is_ps3",
    "is_psv",
    "platforms_count",
    "has_platinum",
]]
outfile.head()


Unnamed: 0,id,subscribers_count,float_price,days_since_release,days_since_last_update,is_ps5,is_ps4,is_ps3,is_psv,platforms_count,has_platinum
0,5154444,16,39.99,,353.0,0,1,0,0,7,0
1,4928471,51,69.99,,500.0,1,1,0,0,14,1
2,5198377,12,19.99,346.0,346.0,0,1,0,0,7,0
3,5048054,13,59.99,,423.0,1,1,0,0,14,1
4,5065097,13,69.99,,359.0,1,0,0,0,7,1


In [14]:
# If there are NaNs in the final output, drop them
# outfile = outfile.dropna()
# outfile.isna().sum()

In [15]:
outfile.to_parquet("../data/processed/products_clustering.parquet")