In [1]:
import pandas as pd
import gzip
import datetime

In [2]:
# Load the raw review table from the Kaggle input directory.
recommendations_csv = '/kaggle/input/game-recommendations-on-steam/recommendations.csv'
interactions = pd.read_csv(recommendations_csv)

In [3]:
# Preview the first five rows of the raw table to check the column layout.
interactions.head(n=5)

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id
0,975370,0,0,2022-12-12,True,36.3,49625,0
1,304390,4,0,2017-02-17,False,11.5,2482,1
2,1085660,2,0,2019-11-17,True,336.5,243409,2
3,703080,0,0,2022-09-23,True,27.4,248701,3
4,526870,0,0,2021-01-10,True,7.9,22902,4


In [4]:
# Raw column name -> name used in the exported dataset. The mapping's key order
# also fixes the column order of the selected frame.
column_names = {
    'user_id': 'userID',
    'app_id': 'gameID',
    'is_recommended': 'recommended',
    'hours': 'hours_played',
    'date': 'review_date',
}

# Keep only the mapped columns, still under their raw names.
df1 = interactions[list(column_names)]

# Rename, then store the identifiers and the review date as pandas strings.
# NOTE(review): review_date is deliberately left as text rather than parsed to
# datetime - presumably so the JSON export keeps the 'YYYY-MM-DD' form; confirm.
df2 = (
    df1
    .rename(columns=column_names)
    .astype({'userID': 'string', 'gameID': 'string', 'review_date': 'string'})
)

In [5]:
# Subsample 8% of the rows, drawn separately within each `recommended` group;
# the fixed seed makes the draw repeatable.
sample_fraction = 0.08
sample_seed = 123
df3 = df2.groupby('recommended').sample(frac=sample_fraction, random_state=sample_seed)

In [6]:
# Discard the index labels carried over from sampling in favour of a fresh
# 0..n-1 range, then print the column dtypes and memory footprint.
df3 = df3.reset_index(drop=True)
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3068328 entries, 0 to 3068327
Data columns (total 5 columns):
 #   Column        Dtype  
---  ------        -----  
 0   userID        string 
 1   gameID        string 
 2   recommended   bool   
 3   hours_played  float64
 4   review_date   string 
dtypes: bool(1), float64(1), string(3)
memory usage: 96.6 MB


In [7]:
def reduce_sparsity(df, min_items_per_user, min_user_per_item, user_col, item_col):
    """Drop infrequent users, then infrequent items, from an interaction table.

    Filtering is a single pass in a fixed order: users first, items second.
    Item counts are taken on the user-filtered rows, so after the item filter
    some surviving users may again fall at or below the user threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Table in which each row is counted as one interaction.
    min_items_per_user : int
        A user is kept only if it appears in strictly more than this many rows
        (the comparison is ``>``, not ``>=``).
    min_user_per_item : int
        An item is kept only if it appears in strictly more than this many of
        the remaining rows (again ``>``).
    user_col : str
        Name of the column holding the user identifier.
    item_col : str
        Name of the column holding the item identifier.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with a fresh 0..n-1 index. The frame passed in by
        the caller is not modified.
    """
    # Count once per column instead of recomputing value_counts() for the mask.
    user_counts = df[user_col].value_counts()
    good_users = user_counts[user_counts > min_items_per_user].index
    df = df[df[user_col].isin(good_users)]

    # Item popularity is measured after the user filter has been applied.
    item_counts = df[item_col].value_counts()
    good_items = item_counts[item_counts > min_user_per_item].index
    return df[df[item_col].isin(good_items)].reset_index(drop=True)

In [8]:
# Filter the sampled interactions with a threshold of 5 for both users and
# games, using userID / gameID as the identifier columns.
df4 = reduce_sparsity(
    df3,
    min_items_per_user=5,
    min_user_per_item=5,
    user_col='userID',
    item_col='gameID',
)

In [9]:
# Summarise the filtered table: row count, per-column non-null counts, dtypes
# and memory use.
df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185503 entries, 0 to 185502
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   userID        185503 non-null  string 
 1   gameID        185503 non-null  string 
 2   recommended   185503 non-null  bool   
 3   hours_played  185503 non-null  float64
 4   review_date   185503 non-null  string 
dtypes: bool(1), float64(1), string(3)
memory usage: 5.8 MB


In [10]:
# Preview the first five rows of the filtered table.
df4.head(n=5)

Unnamed: 0,userID,gameID,recommended,hours_played,review_date
0,1393010,1238840,False,301.7,2021-10-15
1,11356686,485440,False,2.4,2018-11-19
2,7960921,326180,False,6.2,2020-06-24
3,11879917,298600,False,4.2,2020-07-11
4,10867748,251150,False,35.1,2022-12-30


In [11]:
# Export the filtered table as a gzip-compressed JSON array with one object
# per row.
output_path = 'dataset.json.gz'
df4.to_json(output_path, orient='records', compression='gzip')