In [1]:
from collections import Counter

import pandas as pd

### Item Categories

In [2]:
with open("steam_games.json", "r") as f:
    game_metadata = []
    none_count = 0
    for line in f:
        game_data = eval(line)
        if "id" not in game_data or "title" not in game_data or "genres" not in game_data or "tags" not in game_data:
            none_count += 1
            continue
        game_id = int(game_data["id"])
        title = game_data["title"]
        genres = game_data["genres"]
        tags = game_data["tags"]
        game_metadata.append((game_id, title, genres, tags))
game_metadata[0], none_count

((761140,
  'Lost Summoner Kitty',
  ['Action', 'Casual', 'Indie', 'Simulation', 'Strategy'],
  ['Strategy', 'Action', 'Indie', 'Casual', 'Simulation']),
 3309)

In [3]:
# Let's see if there are duplicate ids
unique_ids = set([gm[0] for gm in game_metadata])
print(len(unique_ids) == len(game_metadata))
print(len(unique_ids), len(game_metadata))

False
28825 28826


In [4]:
count = Counter([gm[0] for gm in game_metadata])
duplicates = [item for item, freq in count.items() if freq > 1]
duplicates

[612880]

In [5]:
for e in game_metadata:
    if e[0] == 612880:
        print(e)

(612880, 'Wolfenstein II: The New Colossus', ['Action'], ['Action', 'FPS', 'Gore', 'Violent', 'Alternate History', 'Singleplayer', 'First-Person', 'Shooter', 'Story Rich', 'World War II', 'Nudity', 'Atmospheric', 'Comedy', 'Sci-fi', 'Adventure', 'Stealth', 'Dystopian', 'Great Soundtrack', 'Illuminati', 'Open World'])
(612880, 'Wolfenstein II: The New Colossus', ['Action'], ['Action', 'FPS', 'Gore', 'Violent', 'Alternate History', 'Singleplayer', 'First-Person', 'Shooter', 'Story Rich', 'World War II', 'Nudity', 'Atmospheric', 'Comedy', 'Sci-fi', 'Adventure', 'Stealth', 'Dystopian', 'Great Soundtrack', 'Illuminati', 'Open World'])


In [6]:
game_metadata = [
    (game_id, title, "_".join(sorted(genres)), "_".join(sorted(tags)))
    for game_id, title, genres, tags in game_metadata
]
game_metadata[:3]

[(761140,
  'Lost Summoner Kitty',
  'Action_Casual_Indie_Simulation_Strategy',
  'Action_Casual_Indie_Simulation_Strategy'),
 (643980,
  'Ironbound',
  'Free to Play_Indie_RPG_Strategy',
  '2D_Board Game_Card Game_Character Customization_Competitive_Dark Fantasy_Design & Illustration_Difficult_Fantasy_Female Protagonist_Free to Play_Indie_PvP_RPG_Replay Value_Strategy_Tactical_Trading Card Game_Turn-Based'),
 (670290,
  'Real Pool 3D - Poolians',
  'Casual_Free to Play_Indie_Simulation_Sports',
  'Casual_Free to Play_Indie_Multiplayer_Simulation_Sports')]

In [7]:
unique_cats = set([gm[2] for gm in game_metadata])
len(unique_cats)

841

In [8]:
unique_tags = set([gm[3] for gm in game_metadata])
len(unique_tags)

11773

In [9]:
cat_df = pd.DataFrame(game_metadata, columns=["ItemId", "Name", "Category", "Tag"])
cat_df

Unnamed: 0,ItemId,Name,Category,Tag
0,761140,Lost Summoner Kitty,Action_Casual_Indie_Simulation_Strategy,Action_Casual_Indie_Simulation_Strategy
1,643980,Ironbound,Free to Play_Indie_RPG_Strategy,2D_Board Game_Card Game_Character Customizatio...
2,670290,Real Pool 3D - Poolians,Casual_Free to Play_Indie_Simulation_Sports,Casual_Free to Play_Indie_Multiplayer_Simulati...
3,767400,弹炸人2222,Action_Adventure_Casual,Action_Adventure_Casual
4,772540,Battle Royale Trainer,Action_Adventure_Simulation,Action_Adventure_FPS_Shooter_Simulation_Sniper...
...,...,...,...,...
28821,745400,Kebab it Up!,Action_Adventure_Casual_Indie,Action_Adventure_Casual_Indie_Violent
28822,773640,Colony On Mars,Casual_Indie_Simulation_Strategy,Casual_Indie_Simulation_Strategy
28823,733530,LOGistICAL: South Africa,Casual_Indie_Strategy,Casual_Indie_Strategy
28824,610660,Russian Roads,Indie_Racing_Simulation,Indie_Racing_Simulation


In [10]:
cat_df = cat_df.drop_duplicates(subset="ItemId")
cat_df

Unnamed: 0,ItemId,Name,Category,Tag
0,761140,Lost Summoner Kitty,Action_Casual_Indie_Simulation_Strategy,Action_Casual_Indie_Simulation_Strategy
1,643980,Ironbound,Free to Play_Indie_RPG_Strategy,2D_Board Game_Card Game_Character Customizatio...
2,670290,Real Pool 3D - Poolians,Casual_Free to Play_Indie_Simulation_Sports,Casual_Free to Play_Indie_Multiplayer_Simulati...
3,767400,弹炸人2222,Action_Adventure_Casual,Action_Adventure_Casual
4,772540,Battle Royale Trainer,Action_Adventure_Simulation,Action_Adventure_FPS_Shooter_Simulation_Sniper...
...,...,...,...,...
28821,745400,Kebab it Up!,Action_Adventure_Casual_Indie,Action_Adventure_Casual_Indie_Violent
28822,773640,Colony On Mars,Casual_Indie_Simulation_Strategy,Casual_Indie_Simulation_Strategy
28823,733530,LOGistICAL: South Africa,Casual_Indie_Strategy,Casual_Indie_Strategy
28824,610660,Russian Roads,Indie_Racing_Simulation,Indie_Racing_Simulation


In [11]:
cat_df.dtypes

ItemId       int64
Name        object
Category    object
Tag         object
dtype: object

### Sessions

In [12]:
with open("steam_reviews.json", "r") as file:
    user_item_interactions = []
    none_count = 0
    for line in file:
        interaction = eval(line)
        if "username" not in interaction or "product_id" not in interaction or "date" not in interaction:
            none_count += 1
        username = interaction["username"]
        product_id = interaction["product_id"]
        date = interaction["date"]
        user_item_interactions.append((username, int(product_id), date))
user_item_interactions[0], none_count

(('Chaos Syren', 725280, '2017-12-17'), 0)

In [13]:
interactions_df = pd.DataFrame(user_item_interactions, columns=["SessionId", "ItemId", "Time"])
interactions_df

Unnamed: 0,SessionId,ItemId,Time
0,Chaos Syren,725280,2017-12-17
1,₮ʜᴇ Wᴀʀᴛᴏɴ,328100,2017-12-27
2,hello?<,328100,2017-10-16
3,Cyderine916,35140,2018-01-04
4,DarklyThinking,35140,2018-01-04
...,...,...,...
7793064,Wildman_,252490,2013-12-11
7793065,Stony,252490,2013-12-11
7793066,Deez Knees,252490,2013-12-11
7793067,Vidaar,252490,2013-12-11


In [14]:
interactions_df.dtypes

SessionId    object
ItemId        int64
Time         object
dtype: object

In [15]:
items_w_metadata = cat_df["ItemId"].unique()
interactions_df = interactions_df[interactions_df["ItemId"].isin(items_w_metadata)]
interactions_df

Unnamed: 0,SessionId,ItemId,Time
0,Chaos Syren,725280,2017-12-17
1,₮ʜᴇ Wᴀʀᴛᴏɴ,328100,2017-12-27
2,hello?<,328100,2017-10-16
3,Cyderine916,35140,2018-01-04
4,DarklyThinking,35140,2018-01-04
...,...,...,...
7793064,Wildman_,252490,2013-12-11
7793065,Stony,252490,2013-12-11
7793066,Deez Knees,252490,2013-12-11
7793067,Vidaar,252490,2013-12-11


In [16]:
interactions_df["Time"] = pd.to_datetime(interactions_df["Time"])
interactions_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interactions_df["Time"] = pd.to_datetime(interactions_df["Time"])


Unnamed: 0,SessionId,ItemId,Time
0,Chaos Syren,725280,2017-12-17
1,₮ʜᴇ Wᴀʀᴛᴏɴ,328100,2017-12-27
2,hello?<,328100,2017-10-16
3,Cyderine916,35140,2018-01-04
4,DarklyThinking,35140,2018-01-04
...,...,...,...
7793064,Wildman_,252490,2013-12-11
7793065,Stony,252490,2013-12-11
7793066,Deez Knees,252490,2013-12-11
7793067,Vidaar,252490,2013-12-11


In [17]:
interactions_df = interactions_df.sort_values(by=["SessionId", "Time"])
interactions_df

Unnamed: 0,SessionId,ItemId,Time
2697132,!,421670,2016-03-04
2371759,!,377160,2016-03-31
2371834,!,377160,2016-03-31
436692,!,227940,2016-07-21
499895,!,304240,2016-09-15
...,...,...,...
7609170,󰀖Mr.Tarunio,391540,2016-11-23
6135304,󰀗 Lolicage,244630,2014-07-09
1053409,󰀗S-Rabbit󰀗,322330,2017-12-19
4773561,󰀗THE0ERROR󰀗,204300,2017-09-25


In [18]:
interactions_df = interactions_df.drop_duplicates(subset=["SessionId", "ItemId"], keep="first")
interactions_df

Unnamed: 0,SessionId,ItemId,Time
2697132,!,421670,2016-03-04
2371759,!,377160,2016-03-31
436692,!,227940,2016-07-21
499895,!,304240,2016-09-15
2211182,!,550650,2017-10-25
...,...,...,...
7608986,󰀖Mr.Tarunio,391540,2016-11-23
6135304,󰀗 Lolicage,244630,2014-07-09
1053409,󰀗S-Rabbit󰀗,322330,2017-12-19
4773561,󰀗THE0ERROR󰀗,204300,2017-09-25


In [19]:
interactions_df["Time"] = interactions_df["Time"].dt.strftime("%Y-%m-%d 00:00:00.000")
interactions_df["Reward"] = 1.0
interactions_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interactions_df["Time"] = interactions_df["Time"].dt.strftime("%Y-%m-%d 00:00:00.000")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interactions_df["Reward"] = 1.0


Unnamed: 0,SessionId,ItemId,Time,Reward
2697132,!,421670,2016-03-04 00:00:00.000,1.0
2371759,!,377160,2016-03-31 00:00:00.000,1.0
436692,!,227940,2016-07-21 00:00:00.000,1.0
499895,!,304240,2016-09-15 00:00:00.000,1.0
2211182,!,550650,2017-10-25 00:00:00.000,1.0
...,...,...,...,...
7608986,󰀖Mr.Tarunio,391540,2016-11-23 00:00:00.000,1.0
6135304,󰀗 Lolicage,244630,2014-07-09 00:00:00.000,1.0
1053409,󰀗S-Rabbit󰀗,322330,2017-12-19 00:00:00.000,1.0
4773561,󰀗THE0ERROR󰀗,204300,2017-09-25 00:00:00.000,1.0


In [20]:
interactions_df.isna().any().any()

False

In [21]:
interactions_df.dtypes

SessionId     object
ItemId         int64
Time          object
Reward       float64
dtype: object

### Make 5-Core

In [22]:
MIN_NUM_INTERACTIONS = 5

processed_session_df = interactions_df.copy()
prev_len_processed_session_df = len(processed_session_df)
while True: 
    print("Iteration ")

    # Compute sessions to keep.
    sessions_to_keep = {session for session, value_count in processed_session_df["SessionId"].value_counts().items() if value_count >= MIN_NUM_INTERACTIONS}
    processed_session_df = processed_session_df[processed_session_df["SessionId"].isin(sessions_to_keep)]
    
    # Compute items to keep.
    items_to_keep = {item for item, value_count in processed_session_df["ItemId"].value_counts().items() if value_count >= MIN_NUM_INTERACTIONS}
    processed_session_df = processed_session_df[processed_session_df["ItemId"].isin(items_to_keep)]
    
    if prev_len_processed_session_df == len(processed_session_df):
        break

    prev_len_processed_session_df = len(processed_session_df)

print(f"Number of sessions: {len(sessions_to_keep)}")
print(f"Number of items: {len(items_to_keep)}")
sessions_df = processed_session_df
sessions_df

Iteration 
Iteration 
Iteration 
Iteration 
Number of sessions: 279290
Number of items: 11784


Unnamed: 0,SessionId,ItemId,Time,Reward
2697132,!,421670,2016-03-04 00:00:00.000,1.0
2371759,!,377160,2016-03-31 00:00:00.000,1.0
436692,!,227940,2016-07-21 00:00:00.000,1.0
499895,!,304240,2016-09-15 00:00:00.000,1.0
2211182,!,550650,2017-10-25 00:00:00.000,1.0
...,...,...,...,...
1469646,󰀕 Africa's Population 󰀕,242680,2015-03-23 00:00:00.000,1.0
2265710,󰀕 Africa's Population 󰀕,99900,2015-09-16 00:00:00.000,1.0
4269027,󰀕 Africa's Population 󰀕,268910,2017-10-11 00:00:00.000,1.0
5451032,󰀕 Africa's Population 󰀕,252610,2017-11-22 00:00:00.000,1.0


### Encode the Session Data

In [23]:
# Encode session ids
unique_sessions = sessions_df["SessionId"].unique()
session_id_mapping = {value: idx for idx, value in enumerate(unique_sessions)}
sessions_df["SessionId"] = sessions_df["SessionId"].map(session_id_mapping)
sessions_df

Unnamed: 0,SessionId,ItemId,Time,Reward
2697132,0,421670,2016-03-04 00:00:00.000,1.0
2371759,0,377160,2016-03-31 00:00:00.000,1.0
436692,0,227940,2016-07-21 00:00:00.000,1.0
499895,0,304240,2016-09-15 00:00:00.000,1.0
2211182,0,550650,2017-10-25 00:00:00.000,1.0
...,...,...,...,...
1469646,279289,242680,2015-03-23 00:00:00.000,1.0
2265710,279289,99900,2015-09-16 00:00:00.000,1.0
4269027,279289,268910,2017-10-11 00:00:00.000,1.0
5451032,279289,252610,2017-11-22 00:00:00.000,1.0


In [24]:
# Encode item ids
unique_items = sessions_df["ItemId"].unique()
item_id_mapping = {value: idx for idx, value in enumerate(unique_items)}
sessions_df["ItemId"] = sessions_df["ItemId"].map(item_id_mapping)
sessions_df

Unnamed: 0,SessionId,ItemId,Time,Reward
2697132,0,0,2016-03-04 00:00:00.000,1.0
2371759,0,1,2016-03-31 00:00:00.000,1.0
436692,0,2,2016-07-21 00:00:00.000,1.0
499895,0,3,2016-09-15 00:00:00.000,1.0
2211182,0,4,2017-10-25 00:00:00.000,1.0
...,...,...,...,...
1469646,279289,269,2015-03-23 00:00:00.000,1.0
2265710,279289,148,2015-09-16 00:00:00.000,1.0
4269027,279289,124,2017-10-11 00:00:00.000,1.0
5451032,279289,2871,2017-11-22 00:00:00.000,1.0


In [25]:
cat_df["ItemId"] = cat_df["ItemId"].map(item_id_mapping)
cat_df = cat_df.dropna()
cat_df["ItemId"] = cat_df["ItemId"].astype(int)
cat_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_df["ItemId"] = cat_df["ItemId"].map(item_id_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_df["ItemId"] = cat_df["ItemId"].astype(int)


Unnamed: 0,ItemId,Name,Category,Tag
1,9169,Ironbound,Free to Play_Indie_RPG_Strategy,2D_Board Game_Card Game_Character Customizatio...
2,9876,Real Pool 3D - Poolians,Casual_Free to Play_Indie_Simulation_Sports,Casual_Free to Play_Indie_Multiplayer_Simulati...
4,11010,Battle Royale Trainer,Action_Adventure_Simulation,Action_Adventure_FPS_Shooter_Simulation_Sniper...
21,6214,Carmageddon Max Pack,Action_Indie_Racing,1990's_Action_Classic_Gore_Indie_Multiplayer_R...
22,240,Half-Life,Action,1990's_Action_Adventure_Aliens_Atmospheric_Cla...
...,...,...,...,...
28809,7201,Chicken Shoot Gold,Action_Casual_Indie,Action_Casual_Indie_Shooter
28810,330,Day of Defeat,Action,Action_Class-Based_Classic_Co-op_FPS_First-Per...
28811,10759,Geneforge 2,Indie_RPG_Strategy,Indie_Isometric_RPG_Strategy_Turn-Based_Turn-B...
28812,1128,Unreal Tournament 2004: Editor's Choice Edition,Action,Action_Adventure_Arena Shooter_Atmospheric_Cla...


In [26]:
cat_df = cat_df.sort_values(by="ItemId")
cat_df

Unnamed: 0,ItemId,Name,Category,Tag
5359,0,CUPID - A free to play Visual Novel,Free to Play_Indie_RPG_Simulation,Anime_Dark_Female Protagonist_Free to Play_Hor...
21895,1,Fallout 4,RPG,Action_Action RPG_Adventure_Atmospheric_Casual...
7080,2,Heroes &amp; Generals,Action_Free to Play_Indie_Massively Multiplayer,Action_Adventure_Atmospheric_Co-op_FPS_First-P...
2981,3,Resident Evil / biohazard HD REMASTER,Action_Adventure,Action_Adventure_Atmospheric_Classic_Cult Clas...
14721,4,Black Squad,Action_Early Access_Free to Play,Action_Adventure_Co-op_Early Access_FPS_First-...
...,...,...,...,...
16392,11779,Five Elements,Indie_RPG_Strategy,Indie_RPG_Strategy
7658,11780,THE GREY MAN,Adventure_Casual_Indie,Adventure_Casual_Indie
21057,11781,Mysteries of the Past: Shadow of the Daemon Co...,Adventure_Indie,Adventure_Hidden Object_Indie
16690,11782,Otome Romance Jigsaws - Midnight Cinderella &a...,Adventure_Casual_Simulation_Strategy,Adventure_Anime_Casual_Otome_Puzzle_Simulation...


In [27]:
# Encode categories
unique_cats = cat_df["Category"].unique()
cat_mapping = {value: idx for idx, value in enumerate(unique_cats)}
cat_df["class"] = cat_df["Category"].map(cat_mapping)
cat_df

Unnamed: 0,ItemId,Name,Category,Tag,class
5359,0,CUPID - A free to play Visual Novel,Free to Play_Indie_RPG_Simulation,Anime_Dark_Female Protagonist_Free to Play_Hor...,0
21895,1,Fallout 4,RPG,Action_Action RPG_Adventure_Atmospheric_Casual...,1
7080,2,Heroes &amp; Generals,Action_Free to Play_Indie_Massively Multiplayer,Action_Adventure_Atmospheric_Co-op_FPS_First-P...,2
2981,3,Resident Evil / biohazard HD REMASTER,Action_Adventure,Action_Adventure_Atmospheric_Classic_Cult Clas...,3
14721,4,Black Squad,Action_Early Access_Free to Play,Action_Adventure_Co-op_Early Access_FPS_First-...,4
...,...,...,...,...,...
16392,11779,Five Elements,Indie_RPG_Strategy,Indie_RPG_Strategy,130
7658,11780,THE GREY MAN,Adventure_Casual_Indie,Adventure_Casual_Indie,104
21057,11781,Mysteries of the Past: Shadow of the Daemon Co...,Adventure_Indie,Adventure_Hidden Object_Indie,30
16690,11782,Otome Romance Jigsaws - Midnight Cinderella &a...,Adventure_Casual_Simulation_Strategy,Adventure_Anime_Casual_Otome_Puzzle_Simulation...,601


In [28]:
sessions_df.to_csv("sessions.csv", index=False)
cat_df.to_csv("item_metadata.csv", index=False)