In [12]:
import gzip
import pandas as pd
import ast

def parse(path):
    """Yield one Python object per non-blank line of a gzip text file.

    The file at ``path`` is opened as UTF-8 text and every line is evaluated
    with ``ast.literal_eval``, so each line must be a Python literal.

    Blank or whitespace-only lines are skipped; ``ast.literal_eval`` would
    otherwise raise ``SyntaxError`` on them and abort the whole load.

    Args:
        path: Location of the gzip-compressed file to read.

    Yields:
        The object produced by evaluating each non-blank line.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when a
            non-blank line is not a valid Python literal.
    """
    with gzip.open(path, 'rt', encoding='utf-8') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not text:
                # Nothing to evaluate on an empty line; move on.
                continue
            yield ast.literal_eval(text)

def load_steam_games(filepath):
    """Build a DataFrame of game ids and genres from a games dump.

    Every record yielded by ``parse(filepath)`` contributes one row with two
    fields: ``item_id`` (the record's ``'id'`` value, or ``None`` when the key
    is absent) and ``genres`` (the record's ``'genres'`` value, or an empty
    list when the key is absent).

    Args:
        filepath: Path handed to ``parse``.

    Returns:
        A ``pandas.DataFrame`` built from those rows.
    """
    rows = [
        {'item_id': entry.get('id'), 'genres': entry.get('genres', [])}
        for entry in parse(filepath)
    ]
    return pd.DataFrame(rows)

def load_user_items(filepath):
    """Flatten per-user item lists into one DataFrame row per (user, item).

    Each record yielded by ``parse(filepath)`` carries a ``'user_id'`` and an
    optional ``'items'`` list of dicts. Every item becomes one row; missing
    playtime fields default to ``0`` and missing ``genre`` / ``purchase``
    fields default to ``None``.

    Args:
        filepath: Path handed to ``parse``.

    Returns:
        A ``pandas.DataFrame`` with one row per item owned by each user.
    """
    rows = []
    for user in parse(filepath):
        owner = user.get('user_id')
        # A user without an 'items' key simply contributes no rows.
        rows.extend(
            {
                'user_id': owner,
                'item_id': game.get('item_id'),
                'item_name': game.get('item_name'),
                'playtime_forever': game.get('playtime_forever', 0),
                'playtime_2weeks': game.get('playtime_2weeks', 0),
                'genre': game.get('genre', None),
                'purchase': game.get('purchase', None),
            }
            for game in user.get('items', [])
        )
    return pd.DataFrame(rows)


def _join_genres(value):
    """Return a list of genres as a ', '-separated string, else ``None``."""
    return ', '.join(value) if isinstance(value, list) else None


def enrich_user_data_with_genres(user_items_df, steam_games_df):
    """Fill in the ``genre`` column of user items from the games table.

    The games table's ``genres`` lists are flattened to comma-separated
    strings, left-joined onto the user items by ``item_id``, and used to fill
    rows whose existing ``genre`` is missing. Existing ``genre`` values take
    precedence. The helper ``genres`` column is dropped from the result.

    Neither input frame is modified. Flattening ``genres`` in place on the
    caller's frame would make a second call see strings instead of lists and
    turn every genre into ``None``.

    Args:
        user_items_df: DataFrame with at least ``item_id`` and ``genre``.
        steam_games_df: DataFrame with at least ``item_id`` and ``genres``.

    Returns:
        A new DataFrame with one row per user item (more if ``item_id`` is
        duplicated in ``steam_games_df``) and ``genre`` filled where a match
        was found.
    """
    # ``assign`` returns a new frame, so the caller's ``genres`` lists survive.
    games_flat = steam_games_df.assign(
        genres=steam_games_df['genres'].apply(_join_genres)
    )
    enriched_df = pd.merge(user_items_df, games_flat, on='item_id', how='left')
    # Keep any genre already present on the user item; fall back to the join.
    enriched_df['genre'] = enriched_df['genre'].combine_first(enriched_df['genres'])
    return enriched_df.drop(columns=['genres'])

# Input dumps and the CSV the enriched table is written to.
user_items_path = 'australian_users_items.json.gz'
steam_games_path = 'steam_games.json.gz'
output_path = 'enriched_australian_users_items.csv'

# Load both tables: user items first, then the games catalogue.
user_items_df = load_user_items(user_items_path)
steam_games_df = load_steam_games(steam_games_path)

# Fill user-item genres from the catalogue and persist without the index.
enriched_user_items_df = enrich_user_data_with_genres(
    user_items_df,
    steam_games_df,
)
enriched_user_items_df.to_csv(output_path, index=False)

print(f"Enriched data saved to {output_path}")



Enriched data saved to enriched_australian_users_items.csv
