In [15]:
import pandas as pd
import json
import numpy as np


In [16]:
# Load the game catalogue from CSV into the notebook-global `df_games`.
df_games = pd.read_csv('data/games.csv')
# Bare last expression: the notebook renders the first rows as this cell's output.
df_games.head()


Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck
0,10090,Call of Duty: World at War,2008-11-18,True,False,False,Very Positive,92,37039,19.99,19.99,0.0,True
1,13500,Prince of Persia: Warrior Within™,2008-11-21,True,False,False,Very Positive,84,2199,9.99,9.99,0.0,True
2,22364,BRINK: Agents of Change,2011-08-03,True,False,False,Positive,85,21,2.99,2.99,0.0,True
3,113020,Monaco: What's Yours Is Mine,2013-04-24,True,True,True,Very Positive,92,3722,14.99,14.99,0.0,True
4,226560,Escape Dead Island,2014-11-18,True,False,False,Mixed,61,873,14.99,14.99,0.0,True


In [17]:
# Load the generated users table from CSV into the notebook-global `df_gen_users`.
df_gen_users = pd.read_csv('data/generated_users.csv')
# Bare last expression: the notebook renders the first rows as this cell's output.
df_gen_users.head()

Unnamed: 0,steam_id,appids
0,76561198012345678,570730440550271590
1,76561198023456789,1085660381210105600252950620
2,76561198034567890,2304102920301172620582010359550
3,76561198045678901,413150294100227300435150386360
4,76561198056789012,346110377160221100252490239140


In [18]:

def extract_unique_tags(game_metadata_list):
    """Return every distinct tag found across a collection of metadata records.

    Args:
        game_metadata_list: Iterable of dict-like records. The value stored
            under the ``"tags"`` key is read from each record; a record without
            that key contributes no tags.

    Returns:
        A list holding each distinct tag exactly once. The order is whatever
        order the intermediate set iterates in, so it is not guaranteed.
    """
    distinct = {
        tag
        for record in game_metadata_list
        for tag in record.get("tags", [])
    }
    return list(distinct)


def write_tags_to_file(tags, filename):
    """Write one tag per line to ``filename``, replacing any existing file.

    Args:
        tags: Iterable of tags. Each item is rendered with f-string formatting
            and followed by a newline.
        filename: Path of the file to create or overwrite.

    The file is always encoded as UTF-8. Relying on the platform default
    encoding can raise ``UnicodeEncodeError`` (or write unreadable bytes) for
    tags containing non-ASCII characters on systems whose default is not UTF-8.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(f"{tag}\n" for tag in tags)

# Read game metadata from a JSON Lines file: one JSON object per non-blank line.
# The file is decoded as UTF-8 explicitly; the platform default encoding is not
# UTF-8 everywhere and would mis-decode or reject non-ASCII text in the records.
with open('data/games_metadata2.json', 'r', encoding='utf-8') as f:
    game_metadata_jsons = [
        json.loads(stripped)
        for stripped in map(str.strip, f)
        if stripped  # skip blank lines
    ]

# Read game metadata JSON objects from a file
# jsonFile = open('games_metadata.json', 'r')
# game_metadata_jsons = json.load(jsonFile)
# jsonFile.close()

def extract_unique_tags():
    """Build and save the binary (one-hot) game-by-tag matrix.

    Reads the notebook-global ``game_metadata_jsons`` and:

    1. collects the distinct tags and prints them,
    2. writes them to ``unique_tags.txt`` via ``write_tags_to_file``,
    3. builds a DataFrame indexed by ``app_id`` with one 0/1 column per tag,
    4. saves that DataFrame to ``game_tags_binary.csv``.

    Returns:
        The binary tag DataFrame that was written to disk.

    Raises:
        KeyError: If a metadata record has no ``'app_id'`` key.
    """
    # The tag vocabulary is collected inline on purpose. This function shares
    # its name with the one-argument helper defined earlier in the cell and
    # replaces it as soon as this ``def`` runs, so calling
    # ``extract_unique_tags(game_metadata_jsons)`` from here would resolve to
    # this zero-argument function and raise TypeError.
    # Sorting (by string form) makes the column order reproducible between
    # runs; plain set iteration order is not.
    unique_tags = sorted(
        {
            tag
            for game_metadata in game_metadata_jsons
            for tag in game_metadata.get("tags", [])
        },
        key=str,
    )
    print("unique_tags", unique_tags)

    # Write unique tags to a file
    write_tags_to_file(unique_tags, 'unique_tags.txt')

    # Build every row first and create the DataFrame once; assigning rows one
    # by one with ``.loc`` re-allocates the frame on each insert.
    # Keying by app_id keeps the original "last record wins" behaviour for
    # repeated app_ids.
    rows_by_app_id = {}
    for game_metadata in game_metadata_jsons:
        game_tags = set(game_metadata.get("tags", []))
        rows_by_app_id[game_metadata['app_id']] = [
            1 if tag in game_tags else 0 for tag in unique_tags
        ]

    df_game_tags_binary = pd.DataFrame(
        list(rows_by_app_id.values()),
        index=list(rows_by_app_id.keys()),
        columns=unique_tags,
    )

    # Save the DataFrame to a CSV file
    # NOTE(review): a later cell reads 'data/game_tags_binary.csv', while this
    # writes to the current directory -- confirm which location is intended.
    df_game_tags_binary.to_csv('game_tags_binary.csv')
    return df_game_tags_binary

# extract_unique_tags()

In [19]:
def _assign_to_nearest(data, centroids):
    """Return, for every row of ``data``, the index of its closest centroid."""
    # Broadcasting yields an (n_points, k, n_features) difference array; the
    # norm over the last axis gives the (n_points, k) Euclidean distances.
    distances = np.linalg.norm(data[:, np.newaxis] - centroids, axis=2)
    return np.argmin(distances, axis=1)


def kmeans(data, k, max_iters=100):
    """Cluster the rows of ``data`` into ``k`` groups with Lloyd's algorithm.

    Args:
        data: 2-D array-like of shape (n_points, n_features).
        k: Number of clusters. Must not exceed the number of rows, otherwise
            ``np.random.choice`` raises ``ValueError``.
        max_iters: Upper bound on assignment/update rounds.

    Returns:
        1-D integer array of length n_points holding the cluster index
        (``0 .. k-1``) assigned to each row.

    Initial centroids are drawn with the global ``np.random`` state, so call
    ``np.random.seed(...)`` beforehand for reproducible results.
    """
    data = np.asarray(data, dtype=float)

    # Initialize centroids randomly from k distinct rows
    centroids = data[np.random.choice(data.shape[0], k, replace=False), :]
    cluster_assignments = None

    for _ in range(max_iters):
        # Assign each data point to the nearest centroid
        cluster_assignments = _assign_to_nearest(data, centroids)

        # Update centroids. A cluster that received no points keeps its
        # previous centroid: the mean of an empty slice is NaN, which would
        # poison every later distance and prevent convergence.
        new_centroids = centroids.copy()
        for i in range(k):
            members = data[cluster_assignments == i]
            if members.shape[0] > 0:
                new_centroids[i] = members.mean(axis=0)

        # Check for convergence
        if np.allclose(centroids, new_centroids):
            break

        centroids = new_centroids

    if cluster_assignments is None:
        # max_iters <= 0: the loop never ran, so label against the initial centroids
        cluster_assignments = _assign_to_nearest(data, centroids)

    return cluster_assignments


# df_game_tags_binary = pd.read_csv('data/game_tags_binary.csv', index_col='Unnamed: 0')
# df_game_tags_binary.head()
# # # Convert DataFrame to NumPy array
# data = df_game_tags_binary.to_numpy()

# # Apply K-means clustering
# k = 2
# cluster_assignments = kmeans(data, k)

# # Replace the original features with the cluster assignments
# reduced_data = np.zeros_like(data)
# reduced_data[:, 0] = cluster_assignments

# # Convert the reduced data back to a DataFrame
# reduced_df = pd.DataFrame(reduced_data, index=df_game_tags_binary.index)

# # Save the reduced DataFrame to a CSV file
# reduced_df.to_csv('reduced_game_tags_kmeans.csv')

In [23]:
# Implement SVD from scratch

def power_iteration(A, num_iter=1000):
    """Approximate the dominant eigenvector of a square matrix by power iteration.

    Args:
        A: Square 2-D array; the starting vector has length ``A.shape[1]``.
        num_iter: Number of multiply-and-normalise steps to perform.

    Returns:
        1-D array of length ``A.shape[1]``. After at least one step it has unit
        Euclidean norm. With ``num_iter=0`` the raw random start vector is
        returned unchanged.

    The start vector comes from the global ``np.random`` state.
    """
    b = np.random.rand(A.shape[1])

    for _ in range(num_iter):
        product = np.dot(A, b)
        norm = np.linalg.norm(product)
        if norm == 0:
            # A maps b to the zero vector (e.g. A is all zeros), so b is
            # already an eigenvector with eigenvalue 0. Dividing by the zero
            # norm would turn the result into NaNs; return b normalised.
            b_norm = np.linalg.norm(b)
            return b / b_norm if b_norm > 0 else b
        b = product / norm

    return b

def svd_basic(A, num_components, num_iter=1000):
    """Compute a truncated SVD by power iteration with deflation.

    For each component, the dominant eigenvector of ``A.T @ A`` is found with
    ``power_iteration``. The matching singular triplet is recorded and then
    subtracted from ``A`` before the next component is computed.

    Args:
        A: 2-D array of shape (m, n). The caller's array is not modified.
        num_components: Number of singular triplets to extract.
        num_iter: Power-iteration steps used for each component.

    Returns:
        Tuple ``(U, S, Vt)`` with shapes ``(m, num_components)``,
        ``(num_components,)`` and ``(num_components, n)``. If the matrix runs
        out of rank before ``num_components`` triplets are found, the remaining
        entries stay zero.
    """
    U = np.zeros((A.shape[0], num_components))
    S = np.zeros(num_components)
    Vt = np.zeros((num_components, A.shape[1]))

    for i in range(num_components):
        # Compute the matrix B whose dominant eigenvector is the next right singular vector
        B = np.dot(A.T, A)

        # Find the eigenvector with the largest eigenvalue using the power iteration method
        v = power_iteration(B, num_iter)

        # The singular value is ||A v||. Taking the norm directly cannot go
        # negative through rounding the way sqrt(v . B v) can.
        Av = np.dot(A, v)
        singular_value = np.linalg.norm(Av)

        if not np.isfinite(singular_value) or singular_value == 0:
            # Nothing left to extract (the deflated matrix is zero, or the
            # eigenvector came back non-finite). Dividing by this value would
            # fill U with NaN/inf, so stop and leave the remaining slots zero.
            break

        Vt[i, :] = v
        S[i] = singular_value

        # Find the corresponding left singular vector
        u = Av / singular_value
        U[:, i] = u

        # Deflate the matrix A (rebinding the local name; the input is untouched)
        A = A - singular_value * np.outer(u, v)

    return U, S, Vt

In [25]:
def svd_reduction(df, num_components):
    """Project a DataFrame onto its leading principal components via ``svd_basic``.

    The columns are mean-centred, a truncated SVD is computed, and each row is
    replaced by its scores ``U * S`` on the first ``num_components`` components.

    Args:
        df: Numeric DataFrame with rows as observations and columns as
            features. Column labels must be strings because they are joined
            into the output column names.
        num_components: Number of components to keep (any positive integer).

    Returns:
        DataFrame with the same index as ``df`` and ``num_components`` columns.
        Each column is named after the two features with the largest absolute
        loadings on that component, joined by ``', '``.
    """
    # Convert DataFrame to NumPy array and center each column on its mean
    data = df.to_numpy()
    centered_data = data - np.mean(data, axis=0)

    # Compute the SVD using the basic SVD implementation
    U, S, Vt = svd_basic(centered_data, num_components)

    # Scores on the principal components; scaling the columns of U by S is the
    # same as U @ diag(S)
    reduced_data = U[:, :num_components] * S[:num_components]

    # Singular vectors are only defined up to sign. Keep the existing
    # convention of orienting the second component so that the first row's
    # score is not positive. Guarded so that a single-component request (or an
    # empty frame) no longer raises IndexError.
    if num_components >= 2 and reduced_data.shape[0] > 0 and reduced_data[0, 1] > 0:
        reduced_data[:, 1] = -reduced_data[:, 1]

    # Get the two highest loadings for every retained component. This used to
    # be hard-coded to the first two components, which made any
    # num_components != 2 fail with a column-count mismatch.
    columns_array = np.array(df.columns)
    top_tags = columns_array[np.argsort(-np.abs(Vt[:num_components, :]))[:, :2]]

    # Create column names based on the top tags
    column_names = [', '.join(tags) for tags in top_tags]

    # Convert the reduced data back to a DataFrame
    reduced_df = pd.DataFrame(reduced_data, index=df.index, columns=column_names)

    return reduced_df


# Load the binary game/tag matrix; the CSV's 'Unnamed: 0' column becomes the index
df_game_tags_binary = pd.read_csv('data/game_tags_binary.csv', index_col='Unnamed: 0')

# Perform dimensionality reduction using the custom SVD (svd_reduction defined above)
num_components = 2
reduced_df = svd_reduction(df_game_tags_binary, num_components)

# Save the reduced DataFrame to a CSV file
reduced_df.to_csv('data/reduced_game_tags_custom_svd.csv')

# Display the reduced DataFrame (plain-text output via print)
print(reduced_df)

        Open World, Hack and Slash  Survival, Horror
10090                    -0.647711         -2.564552
13500                     2.631718          1.539265
22364                    -0.056554          0.364005
113020                   -2.726232          1.764430
226560                    0.798778         -1.103148


In [14]:
# Load a previously saved reduced tag matrix, using its 'Unnamed: 0' column as the index.
# NOTE(review): this path is not the 'data/reduced_game_tags_custom_svd.csv' file written by
# the custom-SVD cell; judging by the file name it was presumably produced with a library
# SVD for comparison -- confirm where this file comes from.
df_reduced_game_tags = pd.read_csv('data/reduced_game_tags_svd_original_library.csv', index_col='Unnamed: 0')
# Bare last expression: the notebook renders the first rows as this cell's output.
df_reduced_game_tags.head()

Unnamed: 0,"Open World, Hack and Slash","Survival, Zombies"
10090,-0.647711,-2.564552
13500,2.631718,1.539265
22364,-0.056554,0.364005
113020,-2.726232,1.76443
226560,0.798778,-1.103148
