In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns

In [2]:
# Load every CSV input of the notebook; all paths are relative to the
# current working directory.

# Per-game tables that are later inner-joined on their "steam_appid" column.
steam_description_df = pd.read_csv("steam_description_data.csv")
steam_media_df = pd.read_csv("steam_media_data.csv")
steam_requirements_df = pd.read_csv("steam_requirements_data.csv")
steam_support_df = pd.read_csv("steam_support_info.csv")
# User activity log; downstream code reads its UserID, Name, Action and Value columns.
steam_user = pd.read_csv("steam_user_behavior.csv")
# These two tables expose the game id as "appid"; it is renamed to
# "steam_appid" further down so they can join with the tables above.
steam_df = pd.read_csv("steam.csv")
steamspy_tag_df = pd.read_csv("steamspy_tag_data.csv")

In [3]:
# Split the activity log into its two kinds of rows: purchases and play sessions.
purchase_data = steam_user[steam_user['Action'] == 'purchase'].copy()
play_data = steam_user[steam_user['Action'] == 'play'].copy()

# Flag every purchase with 1 and expose the logged play value under its own name,
# so the two 'Value' columns can be told apart after the merge.
purchase_data['Value_purchase'] = 1
play_data['Value_play'] = play_data['Value']

# Left join: every purchase row is kept, even when the user never played the game.
merged_data = pd.merge(purchase_data, play_data, on=['UserID', 'Name'], how='left')

# Purchases without a matching play row get 0 played.
# The filled column is assigned back instead of calling
# `merged_data['Value_play'].fillna(0, inplace=True)`: an in-place call on a
# selected column acts on a temporary object, raises a FutureWarning on
# pandas 2.x and leaves the NaN values untouched once Copy-on-Write is enabled.
merged_data['Value_play'] = merged_data['Value_play'].fillna(0)

# Drop the suffixed leftovers of the merge (both frames carried 'Action' and 'Value').
merged_data = merged_data.drop(columns=['Action_x', 'Action_y', 'Value_x', 'Value_y'])
# The purchase flag is constant (always 1), so it is left out of the clean table.
steam_user_clean = merged_data.drop(columns='Value_purchase')


# Show the result
print(steam_user_clean.head())

      UserID                        Name  Value_play
0  151603712  The Elder Scrolls V Skyrim       273.0
1  151603712                   Fallout 4        87.0
2  151603712                       Spore        14.9
3  151603712           Fallout New Vegas        12.1
4  151603712               Left 4 Dead 2         8.9


In [4]:
from sklearn.preprocessing import LabelEncoder

# Turn each game title into an integer label. The fitted encoder stays
# available as `encoder` for anything that needs it afterwards.
encoder = LabelEncoder()
steam_user_clean['Name_encoded'] = encoder.fit_transform(steam_user_clean['Name'])

# The textual title is no longer needed once its label exists.
del steam_user_clean['Name']

# Put the columns in their final order: user, game label, play value.
column_order = ['UserID', 'Name_encoded', 'Value_play']
steam_user_clean = steam_user_clean[column_order]

print(steam_user_clean.head())

      UserID  Name_encoded  Value_play
0  151603712          4364       273.0
1  151603712          1678        87.0
2  151603712          3997        14.9
3  151603712          1679        12.1
4  151603712          2475         8.9


In [5]:
from scipy.stats import percentileofscore

# Rate every (user, game) row by the percentile of its played hours within the
# whole column, on a 0-100 integer scale.
#
# Calling scipy's percentileofscore once per row rescans the full column for
# every value (quadratic in the number of rows). The same statistic is obtained
# from a single ranking pass: with kind='rank' (the default) the percentile of a
# value v is (count(< v) + count(<= v) + 1) * 50 / n, and
# count(< v) + count(<= v) + 1 is exactly twice the average rank of v.
play_hours = steam_user_clean['Value_play']
n_rows = len(play_hours)

# Average ranks are whole or half numbers, so doubling them gives exact integers.
doubled_rank = (play_hours.rank(method='average') * 2).astype('int64')

# Percentile of the played hours, as a float between 0 and 100.
percentiles = doubled_rank * 50.0 / n_rows

# Integer rating = percentile rounded down. Integer arithmetic is used so that
# floating-point noise cannot shift a value sitting on an integer boundary.
rating = doubled_rank * 50 // n_rows

# Add the "Rating" column to the DataFrame
steam_user_clean['Rating'] = rating

In [9]:
# Save the user table to CSV, leaving out the DataFrame index.
steam_user_clean.to_csv('steam_user_clean.csv', index=False)

# A notebook only renders the value of a cell's last expression, so the preview
# has to come after the export; placed before it, the result was discarded.
steam_user_clean.head(100)

In [10]:
# These two tables name the game id "appid"; rename it to "steam_appid" so they
# share the join key used when the game tables are merged.
for catalog_df in (steam_df, steamspy_tag_df):
    catalog_df.rename(columns={"appid": "steam_appid"}, inplace=True)

In [11]:
# Combine all per-game tables into one frame. Starting from the descriptions,
# each further table is inner-joined on "steam_appid", in this order.
games_df = steam_description_df
for info_df in (
    steam_media_df,
    steam_requirements_df,
    steam_support_df,
    steam_df,
    steamspy_tag_df,
):
    games_df = games_df.merge(info_df, on="steam_appid")

In [13]:
# Save the combined game table to CSV, leaving out the DataFrame index.
games_df.to_csv('games_df.csv', index=False)

# A notebook only renders the value of a cell's last expression, so the preview
# has to come after the export; placed before it, the result was discarded.
games_df.head()

In [14]:
# Build a reduced game table that keeps only the columns listed below.
selected_columns = [
    'steam_appid',
    'short_description',
    'name',
    'release_date',
    'developer',
    'platforms',
    'categories',
    'genres',
    'steamspy_tags',
    'price',
]
games_df_clean = games_df[selected_columns]

In [15]:
# Save the reduced game table to CSV, leaving out the DataFrame index.
games_df_clean.to_csv('games_df_clean.csv', index=False)