# Proceso de ETL: Steam_games

### Librerias necesarias

In [50]:
import pandas as pd 
import json
import html
import warnings
warnings.filterwarnings("ignore")

## Extraction y primera exploración: 
Datos de archivo JSON se convierten a Dataframe para examinar su contenido.

In [51]:
 #Leer el archivo JSON
games = []
with open('../DatasetsSTEAM/output_steam_games.json', 'r') as f:
    for line in f.readlines():
        data = json.loads(line)
        games.append(data)

#convertirlo a un dataframe
df_games = pd.DataFrame(games)

In [52]:
df_games.head(3)

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
120440,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,http://store.steampowered.com/app/773640/Colon...,2018-01-04,"[Strategy, Indie, Casual, Simulation]",http://steamcommunity.com/app/773640/reviews/?...,"[Single-player, Steam Achievements]",1.99,False,773640,"Nikita ""Ghost_RUS"""
120441,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/733530/LOGis...,2018-01-04,"[Strategy, Indie, Casual]",http://steamcommunity.com/app/733530/reviews/?...,"[Single-player, Steam Achievements, Steam Clou...",4.99,False,733530,Sacada
120442,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,False,610660,Laush Dmitriy Sergeevich
120443,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,False,658870,"xropi,stev3ns"


## Transformación: 
Se observan filas que contienen vacíos en todo el registro. Se eliminan dichos registros.

In [53]:
df_games = df_games.dropna(how='all').reset_index(drop=True)
df_games.shape

(32135, 13)

In [54]:
# Verificamos si existen valores nulos en cada columna
df_games.isnull().sum().sort_values(ascending=False)

publisher       8052
developer       3299
genres          3283
release_date    2067
title           2050
price           1377
specs            670
tags             163
app_name           2
reviews_url        2
id                 2
url                0
early_access       0
dtype: int64

### Verificación de duplicados

In [55]:
# Verificar registros duplicados en la columna 'id'
duplicados = df_games[df_games.duplicated(subset=['id'], keep=False)]
duplicados

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
74,,,,,http://store.steampowered.com/,,,,,19.99,False,,
13894,Bethesda Softworks,[Action],Wolfenstein II: The New Colossus,Wolfenstein II: The New Colossus,http://store.steampowered.com/app/612880/,2017-10-26,"[Action, FPS, Gore, Violent, Alternate History...",http://steamcommunity.com/app/612880/reviews/?...,"[Single-player, Steam Achievements, Full contr...",59.99,False,612880.0,Machine Games
14573,Bethesda Softworks,[Action],Wolfenstein II: The New Colossus,Wolfenstein II: The New Colossus,http://store.steampowered.com/app/612880/Wolfe...,2017-10-26,"[Action, FPS, Gore, Violent, Alternate History...",http://steamcommunity.com/app/612880/reviews/?...,"[Single-player, Steam Achievements, Full contr...",59.99,False,612880.0,Machine Games
30961,"Warner Bros. Interactive Entertainment, Feral ...","[Action, Adventure]",Batman: Arkham City - Game of the Year Edition,Batman: Arkham City - Game of the Year Edition,http://store.steampowered.com/app/200260,2012-09-07,"[Action, Open World, Batman, Adventure, Stealt...",,"[Single-player, Steam Achievements, Steam Trad...",19.99,False,,"Rocksteady Studios,Feral Interactive (Mac)"


In [56]:
# se eliminan los index duplicados 
index_a_eliminar = [14573, 74, 30961]
df_games = df_games.drop(index_a_eliminar)

### Transformación de la columna 'release_date'


Extraemos el año de lanzamiento del juego, para ello se crea una nueva columna con el dato. Luego se elimina la columa 'release_date'.

In [57]:
# Convertir la columna 'release_date' al formato de fecha y hora, manejar errores
df_games['release_date'] = pd.to_datetime(df_games['release_date'], errors='coerce')

# Extraer el año y crear una nueva columna 'release_year' de las fechas válidas
df_games['release_year'] = df_games['release_date'].dt.year

# elimina la columna 'release_date'
df_games = df_games.drop('release_date', axis=1)
df_games.head(3)

Unnamed: 0,publisher,genres,app_name,title,url,tags,reviews_url,specs,price,early_access,id,developer,release_year
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,2018.0
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,"[Free to Play, Strategy, Indie, RPG, Card Game...",http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980,Secret Level SRL,2018.0
2,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,"[Free to Play, Simulation, Sports, Casual, Ind...",http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,False,670290,Poolians.com,2017.0
3,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,"[Action, Adventure, Casual]",http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,False,767400,彼岸领域,2017.0
4,,,Log Challenge,,http://store.steampowered.com/app/773570/Log_C...,"[Action, Indie, Casual, Sports]",http://steamcommunity.com/app/773570/reviews/?...,"[Single-player, Full controller support, HTC V...",2.99,False,773570,,


In [58]:
df_games.isnull().sum().sort_values(ascending=False)

publisher       8051
developer       3298
genres          3282
release_year    2351
title           2049
price           1377
specs            669
tags             162
app_name           1
url                0
reviews_url        0
early_access       0
id                 0
dtype: int64

Eliminamos las filas con valores NaN (valores nulos) en la columna 'release_year' y reindexamos el DataFrame

In [59]:
# Eliminar filas con valores NaN en 'release_year' y reindexar el DataFrame
df_games.dropna(subset=['release_year'], inplace=True)
df_games.reset_index(drop=True, inplace=True)

## Transformación de la columna 'price'

Se necesita trabajar con esta columna, pero hay valores de texto en casos de promociones o que el item es gratis. Por lo que se decide reemplazar esos valores por 0. Por otra parte, esta columna tiene algunos valores nulos, pero como mas adelante será necesario hacer operaciones, se adopta imputar como 0 los valores nulos.

In [60]:
def convertir_a_float(valor):
    if pd.isna(valor):
        return 0.0
    try:
        float_value = float(valor)
        return float_value
    except:
        return 0.0

# Reemplazar valores no numéricos y nulos en la columna por 0.0
df_games['price'] = df_games['price'].apply(convertir_a_float)


# Mostrar los valores con el formato deseado
df_games['price'] = df_games['price'].apply(lambda x: '{:.2f}'.format(x))



In [62]:
# se observa el tipo de dato en 'price'
df_games['price'].unique()

array(['4.99', '0.00', '0.99', '3.99', '9.99', '18.99', '29.99', '10.99',
       '2.99', '1.59', '14.99', '1.99', '59.99', '8.99', '6.99', '7.99',
       '39.99', '19.99', '7.49', '12.99', '5.99', '2.49', '15.99', '1.25',
       '24.99', '17.99', '61.99', '3.49', '11.99', '13.99', '34.99',
       '1.49', '32.99', '99.99', '14.95', '69.99', '16.99', '79.99',
       '49.99', '5.00', '44.99', '13.98', '29.96', '119.99', '109.99',
       '149.99', '771.71', '21.99', '89.99', '0.98', '139.92', '4.29',
       '64.99', '54.99', '74.99', '0.89', '0.50', '299.99', '1.29',
       '3.00', '15.00', '5.49', '23.99', '49.00', '20.99', '10.93',
       '1.39', '36.99', '4.49', '2.00', '4.00', '234.99', '1.95', '1.50',
       '199.00', '189.00', '6.66', '27.99', '10.49', '129.99', '179.00',
       '26.99', '399.99', '31.99', '399.00', '20.00', '40.00', '3.33',
       '22.99', '320.00', '38.85', '71.70', '995.00', '27.49', '3.39',
       '6.00', '19.95', '499.99', '199.99', '16.06', '4.68', '131.40',
  

### Transformación columna 'genres'
La columna 'genres' esta formada por una lista de los distintos géneros de los videojuegos. Se necesita crear múltiples filas para cada lista de géneros.

In [63]:
# Divide las filas que contienen listas en la columna 'genres', creando múltiples filas para cada lista de géneros
df_games = df_games.explode('genres')

# Elimina las filas que contienen valores nulos en la columna 'genres'
df_games = df_games.dropna(subset=['genres'])

# Muestra las primeras filas del DataFrame 'df_games' después de realizar las transformaciones anteriores
df_games.head(3)

Unnamed: 0,publisher,genres,app_name,title,url,tags,reviews_url,specs,price,early_access,id,developer,release_year
0,Kotoshiro,Action,Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,2018.0
0,Kotoshiro,Casual,Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,2018.0
0,Kotoshiro,Indie,Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,2018.0
0,Kotoshiro,Simulation,Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,2018.0
0,Kotoshiro,Strategy,Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,2018.0


Tambien es necesario que los valores unicos de la columna 'genres' este normalizado

In [64]:
# Obtener valores únicos de la columna 'genres' y ordenarlos de forma ascendente
valores_unicos_genres = sorted(df_games['genres'].unique())
valores_unicos_genres

['Accounting',
 'Action',
 'Adventure',
 'Animation &amp; Modeling',
 'Audio Production',
 'Casual',
 'Design &amp; Illustration',
 'Early Access',
 'Education',
 'Free to Play',
 'Indie',
 'Massively Multiplayer',
 'Photo Editing',
 'RPG',
 'Racing',
 'Simulation',
 'Software Training',
 'Sports',
 'Strategy',
 'Utilities',
 'Video Production',
 'Web Publishing']

Se observa que la columna 'genres' tiene algunas filas con una representación HTML del carácter '&' en el texto. Vamos a representarlas correctamente como '&'.

In [65]:
# Define una función para corregir los valores en la columna 'genres'
def corregir_amp(text):
    return html.unescape(text) if '&amp;' in text else text

# Aplica la función a la columna 'genres'
df_games['genres'] = df_games['genres'].apply(corregir_amp)

### Columnas 'app_name','specs','early_access','publisher','tags','reviews_url','price', 'developer'
Estas columnas, no se utilizarán para las funciones de la API ni para el modelo de recomendación, por lo tanto se eliminan del dataframe.

In [68]:
# Elimina columnas específicas 
df_games.drop(columns=['app_name','specs','early_access','publisher','tags','reviews_url','price','developer','url'],inplace=True)
df_games.head(3)

Ultimo retoque para un futuro merge con la tabla items que nos permita unificar las tres tablas

In [69]:
# Renombra la columna 'id' a 'item_id' en el DataFrame df_games
df_games.rename(columns={"id":"item_id"},inplace=True)
df_games.reset_index(drop=True, inplace=True)
df_games.head(3)

Unnamed: 0,publisher,genres,app_name,title,url,tags,reviews_url,specs,price,early_access,item_id,developer,release_year
0,Kotoshiro,Action,Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,2018.0
1,Kotoshiro,Casual,Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,2018.0
2,Kotoshiro,Indie,Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,2018.0


## Carga del dataset
Se guarda el dataframe transformado como steam_games.parquet

In [70]:
# Guarda el DataFrame df_games en formato Parquet en el archivo 'steam_games.parquet' dentro de la carpeta 'Datasets'
df_games.to_parquet('../Datasets/steam_games.parquet')