# ETL DEL DATASET STEAM GAMES
<p>Para realizar la extracción del dataset, primero procedemos a importar las librerías necesarias</p>


#### Importar librerías

In [2]:
#proporciona funcionalidades para trabajar con archivos comprimidos en el formato gzip (.gz)
import gzip

#proporciona herramientas para trabajar con datos en formato JSON
import json 

#se usará para para evaluar cada línea del archivo como una expresión de Python
#lo que permite cargar datos que no están en formato JSON.
import ast 

import numpy as np
import pandas as pd 

<p>Ahora, para abrir el archivo comprimido, utilizaremos un bucle -for- que almacena la data en una lista vacía en la variable "data"</p>

In [3]:
data = []
with gzip.open('steam_games.json.gz', 'rb') as f:
    for line in f:
        data.append(json.loads(line))
        # *  = (json.loads(line)) 
        # ** = (ast.literal_eval(line.decode('utf-8'))) esta opcion porque no es json 
steam_games = pd.DataFrame(data)
f.close()

<p>Analizaremos el contenido del archivo json utilizando los métodos head, tail e info, de manera que podamos hacernos a una idea del estado general del mismo.

In [4]:
steam_games.info()
steam_games

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120445 entries, 0 to 120444
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   publisher     24083 non-null  object
 1   genres        28852 non-null  object
 2   app_name      32133 non-null  object
 3   title         30085 non-null  object
 4   url           32135 non-null  object
 5   release_date  30068 non-null  object
 6   tags          31972 non-null  object
 7   reviews_url   32133 non-null  object
 8   specs         31465 non-null  object
 9   price         30758 non-null  object
 10  early_access  32135 non-null  object
 11  id            32133 non-null  object
 12  developer     28836 non-null  object
dtypes: object(13)
memory usage: 11.9+ MB


Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
120440,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,http://store.steampowered.com/app/773640/Colon...,2018-01-04,"[Strategy, Indie, Casual, Simulation]",http://steamcommunity.com/app/773640/reviews/?...,"[Single-player, Steam Achievements]",1.99,False,773640,"Nikita ""Ghost_RUS"""
120441,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/733530/LOGis...,2018-01-04,"[Strategy, Indie, Casual]",http://steamcommunity.com/app/733530/reviews/?...,"[Single-player, Steam Achievements, Steam Clou...",4.99,False,733530,Sacada
120442,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,False,610660,Laush Dmitriy Sergeevich
120443,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,False,658870,"xropi,stev3ns"


Se revisa el DataFrame steam_games y se evidencian valores nulos y datos semiestructurados en  las columnas 'genres', 'tags', y 'specs'.
Adicionalmente se normalizan los valores nulos y vacíos con NaN para posteriormente eliminarlos en su totalidad.

In [5]:
# Ahora se reemplazan valores vacíos, 'null' y 'None' con NaN
steam_games.replace(['', 'null', 'None'], np.nan, inplace=True)
steam_games.head()

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,


In [6]:
# Filtrar registros donde hay nulos o NaN
steam_games[steam_games.isna()]

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
120440,,,,,,,,,,,,,
120441,,,,,,,,,,,,,
120442,,,,,,,,,,,,,
120443,,,,,,,,,,,,,


In [7]:
#eliminamos las filas con valores vacíos y reiniciamos 
#el índice del DataFrame después de eliminar las filas.
steam_games = steam_games.dropna(how='all').reset_index(drop=True)
steam_games

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980,Secret Level SRL
2,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,False,670290,Poolians.com
3,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"[Action, Adventure, Casual]",http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,False,767400,彼岸领域
4,,,Log Challenge,,http://store.steampowered.com/app/773570/Log_C...,,"[Action, Indie, Casual, Sports]",http://steamcommunity.com/app/773570/reviews/?...,"[Single-player, Full controller support, HTC V...",2.99,False,773570,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32130,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,http://store.steampowered.com/app/773640/Colon...,2018-01-04,"[Strategy, Indie, Casual, Simulation]",http://steamcommunity.com/app/773640/reviews/?...,"[Single-player, Steam Achievements]",1.99,False,773640,"Nikita ""Ghost_RUS"""
32131,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/733530/LOGis...,2018-01-04,"[Strategy, Indie, Casual]",http://steamcommunity.com/app/733530/reviews/?...,"[Single-player, Steam Achievements, Steam Clou...",4.99,False,733530,Sacada
32132,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,False,610660,Laush Dmitriy Sergeevich
32133,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,False,658870,"xropi,stev3ns"


In [8]:
#Se eliminan todos los valores NaN del dataset incluyendo las filas que los contienen.
steam_games_copy = steam_games.copy()
steam_games_copy.dropna(inplace=True)
steam_games = steam_games_copy
# Verifica la cantidad de valores NaN en el DataFrame steam_games
print(steam_games.isna().sum())

publisher       0
genres          0
app_name        0
title           0
url             0
release_date    0
tags            0
reviews_url     0
specs           0
price           0
early_access    0
id              0
developer       0
dtype: int64


In [9]:
# Cantidad de valores nulos en todas las columnas
print(steam_games.isnull().sum())
steam_games.shape

publisher       0
genres          0
app_name        0
title           0
url             0
release_date    0
tags            0
reviews_url     0
specs           0
price           0
early_access    0
id              0
developer       0
dtype: int64


(22521, 13)

Ahora analizamos la columna "release_date" con el fin de realizar algunas transformaciones en el formato de fecha.

In [10]:
steam_games["release_date"].head(10)

0     2018-01-04
1     2018-01-04
2     2017-07-24
3     2017-12-07
5     2018-01-04
7     2018-01-04
8     2018-01-04
12    2018-01-04
13    2018-01-04
14    2018-01-03
Name: release_date, dtype: object

In [11]:
#Convertimos la columa release_date a formato fecha
steam_games['release_date']=pd.to_datetime(steam_games['release_date'], errors='coerce', exact=False)

In [12]:
# Extraer el año y crear una nueva columna 'release_year'
steam_games['year'] = steam_games['release_date'].dt.year.astype('Int64')

In [13]:
# Eliminar la columna 'release_date'
steam_games = steam_games.drop(columns=['release_date'])

In [14]:
steam_games.tail()

Unnamed: 0,publisher,genres,app_name,title,url,tags,reviews_url,specs,price,early_access,id,developer,year
32129,Bidoniera Games,"[Action, Adventure, Casual, Indie]",Kebab it Up!,Kebab it Up!,http://store.steampowered.com/app/745400/Kebab...,"[Action, Indie, Casual, Violent, Adventure]",http://steamcommunity.com/app/745400/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",1.99,False,745400,Bidoniera Games,2018
32130,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,http://store.steampowered.com/app/773640/Colon...,"[Strategy, Indie, Casual, Simulation]",http://steamcommunity.com/app/773640/reviews/?...,"[Single-player, Steam Achievements]",1.99,False,773640,"Nikita ""Ghost_RUS""",2018
32131,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/733530/LOGis...,"[Strategy, Indie, Casual]",http://steamcommunity.com/app/733530/reviews/?...,"[Single-player, Steam Achievements, Steam Clou...",4.99,False,733530,Sacada,2018
32132,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russi...,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,False,610660,Laush Dmitriy Sergeevich,2018
32133,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/658870/EXIT_...,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,False,658870,"xropi,stev3ns",2017


Comparamos los valores de las columnas "app_name" y "title", con el fin de saber si tienen los mismos valores, de manera que podamos eliminar una de ellas. En este caso eliminaremos la columna app_name.

In [15]:
steam_games[["app_name","title"]].info()

<class 'pandas.core.frame.DataFrame'>
Index: 22521 entries, 0 to 32133
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   app_name  22521 non-null  object
 1   title     22521 non-null  object
dtypes: object(2)
memory usage: 527.8+ KB


Ahora comparamos las columnas "publisher y "developer". El propósito es establecer si tienen los mismos registros como para tomar la decisión de eliminar una. 


In [16]:
#Comparamos si las columnas publisher y developer tienen similares registros
son_iguales = steam_games['publisher'] == steam_games['developer']
no_son_iguales = steam_games['publisher'] != steam_games['developer']

In [17]:
print(f"Son iguales {son_iguales.sum()}")
print(f"No son iguales {no_son_iguales.sum()}")

Son iguales 11528
No son iguales 10993


Las columnas no tienen valores NaN pero tampoco los mismos nombres en todos los casos, puesto que un editor no necesariamente es el mismo desarrollador.

In [18]:
# Renombrar columna "id" por "item_id"
steam_games.rename(columns={'id': 'item_id'}, inplace=True)

In [19]:
# Se eliminan las columnas 'app_name','url','reviews_url','early_access','publisher','release_date' por considerarse no relevantes
steam_games.drop(['app_name','url','reviews_url','early_access','publisher'], axis=1, inplace=True)

In [20]:
# Convertir 'price' a tipo numérico y las opciones'Free To Play' a cero.
steam_games['price'] = pd.to_numeric(steam_games['price'], errors='coerce')
steam_games.fillna({"price":0},inplace=True)

In [21]:
#Revisamos los cambios realizados
steam_games

Unnamed: 0,genres,title,tags,specs,price,item_id,developer,year
0,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,"[Strategy, Action, Indie, Casual, Simulation]",[Single-player],4.99,761140,Kotoshiro,2018
1,"[Free to Play, Indie, RPG, Strategy]",Ironbound,"[Free to Play, Strategy, Indie, RPG, Card Game...","[Single-player, Multi-player, Online Multi-Pla...",0.00,643980,Secret Level SRL,2018
2,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,"[Free to Play, Simulation, Sports, Casual, Ind...","[Single-player, Multi-player, Online Multi-Pla...",0.00,670290,Poolians.com,2017
3,"[Action, Adventure, Casual]",弹炸人2222,"[Action, Adventure, Casual]",[Single-player],0.99,767400,彼岸领域,2017
5,"[Action, Adventure, Simulation]",Battle Royale Trainer,"[Action, Adventure, Simulation, FPS, Shooter, ...","[Single-player, Steam Achievements]",3.99,772540,Trickjump Games Ltd,2018
...,...,...,...,...,...,...,...,...
32129,"[Action, Adventure, Casual, Indie]",Kebab it Up!,"[Action, Indie, Casual, Violent, Adventure]","[Single-player, Steam Achievements, Steam Cloud]",1.99,745400,Bidoniera Games,2018
32130,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,"[Strategy, Indie, Casual, Simulation]","[Single-player, Steam Achievements]",1.99,773640,"Nikita ""Ghost_RUS""",2018
32131,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,"[Strategy, Indie, Casual]","[Single-player, Steam Achievements, Steam Clou...",4.99,733530,Sacada,2018
32132,"[Indie, Racing, Simulation]",Russian Roads,"[Indie, Simulation, Racing]","[Single-player, Steam Achievements, Steam Trad...",1.99,610660,Laush Dmitriy Sergeevich,2018


In [26]:
#steam_games[steam_games["item_id"].str.contains("227300", case=False)]


## EXPORTAMOS EL CONJUNTO DE DATOS
<P> Guardamos el dataframe limpio en diferentes formatos que nos permitan manipularlo de acuerdo a la necesidad. 

In [23]:
# Los archivos se almacenan en la carpeta \data. Los formatos exportados son: csv, json, parquet.
#steam_games.to_csv('data\steam_games_cln.csv', index=False)
#steam_games.to_json('data\steam_games_cln.json', orient='records', lines=True)
#steam_games.to_parquet('data\steam_games_cln.parquet', index=False)