In [2]:
import pandas as pd
import numpy as np
import re
import math

In [3]:
# HELPER FUNCTION DEFINITIONS

def asignar_precio(dato):
    """Convert a raw ``price`` cell into a numeric price.

    Parameters
    ----------
    dato : Any
        Raw value of the ``price`` column. Strings are interpreted as
        described below; any other value is passed through untouched.

    Returns
    -------
    float or Any
        * ``0.0`` when the text contains "free" (case-insensitive).
        * When the text contains "starting", the first dollar amount found
          in it (e.g. ``"Starting at $499.00"`` -> ``499.0``), or ``0.0``
          when no dollar amount is present.
        * ``0.0`` for any other text.
        * ``dato`` itself when it is not a string (numbers, NaN, None).
    """
    if not isinstance(dato, str):
        return dato

    texto = dato.lower()

    # Any mention of "free" means the game costs nothing.
    if 'free' in texto:
        return 0.0

    if 'starting' in texto:
        # Take only the FIRST well-formed amount after a '$'. Joining every
        # '$' token, or accepting any non-space run, produced unparsable
        # strings ('' / '4.999.99' / '4.99,') and crashed float().
        coincidencia = re.search(r'\$(\d[\d,]*(?:\.\d+)?|\.\d+)', texto)
        if coincidencia:
            # Drop thousands separators before converting ("1,299.00").
            return float(coincidencia.group(1).replace(',', ''))

    # Text without a recognizable price is treated as 0.
    return 0.0

def extraer_anio(dato):
    """Extract a four-digit year from a raw release-date value.

    Parameters
    ----------
    dato : Any
        Raw value of the ``release_date`` column.

    Returns
    -------
    str or Any
        * For strings: the first run of four digits found in the text
          (e.g. ``"2018-01-04"`` -> ``"2018"``), or the placeholder
          ``"1900"`` when the text contains no four-digit run.
        * ``dato`` itself when it is not a string.
    """
    if not isinstance(dato, str):
        return dato

    # Use only the first match: concatenating every 4-digit group turned
    # values such as "2017-2018" into the meaningless "20172018".
    coincidencia = re.search(r'\d{4}', dato)
    return coincidencia.group(0) if coincidencia else '1900'

def transformar_lista(dato):
    """Flatten a list of strings into a single dash-separated string.

    Lists are joined with ``' - '`` and then stripped of braces, brackets,
    quotes, spaces, colons and commas, so ``['Action', 'Free to Play']``
    becomes ``'Action-FreetoPlay'``. Any value that is not exactly a
    ``list`` is returned unchanged.
    """
    if type(dato) != list:
        return dato

    # Characters deleted from the joined text (same set as before, applied
    # in a single pass instead of one replace() per character).
    tabla_borrado = str.maketrans('', '', "{}' []:,\"")
    unido = ' - '.join(dato)
    return unido.translate(tabla_borrado)


In [4]:
# Load the raw Steam games data into `df1` with pandas, reading the gzip-compressed
# JSON file directly (lines=True: one JSON record per line); per the original author
# the file is well-formed, so no manual pre-processing is done.
# NOTE(review): the path is absolute and user-specific, so this cell only runs on a
# machine where that exact file exists — consider a configurable data directory.
df1 = pd.read_json("C:/Users/eduen/AppData/Local/Temp/steam_games.json.gz", lines=True, compression='gzip')


In [6]:
# Bind the loaded frame to the working name `df_steam_games`.
# This is a plain assignment: both names refer to the same DataFrame object (no copy is made).
df_steam_games = df1

In [7]:
# Drop the columns 'user_id', 'steam_id', 'items', 'items_count' and 'metascore'
df_steam_games = df_steam_games.drop(columns=['user_id', 'steam_id', 'items', 'items_count', 'metascore'])
# Drop every row whose 'title' is null
df_steam_games = df_steam_games.dropna(subset=['title'])
# Drop repeated rows, using 'title' as the key
df_steam_games = df_steam_games.drop_duplicates(subset=['title'])
# Replace textual values with numbers in 'price'
df_steam_games['price'] = df_steam_games['price'].apply(asignar_precio)
# Fill null prices with the mean of the known prices
df_steam_games['price'] = df_steam_games['price'].fillna(df_steam_games['price'].mean())
# Extract the year from the release dates
df_steam_games['release_date'] = df_steam_games['release_date'].apply(extraer_anio)
# Parse the year; anything unparsable becomes NaN and is then replaced by the placeholder 1900
df_steam_games['release_date'] = pd.to_datetime(df_steam_games['release_date'], format= '%Y', errors= 'coerce').dt.year
# Assign the result back instead of calling fillna(..., inplace=True) on a column
# selection: that chained form is deprecated and has no effect under Copy-on-Write,
# which would leave NaN in place and make the integer cast below fail.
df_steam_games['release_date'] = df_steam_games['release_date'].fillna(1900).astype(int)
# Replace nulls in the categorical columns with the label 'Dato Desconocido'
columnas_categoricas = ['publisher', 'genres', 'tags', 'reviews_url', 'specs', 'developer']
for columna in columnas_categoricas:
    df_steam_games[columna] = df_steam_games[columna].fillna('Dato Desconocido')
# Missing ids get the value max(id) + 1 (note: every missing id receives that same value)
df_steam_games['id'] = df_steam_games['id'].fillna(df_steam_games['id'].max() + 1)
# Replace nulls with zero in 'discount_price'
df_steam_games['discount_price'] = df_steam_games['discount_price'].fillna(0)
# Flatten list-valued cells into dash-separated strings
df_steam_games = df_steam_games.map(transformar_lista)
# Make sure 'publisher' is stored as text
df_steam_games['publisher'] = df_steam_games['publisher'].astype(str)

print(df_steam_games['genres'].unique())

['Action-Casual-Indie-Simulation-Strategy' 'FreetoPlay-Indie-RPG-Strategy'
 'Casual-FreetoPlay-Indie-Simulation-Sports' 'Action-Adventure-Casual'
 'Action-Adventure-Simulation' 'FreetoPlay-Indie-Simulation-Sports'
 'Casual-Indie-Racing-Simulation' 'Action-Indie-Simulation-EarlyAccess'
 'Action-Adventure-Casual-Indie-RPG' 'Casual-Indie'
 'Casual-Indie-Simulation' 'Adventure-Casual-Indie-Simulation-Strategy'
 'Action-Adventure-Indie' 'Racing-Simulation-Sports' 'Action-Indie'
 'Dato Desconocido' 'Action-Indie-Racing' 'Action' 'Action-Indie-RPG'
 'Casual-Indie-Strategy'
 'Action-Casual-Indie-MassivelyMultiplayer-Simulation-Sports-Strategy'
 'Adventure-Casual-Indie' 'Action-Adventure-RPG'
 'Adventure-Casual-Indie-RPG' 'Strategy'
 'Indie-MassivelyMultiplayer-RPG-EarlyAccess'
 'Adventure-FreetoPlay-RPG-EarlyAccess'
 'Action-Indie-Strategy-EarlyAccess' 'Action-Adventure-Indie-RPG-Strategy'
 'Action-Adventure-Indie-RPG' 'Casual-Strategy-EarlyAccess'
 'Adventure-Casual-FreetoPlay-Indie' 'Indie-S

In [8]:
# Summarize the cleaned frame: column names, non-null counts, dtypes and memory usage.
df_steam_games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30054 entries, 88310 to 120443
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   publisher       30054 non-null  object 
 1   genres          30054 non-null  object 
 2   app_name        30054 non-null  object 
 3   title           30054 non-null  object 
 4   url             30054 non-null  object 
 5   release_date    30054 non-null  int64  
 6   tags            30054 non-null  object 
 7   reviews_url     30054 non-null  object 
 8   discount_price  30054 non-null  float64
 9   specs           30054 non-null  object 
 10  price           30054 non-null  float64
 11  early_access    30054 non-null  float64
 12  id              30054 non-null  float64
 13  developer       30054 non-null  object 
dtypes: float64(4), int64(1), object(9)
memory usage: 3.4+ MB


In [9]:
# Write the cleaned data in CSV format to a file named 'steam_games' (relative path),
# leaving out the index column.
# NOTE(review): the file name has no '.csv' extension — confirm that downstream
# readers expect this exact name before changing it.
df_steam_games.to_csv('steam_games', index= False)