# Analisis Exploratorio de datos

In [1]:
import pandas as pd

import os

## Steam Games Limpieza

- Eliminacion de columnas nulas.
- Tratamiento de filas duplicadas.

In [2]:
path_data =  os.path.join('..','data','raw','steam_games.json.gz')

steam_games = pd.read_json(path_data, compression='gzip',lines=True)
 

In [3]:
# Eliminacion de filas que todos sus datos son nulos.
steam_games.dropna(how='all',inplace=True)

In [4]:
# Datos duplicados
steam_games[steam_games['id'].duplicated(keep=False)]


Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
88384,,,,,http://store.steampowered.com/,,,,,19.99,0.0,,
102204,Bethesda Softworks,[Action],Wolfenstein II: The New Colossus,Wolfenstein II: The New Colossus,http://store.steampowered.com/app/612880/,2017-10-26,"[Action, FPS, Gore, Violent, Alternate History...",http://steamcommunity.com/app/612880/reviews/?...,"[Single-player, Steam Achievements, Full contr...",59.99,0.0,612880.0,Machine Games
102883,Bethesda Softworks,[Action],Wolfenstein II: The New Colossus,Wolfenstein II: The New Colossus,http://store.steampowered.com/app/612880/Wolfe...,2017-10-26,"[Action, FPS, Gore, Violent, Alternate History...",http://steamcommunity.com/app/612880/reviews/?...,"[Single-player, Steam Achievements, Full contr...",59.99,0.0,612880.0,Machine Games
119271,"Warner Bros. Interactive Entertainment, Feral ...","[Action, Adventure]",Batman: Arkham City - Game of the Year Edition,Batman: Arkham City - Game of the Year Edition,http://store.steampowered.com/app/200260,2012-09-07,"[Action, Open World, Batman, Adventure, Stealt...",,"[Single-player, Steam Achievements, Steam Trad...",19.99,0.0,,"Rocksteady Studios,Feral Interactive (Mac)"


In [5]:
# Eliminación de datos duplicados

index_duplicados_drop = [88384,102204,119271]
steam_games.drop(index=index_duplicados_drop, axis=1, inplace=True)

In [6]:
# Transforma una lista de elementos en un texto separado por coma
def swap_to_text(x):
  """Convierte una lista de strings en un solo texto separado por comas y espacios."""
  if x is not None:
      return ', '.join(x)
  return x


steam_games['genres'] = steam_games['genres'].apply(swap_to_text)
steam_games['tags'] = steam_games['tags'].apply(swap_to_text)
steam_games['specs'] = steam_games['specs'].apply(swap_to_text)

### Tratamiento de datos nulos

In [7]:
pd.DataFrame(steam_games.isna().sum(),columns=['Nulls'])

Unnamed: 0,Nulls
publisher,8051
genres,3282
app_name,1
title,2049
url,0
release_date,2066
tags,162
reviews_url,0
specs,669
price,1377


#### Analisis puntual de nulos

In [8]:
# Eliminaremos las filas con app_name  nulo 
indices_a_borrar = steam_games[steam_games.app_name.isna()].index
steam_games.drop(index=indices_a_borrar, inplace=True)

# Rellenamos los titulos vacios con la columna app_name
steam_games['title'] = steam_games['app_name'].fillna(steam_games['title'])

# Completar los generos vacios con las etiquetas de los Tags
steam_games['genres'] = steam_games['tags'].fillna(steam_games['genres'])

# Si el dato en publisher se encuentrar nulo, entonces vamos a asumir que fue el desarrollador el publicador
steam_games['publisher'].fillna(steam_games['developer'],inplace=True)

# Si el dato en desarrollador se encuentrar nulo, entonces vamos a asumir que fue el publicador
steam_games['developer'].fillna(steam_games['publisher'],inplace=True)

# Si no hay datos del desarrollador y el publicador, entonces precederemos llenar 'otros'
steam_games['publisher'].fillna('Otros', inplace=True)
steam_games['developer'].fillna('Otros', inplace=True)


#### Capitalización de texto

In [9]:
# Capitalizacion de texto
columns_capitalize = ['developer', 'publisher', 'app_name','title']

for column in columns_capitalize:
  steam_games[column] = steam_games[column].str.capitalize()
  
# Eliminacion de espacios en blanco
steam_games[columns_capitalize] = steam_games[columns_capitalize].apply(lambda x: x.str.strip())

#### Tratamiento de fechas

In [10]:
# Cambiamos a formato fecha columna release_date y pusumos nulos los que no tenian un formato adecuado para poder establecer fecha
steam_games['release_date'] = pd.to_datetime(steam_games['release_date'], format='%Y-%m-%d', errors='coerce')


In [11]:
# Interpolamos fechas nulas usando interpolate. Este método lo he utilzado porque evita que cambie la distrbución de mis datos 
steam_games['release_date'] = steam_games['release_date'].interpolate()
steam_games['release_date'] = pd.to_datetime(steam_games['release_date'], format='%Y-%m-%d', errors='coerce')


In [12]:
# Crearemos las nuevas columnas 'release_year' y 'release_month'
steam_games['release_year'] = steam_games['release_date'].dt.year
steam_games['release_month'] = steam_games['release_date'].dt.month
steam_games.drop(columns='release_date',inplace=True)

In [13]:
steam_games.isna().sum()

publisher           0
genres            138
app_name            0
title               0
url                 0
tags              162
reviews_url         0
specs             669
price            1376
early_access        0
id                  0
developer           0
release_year        0
release_month       0
dtype: int64

### Analísis de columna Price

In [14]:
steam_games['price'].str.contains(r'\D+', case=False, na=False)

88310     False
88311      True
88312      True
88313     False
88314     False
          ...  
120440    False
120441    False
120442    False
120443    False
120444    False
Name: price, Length: 32131, dtype: bool

In [15]:
def filtro_reg_exp(reg_exp, df):
  "Filtra una columan por una expresion regular reg_exp "
  mask = df.str.contains(reg_exp, case=False, na=False)
  return df[mask]

In [16]:
# Encontremos los valores de precio que son de tipo str
filtro_reg_exp(r'\D+' ,steam_games['price']).unique()

array(['Free To Play', 'Free to Play', 'Free', 'Free Demo',
       'Play for Free!', 'Install Now', 'Play WARMACHINE: Tactics Demo',
       'Free Mod', 'Install Theme', 'Third-party', 'Play Now',
       'Free HITMAN™ Holiday Pack', 'Play the Demo',
       'Starting at $499.00', 'Starting at $449.00', 'Free to Try',
       'Free Movie', 'Free to Use'], dtype=object)

In [17]:
input_price = {'Free To Play': 0, 
              'Free to Play': 0, 
              'Free': 0, 
              'Free Demo': 0,
              'Play for Free!': 0,
              'Install Now': 0, 
              'Play WARMACHINE: Tactics Demo': 0,
              'Free Mod': 0, 
              'Install Theme': 0, 
              'Third-party': 0,
              'Play Now': 0,
              'Free HITMAN™ Holiday Pack': 0, 
              'Play the Demo': 0,
              'Starting at $499.00': 499, 
              'Starting at $449.00': 449, 
              'Free to Try': 0,
              'Free Movie': 0, 
              'Free to Use': 0}

#Imputacion de precios
steam_games['price'] = steam_games['price'].replace(input_price)

### Imputacion de data

In [18]:
steam_games

Unnamed: 0,publisher,genres,app_name,title,url,tags,reviews_url,specs,price,early_access,id,developer,release_year,release_month
88310,Kotoshiro,"Strategy, Action, Indie, Casual, Simulation",Lost summoner kitty,Lost summoner kitty,http://store.steampowered.com/app/761140/Lost_...,"Strategy, Action, Indie, Casual, Simulation",http://steamcommunity.com/app/761140/reviews/?...,Single-player,4.99,0.0,761140.0,Kotoshiro,2018,1
88311,"Making fun, inc.","Free to Play, Strategy, Indie, RPG, Card Game,...",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,"Free to Play, Strategy, Indie, RPG, Card Game,...",http://steamcommunity.com/app/643980/reviews/?...,"Single-player, Multi-player, Online Multi-Play...",0.00,0.0,643980.0,Secret level srl,2018,1
88312,Poolians.com,"Free to Play, Simulation, Sports, Casual, Indi...",Real pool 3d - poolians,Real pool 3d - poolians,http://store.steampowered.com/app/670290/Real_...,"Free to Play, Simulation, Sports, Casual, Indi...",http://steamcommunity.com/app/670290/reviews/?...,"Single-player, Multi-player, Online Multi-Play...",0.00,0.0,670290.0,Poolians.com,2017,7
88313,彼岸领域,"Action, Adventure, Casual",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,"Action, Adventure, Casual",http://steamcommunity.com/app/767400/reviews/?...,Single-player,0.99,0.0,767400.0,彼岸领域,2017,12
88314,Otros,"Action, Indie, Casual, Sports",Log challenge,Log challenge,http://store.steampowered.com/app/773570/Log_C...,"Action, Indie, Casual, Sports",http://steamcommunity.com/app/773570/reviews/?...,"Single-player, Full controller support, HTC Vi...",2.99,0.0,773570.0,Otros,2017,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120440,Ghost_rus games,"Strategy, Indie, Casual, Simulation",Colony on mars,Colony on mars,http://store.steampowered.com/app/773640/Colon...,"Strategy, Indie, Casual, Simulation",http://steamcommunity.com/app/773640/reviews/?...,"Single-player, Steam Achievements",1.99,0.0,773640.0,"Nikita ""ghost_rus""",2018,1
120441,Sacada,"Strategy, Indie, Casual",Logistical: south africa,Logistical: south africa,http://store.steampowered.com/app/733530/LOGis...,"Strategy, Indie, Casual",http://steamcommunity.com/app/733530/reviews/?...,"Single-player, Steam Achievements, Steam Cloud...",4.99,0.0,733530.0,Sacada,2018,1
120442,Laush studio,"Indie, Simulation, Racing",Russian roads,Russian roads,http://store.steampowered.com/app/610660/Russi...,"Indie, Simulation, Racing",http://steamcommunity.com/app/610660/reviews/?...,"Single-player, Steam Achievements, Steam Tradi...",1.99,0.0,610660.0,Laush dmitriy sergeevich,2018,1
120443,Sixnails,"Indie, Casual, Puzzle, Singleplayer, Atmospher...",Exit 2 - directions,Exit 2 - directions,http://store.steampowered.com/app/658870/EXIT_...,"Indie, Casual, Puzzle, Singleplayer, Atmospher...",http://steamcommunity.com/app/658870/reviews/?...,"Single-player, Steam Achievements, Steam Cloud",4.99,0.0,658870.0,"Xropi,stev3ns",2017,9


In [68]:
steam_games.genres

88310           Strategy, Action, Indie, Casual, Simulation
88311     Free to Play, Strategy, Indie, RPG, Card Game,...
88312     Free to Play, Simulation, Sports, Casual, Indi...
88313                             Action, Adventure, Casual
88314                         Action, Indie, Casual, Sports
                                ...                        
120440                  Strategy, Indie, Casual, Simulation
120441                              Strategy, Indie, Casual
120442                            Indie, Simulation, Racing
120443    Indie, Casual, Puzzle, Singleplayer, Atmospher...
120444    Early Access, Adventure, Indie, Action, Simula...
Name: genres, Length: 32131, dtype: object