# Analisis Exploratorio de datos

In [25]:
import pandas as pd

import os

## Steam Games Limpieza

- Eliminacion de columnas nulas.
- Tratamiento de filas duplicadas.

In [26]:
path_data =  os.path.join('..','data','raw','steam_games.json.gz')

steam_games = pd.read_json(path_data, compression='gzip',lines=True)
 

In [27]:
# Eliminacion de filas que todos sus datos son nulos.
steam_games.dropna(how='all',inplace=True)

In [28]:
# Datos duplicados
steam_games[steam_games['id'].duplicated(keep=False)]


Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
88384,,,,,http://store.steampowered.com/,,,,,19.99,0.0,,
102204,Bethesda Softworks,[Action],Wolfenstein II: The New Colossus,Wolfenstein II: The New Colossus,http://store.steampowered.com/app/612880/,2017-10-26,"[Action, FPS, Gore, Violent, Alternate History...",http://steamcommunity.com/app/612880/reviews/?...,"[Single-player, Steam Achievements, Full contr...",59.99,0.0,612880.0,Machine Games
102883,Bethesda Softworks,[Action],Wolfenstein II: The New Colossus,Wolfenstein II: The New Colossus,http://store.steampowered.com/app/612880/Wolfe...,2017-10-26,"[Action, FPS, Gore, Violent, Alternate History...",http://steamcommunity.com/app/612880/reviews/?...,"[Single-player, Steam Achievements, Full contr...",59.99,0.0,612880.0,Machine Games
119271,"Warner Bros. Interactive Entertainment, Feral ...","[Action, Adventure]",Batman: Arkham City - Game of the Year Edition,Batman: Arkham City - Game of the Year Edition,http://store.steampowered.com/app/200260,2012-09-07,"[Action, Open World, Batman, Adventure, Stealt...",,"[Single-player, Steam Achievements, Steam Trad...",19.99,0.0,,"Rocksteady Studios,Feral Interactive (Mac)"


In [29]:
# Eliminación de datos duplicados

index_duplicados_drop = [88384,102204,119271]
steam_games.drop(index=index_duplicados_drop, axis=1, inplace=True)

In [30]:
# Transforma una lista de elementos en un texto separado por coma
def swap_to_text(x):
  """Convierte una lista de strings en un solo texto separado por comas y espacios."""
  if x is not None:
      return ', '.join(x)
  return x


steam_games['genres'] = steam_games['genres'].apply(swap_to_text)
steam_games['tags'] = steam_games['tags'].apply(swap_to_text)
steam_games['specs'] = steam_games['specs'].apply(swap_to_text)

### Tratamiento de datos nulos

In [31]:
pd.DataFrame(steam_games.isna().sum(),columns=['Nulls'])

Unnamed: 0,Nulls
publisher,8051
genres,3282
app_name,1
title,2049
url,0
release_date,2066
tags,162
reviews_url,0
specs,669
price,1377


#### Analisis puntual de nulos

In [32]:
# Eliminaremos las filas con app_name  nulo 
indices_a_borrar = steam_games[steam_games.app_name.isna()].index
steam_games.drop(index=indices_a_borrar, inplace=True)

# Rellenamos los titulos vacios con la columna app_name
steam_games['title'] = steam_games['app_name'].fillna(steam_games['title'])

# Completar los generos vacios con las etiquetas de los Tags
steam_games['genres'] = steam_games['tags'].fillna(steam_games['genres'])

# Si el dato en publisher se encuentrar nulo, entonces vamos a asumir que fue el desarrollador el publicador
steam_games['publisher'].fillna(steam_games['developer'],inplace=True)

# Si el dato en desarrollador se encuentrar nulo, entonces vamos a asumir que fue el publicador
steam_games['developer'].fillna(steam_games['publisher'],inplace=True)

# Si no hay datos del desarrollador y el publicador, entonces precederemos llenar 'otros'
steam_games['publisher'].fillna('Otros', inplace=True)
steam_games['developer'].fillna('Otros', inplace=True)


#### Capitalización de texto

In [33]:
# Capitalizacion de texto
columns_capitalize = ['developer', 'publisher', 'app_name','title']

for column in columns_capitalize:
  steam_games[column] = steam_games[column].str.capitalize()
  
# Eliminacion de espacios en blanco
steam_games[columns_capitalize] = steam_games[columns_capitalize].apply(lambda x: x.str.strip())

#### Tratamiento de fechas

In [34]:
# Cambiamos a formato fecha columna release_date y pusumos nulos los que no tenian un formato adecuado para poder establecer fecha
steam_games['release_date'] = pd.to_datetime(steam_games['release_date'], format='%Y-%m-%d', errors='coerce')


In [35]:
# Interpolamos fechas nulas usando interpolate. Este método lo he utilzado porque evita que cambie la distrbución de mis datos 
steam_games['release_date'] = steam_games['release_date'].interpolate()
steam_games['release_date'] = pd.to_datetime(steam_games['release_date'], format='%Y-%m-%d', errors='coerce')


In [36]:
# Crearemos las nuevas columnas 'release_year' y 'release_month'
steam_games['release_year'] = steam_games['release_date'].dt.year
steam_games['release_month'] = steam_games['release_date'].dt.month
steam_games.drop(columns='release_date',inplace=True)

In [37]:
steam_games.isna().sum()

publisher           0
genres            138
app_name            0
title               0
url                 0
tags              162
reviews_url         0
specs             669
price            1376
early_access        0
id                  0
developer           0
release_year        0
release_month       0
dtype: int64

### Analísis de columna Price

In [38]:
steam_games['price'].str.contains(r'\D+', case=False, na=False)

88310     False
88311      True
88312      True
88313     False
88314     False
          ...  
120440    False
120441    False
120442    False
120443    False
120444    False
Name: price, Length: 32131, dtype: bool

In [39]:
def filtro_reg_exp(reg_exp, df):
  "Filtra una columan por una expresion regular reg_exp "
  
  mask = df.str.contains(reg_exp, case=False, na=False)
  return df[mask]

In [40]:
# Encontremos los valores de precio que son de tipo str
filtro_reg_exp(r'\D+' ,steam_games['price']).unique()

array(['Free To Play', 'Free to Play', 'Free', 'Free Demo',
       'Play for Free!', 'Install Now', 'Play WARMACHINE: Tactics Demo',
       'Free Mod', 'Install Theme', 'Third-party', 'Play Now',
       'Free HITMAN™ Holiday Pack', 'Play the Demo',
       'Starting at $499.00', 'Starting at $449.00', 'Free to Try',
       'Free Movie', 'Free to Use'], dtype=object)

In [41]:
input_price = {'Free To Play': 0, 
              'Free to Play': 0, 
              'Free': 0, 
              'Free Demo': 0,
              'Play for Free!': 0,
              'Install Now': 0, 
              'Play WARMACHINE: Tactics Demo': 0,
              'Free Mod': 0, 
              'Install Theme': 0, 
              'Third-party': 0,
              'Play Now': 0,
              'Free HITMAN™ Holiday Pack': 0, 
              'Play the Demo': 0,
              'Starting at $499.00': 499, 
              'Starting at $449.00': 449, 
              'Free to Try': 0,
              'Free Movie': 0, 
              'Free to Use': 0}

#Imputacion de precios
steam_games['price'] = steam_games['price'].replace(input_price)

### Imputacion de data

In [43]:
steam_games.isna().sum()

publisher           0
genres            138
app_name            0
title               0
url                 0
tags              162
reviews_url         0
specs             669
price            1376
early_access        0
id                  0
developer           0
release_year        0
release_month       0
dtype: int64

In [45]:
steam_games[steam_games['id']== 10]

Unnamed: 0,publisher,genres,app_name,title,url,tags,reviews_url,specs,price,early_access,id,developer,release_year,release_month
120416,Valve,"Action, FPS, Multiplayer, Shooter, Classic, Te...",Counter-strike,Counter-strike,http://store.steampowered.com/app/10/CounterSt...,"Action, FPS, Multiplayer, Shooter, Classic, Te...",http://steamcommunity.com/app/10/reviews/?brow...,"Multi-player, Valve Anti-Cheat enabled",9.99,0.0,10.0,Valve,2000,11
