In [3]:
import pandas as pd 
import os, gzip 
import re
from funciones_aux.desempaquetado  import *


**Desanidamos los archivos user_reviews y users_items. Para solucionar esto usamos el modulo desempaquedo**

# ETL: Users_items

## Extracción de users_items

In [61]:
path_data_json =  os.path.join('..','data','raw','users_items.json.gz')
path_data_csv =  os.path.join('..','data','clear','users_items.csv.gz')

# user_items = desempaquetado_users_items(path_data_json, path_data_csv)

user_items = pd.read_csv(path_data_csv, compression='gzip')

## Transformación de users_items

### Eliminación de datos nulos
- Dado que no existe información suficiente para inputar datos en users_items procederemos a eliminar estas filas nulas

In [63]:
user_items.dropna(inplace=True)

### Capitalizacion

In [68]:
user_items['item_name'] = user_items['item_name'].str.capitalize() 

## Carga de data de users_items

In [64]:
path_data_csv =  os.path.join('..','data','clear','users_items.csv.gz')

user_items.to_csv(path_data_csv,index = False)


**Observación**

Inspeccionando users_items, con la funcón **desempaquetado_users_items** nos quedamos únicamente con las columnas que eran necesarias, la columna item_name la eliminamos ya que se puede obtener de steam games por medio de item_id.

# ETL User_Reviews

## Extraccion de user_reviews

In [59]:
path_data_json =  os.path.join('..','data','raw','user_reviews.json.gz')

reviews = desempaquetado_reviews(path_data_json)


## Tranformacion de user_reviews

### Eliminación de datos nulos
- Existen 28 usuarios que no tienen información, por lo que vamos a proceder a eliminar

In [4]:
reviews[reviews.isnull().any(axis=1)]

Unnamed: 0,user_id,funny,posted,last_edited,item_id,helpful,recommend,review
137,gdxsd,,,,,,,
177,76561198094224872,,,,,,,
2559,76561198021575394,,,,,,,
10080,cmuir37,,,,,,,
13767,Jaysteeny,,,,,,,
15493,ML8989,,,,,,,
19184,76561198079215291,,,,,,,
20223,76561198079342142,,,,,,,
25056,76561198061996985,,,,,,,
26257,76561198108286351,,,,,,,


In [5]:
reviews.dropna(inplace=True)

### Imputación de columna funny
- Esta columna contiene informacón de si a alguien le pareció divertido el comentario, vamos a extraer el número de personas a las que el comentario les pareció divertido. 

In [6]:

def num_is_funny(row):
  """ 
  Args:
      row (str): texto que quiero extraer el número.

  Returns:
      int : retorna el número de personas que le pareció divertido
  """
  
  match = re.search(r'(\d+) people found', row)
  if match :
    return int(match.group(1))
  return (0)

reviews['funny_count'] = reviews['funny'].apply(num_is_funny)

### Imputación columna helpful

In [7]:
pd.DataFrame(reviews['helpful'].unique(), columns=['Helpfull Unicos']).sample(10)

Unnamed: 0,Helpfull Unicos
153,83 of 106 people (78%) found this review helpful
555,35 of 102 people (34%) found this review helpful
613,145 of 207 people (70%) found this review helpful
1201,8 of 22 people (36%) found this review helpful
303,8 of 17 people (47%) found this review helpful
741,155 of 179 people (87%) found this review helpful
490,41 of 41 people (100%) found this review helpful
616,493 of 618 people (80%) found this review helpful
1025,129 of 160 people (81%) found this review helpful
517,1 of 29 people (3%) found this review helpful


In [8]:
reviews[reviews['helpful'] == '17 of 73 people (23%) found this review helpful']

Unnamed: 0,user_id,funny,posted,last_edited,item_id,helpful,recommend,review,funny_count
17797,OneEyedWolf,4 people found this review funny,"Posted January 10, 2015.",,340460,17 of 73 people (23%) found this review helpful,True,"Well.. this game is better than doing nothing,...",4


In [9]:
## De las personas que comentaron, calculamos el porcentaje de personas que si ayudó.add()
## Para no desbalancear la data, las personas que no tenian información las imputaremos con 
## 50 % es decir la misma probabilidad que le sirva o no el comentario.

reviews['percentage_helpful']  = reviews['helpful'].str.extract(r'\((\d+)%\)').astype(float)/100
reviews['percentage_helpful'] = reviews['percentage_helpful'].fillna(0.5)

### Imputación columna recommend
- Transformaremos los datos de recomendaciones en 0 == no recomendado  y 1 == recomendado
- Tambien les daremos el tipo de categorico a la columna

In [10]:
reviews['recommend'] = reviews['recommend'].astype('int').astype(('category'))

### Imputación columna posted
- Vamos a extraer el año que se posteó y el año de la última actualizacionó

In [11]:
reviews['posted'].unique()

reviews['posted'] = reviews['posted'].str.replace('.', '')

reviews['year_posted'] = reviews['posted'].str.extract(r'(\d{4})')
reviews['last_edited'] = reviews['posted'].str.extract(r'(\d{4})')

reviews.rename(columns={'last_edited': 'year_last_edited'},inplace=True)


In [12]:
pd.DataFrame(reviews.isna().sum(),columns=['Nulos'])

Unnamed: 0,Nulos
user_id,0
funny,0
posted,0
year_last_edited,10119
item_id,0
helpful,0
recommend,0
review,0
funny_count,0
percentage_helpful,0


#### Nota:
- Podemos observar que existen 10119 comentarios que no tenian la fecha del año, lamentablemente no tenemos información suficiente para inputar estos registros por lo que vamos proceder a eliminar.

- Notemos que vamos a eliminar el 10% de la data. Este problema se podria solución conocimento un poco más el conjunto de datos, pero por ahora por practicidad y eficiencia en la consulta, procedere a eliminarlas.

In [13]:
reviews.dropna(subset=['posted','year_last_edited'], inplace=True)

In [14]:
columnas = ['item_id', 'year_last_edited','year_posted', 'recommend', 'funny_count', 'percentage_helpful', 'review']

reviews_clear = reviews[columnas]

## Carga de data de user_reviews

In [15]:
path_data_csv =  os.path.join('..','data','clear','user_reviews.csv.gz')
reviews_clear.to_csv(path_data_csv)

# ETL : Steam_games

## Extración de Steam_games

In [40]:
path_data =  os.path.join('..','data','raw','steam_games.json.gz')

steam_games = pd.read_json(path_data, compression='gzip',lines=True)
 

## Transformación de Steam_games

In [41]:
# Eliminacion de filas que todos sus datos son nulos.
steam_games.dropna(how='all',inplace=True)

# Datos duplicados
steam_games[steam_games['id'].duplicated(keep=False)]

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
88384,,,,,http://store.steampowered.com/,,,,,19.99,0.0,,
102204,Bethesda Softworks,[Action],Wolfenstein II: The New Colossus,Wolfenstein II: The New Colossus,http://store.steampowered.com/app/612880/,2017-10-26,"[Action, FPS, Gore, Violent, Alternate History...",http://steamcommunity.com/app/612880/reviews/?...,"[Single-player, Steam Achievements, Full contr...",59.99,0.0,612880.0,Machine Games
102883,Bethesda Softworks,[Action],Wolfenstein II: The New Colossus,Wolfenstein II: The New Colossus,http://store.steampowered.com/app/612880/Wolfe...,2017-10-26,"[Action, FPS, Gore, Violent, Alternate History...",http://steamcommunity.com/app/612880/reviews/?...,"[Single-player, Steam Achievements, Full contr...",59.99,0.0,612880.0,Machine Games
119271,"Warner Bros. Interactive Entertainment, Feral ...","[Action, Adventure]",Batman: Arkham City - Game of the Year Edition,Batman: Arkham City - Game of the Year Edition,http://store.steampowered.com/app/200260,2012-09-07,"[Action, Open World, Batman, Adventure, Stealt...",,"[Single-player, Steam Achievements, Steam Trad...",19.99,0.0,,"Rocksteady Studios,Feral Interactive (Mac)"


**Nota** inspeccionando los datos, vamos a eliminar las siguietes filas.
- 88384 :	en esta fila no encontramos niguna información

- 102204 y 119271 se encuentran repetida 

In [42]:
# Eliminación de datos duplicados

index_duplicados_drop = [88384,102204,119271]
steam_games.drop(index=index_duplicados_drop, axis=1, inplace=True)

In [43]:
# Transforma una lista de elementos en un texto separado por coma
def swap_to_text(x : str):
  """Convierte una lista de strings en un solo texto separado por comas y espacios."""
  if x is not None:
      return ', '.join(x)
  return x


steam_games['genres'] = steam_games['genres'].apply(swap_to_text)
steam_games['tags'] = steam_games['tags'].apply(swap_to_text)
steam_games['specs'] = steam_games['specs'].apply(swap_to_text)

### Tratamiento de datos nulos

In [44]:
pd.DataFrame(steam_games.isna().sum(),columns=['Nulls'])

Unnamed: 0,Nulls
publisher,8051
genres,3282
app_name,1
title,2049
url,0
release_date,2066
tags,162
reviews_url,0
specs,669
price,1377


#### Analisis puntual de nulos

In [57]:
# Eliminaremos las filas con app_name  nulo 
indices_a_borrar = steam_games[steam_games.app_name.isna()].index
steam_games.drop(index=indices_a_borrar, inplace=True)

# Rellenamos los titulos vacios con la columna app_name
steam_games['title'] = steam_games['app_name'].fillna(steam_games['title'])

# Completar los generos vacios con las etiquetas de los Tags
steam_games['genres'] = steam_games['tags'].fillna(steam_games['genres'])

# Si el dato en publisher se encuentrar nulo, entonces vamos a asumir que fue el desarrollador el publicador
steam_games['publisher'].fillna(steam_games['developer'],inplace=True)
steam_games['publisher'].replace('None', steam_games['publisher'].iloc[0],inplace=True)


# Si el dato en desarrollador se encuentrar nulo, entonces vamos a asumir que fue el publicador
steam_games['developer'].fillna(steam_games['publisher'],inplace=True)

# Si no hay datos del desarrollador y el publicador, entonces precederemos llenar 'otros'
steam_games['publisher'].fillna('Otros', inplace=True)
steam_games['developer'].fillna('Otros', inplace=True)

**Nota:** La columan publisher contenia como parte de texto la entrada None, es por esto que remplazaremos este valor por genres

In [46]:
steam_games[steam_games['publisher'] == 'None']

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
91623,,"Indie, Platformer, Shoot 'Em Up, Retro, Action...",Glorkian Warrior: The Trials Of Glork,Glorkian Warrior: The Trials Of Glork,http://store.steampowered.com/app/341120/Glork...,2015-03-24,"Indie, Platformer, Shoot 'Em Up, Retro, Action...",http://steamcommunity.com/app/341120/reviews/?...,"Single-player, Steam Achievements, Full contro...",3.99,0.0,341120.0,Pixeljam
93368,,"Early Access, Massively Multiplayer, Indie, Ac...",Divergence: Online,Divergence: Online,http://store.steampowered.com/app/422940/Diver...,2016-01-06,"Early Access, Massively Multiplayer, Indie, Ac...",http://steamcommunity.com/app/422940/reviews/?...,"Multi-player, MMO, Steam Turn Notifications",19.99,1.0,422940.0,Stained Glass Llama
95281,,"Casual, Puzzle, Sci-fi",Interstellar Logistics Inc,Interstellar Logistics Inc,http://store.steampowered.com/app/502150/Inter...,2016-08-15,"Casual, Puzzle, Sci-fi",http://steamcommunity.com/app/502150/reviews/?...,Single-player,0.99,0.0,502150.0,Exalted Guy Interactive
98116,,"Indie, Adventure, RPG, Sci-fi, Space, Crafting...",SuperCluster: Void,SuperCluster: Void,http://store.steampowered.com/app/610740/Super...,2017-05-15,"Indie, Adventure, RPG, Sci-fi, Space, Crafting...",http://steamcommunity.com/app/610740/reviews/?...,Single-player,4.99,0.0,610740.0,Logan McClure
103125,,"Action, Free to Play, Indie",Scrap,Scrap,http://store.steampowered.com/app/727280/Scrap/,2017-10-14,"Action, Free to Play, Indie",http://steamcommunity.com/app/727280/reviews/?...,"Single-player, Local Multi-Player, Local Co-op...",Free,0.0,727280.0,Top Shelf Studios
106359,,"Action, Indie, Adventure, RPG, Rogue-lite, Pix...",Ruin of the Reckless,Ruin of the Reckless,http://store.steampowered.com/app/516430/Ruin_...,2017-04-26,"Action, Indie, Adventure, RPG, Rogue-lite, Pix...",http://steamcommunity.com/app/516430/reviews/?...,"Single-player, Co-op, Local Co-op, Shared/Spli...",9.99,0.0,516430.0,Faux-Operative Games
109236,,"Action, Indie, Adventure, Platformer",Max Stern,Max Stern,http://store.steampowered.com/app/531240/Max_S...,2016-10-21,"Action, Indie, Adventure, Platformer",http://steamcommunity.com/app/531240/reviews/?...,"Single-player, Steam Achievements, Steam Tradi...",4.99,0.0,531240.0,Lupan Artiom Oleg
115284,,"Strategy, Simulation, Pirates, Sailing, City B...",Pirate's Life,Pirate's Life,http://store.steampowered.com/app/359370/Pirat...,2015-04-17,"Strategy, Simulation, Pirates, Sailing, City B...",http://steamcommunity.com/app/359370/reviews/?...,Single-player,4.99,0.0,359370.0,Team Eyepatch
116737,,"Casual, Indie, Action, Music-Based Procedural ...",Borealis,Borealis,http://store.steampowered.com/app/307170/Borea...,2014-09-02,"Casual, Indie, Action, Music-Based Procedural ...",http://steamcommunity.com/app/307170/reviews/?...,"Single-player, Steam Achievements, Steam Tradi...",4.99,0.0,307170.0,Conrad Nelson


#### Capitalización de texto

In [47]:
# Capitalizacion de texto
columns_capitalize = ['developer', 'publisher', 'app_name','title']

for column in columns_capitalize:
  steam_games[column] = steam_games[column].str.capitalize()
  
# Eliminacion de espacios en blanco
steam_games[columns_capitalize] = steam_games[columns_capitalize].apply(lambda x: x.str.strip())

#### Tratamiento de fechas

In [48]:
# Cambiamos a formato fecha columna release_date y pusumos nulos los que no tenian un formato adecuado para poder establecer fecha

steam_games['release_date'] = pd.to_datetime(steam_games['release_date'], format='%Y-%m-%d', errors='coerce')

In [49]:
# Interpolamos fechas nulas usando interpolate. Este método lo he utilzado porque evita que cambie la distrbución de mis datos 
steam_games['release_date'] = steam_games['release_date'].interpolate()
steam_games['release_date'] = pd.to_datetime(steam_games['release_date'], format='%Y-%m-%d', errors='coerce')

In [50]:
# Crearemos las nuevas columnas 'release_year' y 'release_month'
steam_games['release_year'] = steam_games['release_date'].dt.year
steam_games['release_month'] = steam_games['release_date'].dt.month
steam_games.drop(columns='release_date',inplace=True)

### Analísis de columna Price

In [51]:
def filtro_reg_exp(reg_exp, df):
  "Filtra una columan por una expresion regular reg_exp "
  
  mask = df.str.contains(reg_exp, case=False, na=False)
  return df[mask]

In [52]:
# Encontremos los valores de precio que son de tipo str
filtro_reg_exp(r'\D+' ,steam_games['price']).unique()

array(['Free To Play', 'Free to Play', 'Free', 'Free Demo',
       'Play for Free!', 'Install Now', 'Play WARMACHINE: Tactics Demo',
       'Free Mod', 'Install Theme', 'Third-party', 'Play Now',
       'Free HITMAN™ Holiday Pack', 'Play the Demo',
       'Starting at $499.00', 'Starting at $449.00', 'Free to Try',
       'Free Movie', 'Free to Use'], dtype=object)

In [53]:
## Imputaremoe estos datos a mano, ya que son muy pocos.

input_price = {'Free To Play': 0, 
              'Free to Play': 0, 
              'Free': 0, 
              'Free Demo': 0,
              'Play for Free!': 0,
              'Install Now': 0, 
              'Play WARMACHINE: Tactics Demo': 0,
              'Free Mod': 0, 
              'Install Theme': 0, 
              'Third-party': 0,
              'Play Now': 0,
              'Free HITMAN™ Holiday Pack': 0, 
              'Play the Demo': 0,
              'Starting at $499.00': 499, 
              'Starting at $449.00': 449, 
              'Free to Try': 0,
              'Free Movie': 0, 
              'Free to Use': 0}

#Imputacion de precios
steam_games['price'] = steam_games['price'].replace(input_price)

In [54]:
steam_games.dropna(inplace= True)

In [55]:
steam_games

Unnamed: 0,publisher,genres,app_name,title,url,tags,reviews_url,specs,price,early_access,id,developer,release_year,release_month
88310,Kotoshiro,"Strategy, Action, Indie, Casual, Simulation",Lost summoner kitty,Lost summoner kitty,http://store.steampowered.com/app/761140/Lost_...,"Strategy, Action, Indie, Casual, Simulation",http://steamcommunity.com/app/761140/reviews/?...,Single-player,4.99,0.0,761140.0,Kotoshiro,2018,1
88311,"Making fun, inc.","Free to Play, Strategy, Indie, RPG, Card Game,...",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,"Free to Play, Strategy, Indie, RPG, Card Game,...",http://steamcommunity.com/app/643980/reviews/?...,"Single-player, Multi-player, Online Multi-Play...",0.00,0.0,643980.0,Secret level srl,2018,1
88312,Poolians.com,"Free to Play, Simulation, Sports, Casual, Indi...",Real pool 3d - poolians,Real pool 3d - poolians,http://store.steampowered.com/app/670290/Real_...,"Free to Play, Simulation, Sports, Casual, Indi...",http://steamcommunity.com/app/670290/reviews/?...,"Single-player, Multi-player, Online Multi-Play...",0.00,0.0,670290.0,Poolians.com,2017,7
88313,彼岸领域,"Action, Adventure, Casual",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,"Action, Adventure, Casual",http://steamcommunity.com/app/767400/reviews/?...,Single-player,0.99,0.0,767400.0,彼岸领域,2017,12
88314,Otros,"Action, Indie, Casual, Sports",Log challenge,Log challenge,http://store.steampowered.com/app/773570/Log_C...,"Action, Indie, Casual, Sports",http://steamcommunity.com/app/773570/reviews/?...,"Single-player, Full controller support, HTC Vi...",2.99,0.0,773570.0,Otros,2017,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120440,Ghost_rus games,"Strategy, Indie, Casual, Simulation",Colony on mars,Colony on mars,http://store.steampowered.com/app/773640/Colon...,"Strategy, Indie, Casual, Simulation",http://steamcommunity.com/app/773640/reviews/?...,"Single-player, Steam Achievements",1.99,0.0,773640.0,"Nikita ""ghost_rus""",2018,1
120441,Sacada,"Strategy, Indie, Casual",Logistical: south africa,Logistical: south africa,http://store.steampowered.com/app/733530/LOGis...,"Strategy, Indie, Casual",http://steamcommunity.com/app/733530/reviews/?...,"Single-player, Steam Achievements, Steam Cloud...",4.99,0.0,733530.0,Sacada,2018,1
120442,Laush studio,"Indie, Simulation, Racing",Russian roads,Russian roads,http://store.steampowered.com/app/610660/Russi...,"Indie, Simulation, Racing",http://steamcommunity.com/app/610660/reviews/?...,"Single-player, Steam Achievements, Steam Tradi...",1.99,0.0,610660.0,Laush dmitriy sergeevich,2018,1
120443,Sixnails,"Indie, Casual, Puzzle, Singleplayer, Atmospher...",Exit 2 - directions,Exit 2 - directions,http://store.steampowered.com/app/658870/EXIT_...,"Indie, Casual, Puzzle, Singleplayer, Atmospher...",http://steamcommunity.com/app/658870/reviews/?...,"Single-player, Steam Achievements, Steam Cloud",4.99,0.0,658870.0,"Xropi,stev3ns",2017,9


## Carga de data de Steam_games

In [58]:
path_data_csv =  os.path.join('..','data','clear','steam_games.csv.gz')

steam_games.to_csv(path_data_csv, index=False)