**Importación de archivos**

In [1]:
import ast
import gzip
import json
import os

import pandas as pd
from textblob import TextBlob

In [2]:
# Load the user reviews file, one record per line, into df_user_reviews.

# Parsed records: one dict per successfully parsed line
data_list = []

# Path to the line-delimited reviews file
file_path = 'data/australian_user_reviews.json'

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        try:
            # Parsed with ast.literal_eval rather than json.loads
            # (presumably the lines are Python dict literals, not strict JSON -- TODO confirm)
            json_data = ast.literal_eval(line)
        except (ValueError, SyntaxError):
            # literal_eval raises SyntaxError for blank/malformed text and
            # ValueError for non-literal content; report the line and keep going
            print(f"Error en la línea: {line}")
            continue
        data_list.append(json_data)

# Build the DataFrame from the list of parsed dicts
df_user_reviews = pd.DataFrame(data_list)

In [2]:
# Load the gzip-compressed users/items file, one record per line, into df_users_items.

# Path to the gzip-compressed, line-delimited file
gz_file_path = 'data/australian_users_items.json.gz'

# Parsed records: one dict per successfully parsed line
data_list = []

# Text mode with an explicit encoding: without it gzip.open falls back to the
# platform default, which is not UTF-8 on every system
with gzip.open(gz_file_path, 'rt', encoding='utf-8') as file:
    for line in file:
        try:
            # Parsed with ast.literal_eval rather than json.loads
            # (presumably the lines are Python dict literals, not strict JSON -- TODO confirm)
            json_data = ast.literal_eval(line)
        except (ValueError, SyntaxError):
            # literal_eval raises SyntaxError for blank/malformed text and
            # ValueError for non-literal content; report the line and keep going
            print(f"Error en la línea: {line}")
            continue
        data_list.append(json_data)

# Build the DataFrame from the list of parsed dicts
df_users_items = pd.DataFrame(data_list)

In [3]:
# Cap the users/items table at its first 2,000,000 rows
df_users_items = df_users_items.head(2_000_000)

In [4]:
# Load the Steam games file into df_steam_games.
# Unlike the other two files, this one is parsed with json.loads.

# Forward slash keeps the path portable and avoids the invalid "\o" escape
# that the backslash form produced
with open('data/output_steam_games.json', 'r', encoding='utf-8') as archivo:
    # One JSON object per line; blank lines are skipped because json.loads rejects them
    data = [json.loads(line) for line in archivo if line.strip()]

# Convert the list of JSON objects into a DataFrame
df_steam_games = pd.DataFrame(data)

**Corrección de DataFrames**

In [4]:
# Report the shape (rows, columns) of each loaded DataFrame
shapes_by_name = {
    'df_user_reviews': df_user_reviews.shape,
    'df_users_items': df_users_items.shape,
    'df_steam_games': df_steam_games.shape,
}
# Each entry is "<name>\n<shape>", entries are separated by a blank line,
# and the whole report is wrapped in a leading and trailing newline
shape_report = '\n\n'.join(f'{name}\n{shape}' for name, shape in shapes_by_name.items())
print(f'\n{shape_report}\n')

NameError: name 'df_user_reviews' is not defined

In [6]:
# Keep only the Steam games rows that have a value in 'genres'
has_genres = df_steam_games['genres'].notna()
df_steam_games_clean = df_steam_games.loc[has_genres]

In [7]:
# Preview the first five rows of the cleaned games table
df_steam_games_clean.head(n=5)

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
88310,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro
88311,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980,Secret Level SRL
88312,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,False,670290,Poolians.com
88313,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"[Action, Adventure, Casual]",http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,False,767400,彼岸领域
88315,Trickjump Games Ltd,"[Action, Adventure, Simulation]",Battle Royale Trainer,Battle Royale Trainer,http://store.steampowered.com/app/772540/Battl...,2018-01-04,"[Action, Adventure, Simulation, FPS, Shooter, ...",http://steamcommunity.com/app/772540/reviews/?...,"[Single-player, Steam Achievements]",3.99,False,772540,Trickjump Games Ltd


In [8]:
# Check the number of rows and columns of the cleaned games DataFrame
df_steam_games_clean.shape

(28852, 13)

In [9]:
# Un-nest the user reviews: every entry in a user's 'reviews' list becomes
# one flat record tagged with that user's 'user_id'.
reviews_records = [
    {
        'user_id': row['user_id'],
        'item_id': review['item_id'],
        'posted': review['posted'],
        'recommend': review['recommend'],
        'review': review['review'],
    }
    for _, row in df_user_reviews.iterrows()
    for review in row['reviews']
]

# One row per individual review
df_reviews_info = pd.DataFrame(reviews_records)

In [10]:
# Preview the first five un-nested review rows
df_reviews_info.head(n=5)

Unnamed: 0,user_id,item_id,posted,recommend,review
0,76561197970982479,1250,"Posted November 5, 2011.",True,Simple yet with great replayability. In my opi...
1,76561197970982479,22200,"Posted July 15, 2011.",True,It's unique and worth a playthrough.
2,76561197970982479,43110,"Posted April 21, 2011.",True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,251610,"Posted June 24, 2014.",True,I know what you think when you see this title ...
4,js41637,227300,"Posted September 8, 2013.",True,For a simple (it's actually not all that simpl...


In [11]:
def analizar_sentimiento(texto):
    """Classify the sentiment of a review text using TextBlob polarity.

    Args:
        texto: Review text. Values that are not strings (e.g. None or NaN
            coming from a missing review) and blank strings are treated as
            neutral instead of being passed to TextBlob.

    Returns:
        int: 0 when the polarity is negative, 1 when it is exactly zero or
        there is no text to analyse, 2 when it is positive.
    """
    # TextBlob only accepts strings; a missing or blank review carries no
    # sentiment, so report neutral rather than raising
    if not isinstance(texto, str) or not texto.strip():
        return 1

    polarity = TextBlob(texto).sentiment.polarity
    if polarity < 0:
        return 0  # negative sentiment
    if polarity == 0:
        return 1  # neutral sentiment
    return 2  # positive sentiment

In [12]:
# Score every review text and store the 0/1/2 label in a new column
df_reviews_info['sentiment_analysis'] = (
    df_reviews_info['review'].apply(analizar_sentimiento)
)

In [13]:
# Inspect the last five rows, now including the sentiment label
df_reviews_info.tail(n=5)

Unnamed: 0,user_id,item_id,posted,recommend,review,sentiment_analysis
59300,76561198312638244,70,Posted July 10.,True,a must have classic from steam definitely wort...,2
59301,76561198312638244,362890,Posted July 8.,True,this game is a perfect remake of the original ...,2
59302,LydiaMorley,273110,Posted July 3.,True,had so much fun plaing this and collecting res...,2
59303,LydiaMorley,730,Posted July 20.,True,:D,2
59304,LydiaMorley,440,Posted July 2.,True,so much fun :D,2


In [5]:
# Un-nest the users/items table: every entry in a user's 'items' list becomes
# one flat record tagged with that user's 'user_id'.
items_records = [
    {
        'user_id': row['user_id'],
        'item_id': item['item_id'],
        'item_name': item['item_name'],
        'playtime_forever': item['playtime_forever'],
        'playtime_2weeks': item['playtime_2weeks'],
    }
    for _, row in df_users_items.iterrows()
    for item in row['items']
]

# One row per (user, item) pair
df_items_info = pd.DataFrame(items_records)

In [6]:
# Preview the first five un-nested item rows
df_items_info.head(n=5)

Unnamed: 0,user_id,item_id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,10,Counter-Strike,6,0
1,76561197970982479,20,Team Fortress Classic,0,0
2,76561197970982479,30,Day of Defeat,7,0
3,76561197970982479,40,Deathmatch Classic,0,0
4,76561197970982479,50,Half-Life: Opposing Force,0,0


**Exportación de archivos**

In [11]:
# Reduce data volume: keep only the first 1,000,000 item rows for export.
# Truncating df_reviews_info and df_steam_games_clean to 20,000 rows was
# tried as well and is currently disabled.
df_items_info = df_items_info.head(1_000_000)

In [18]:
# Export the un-nested, cleaned DataFrames so the ETL and EDA steps can start from them.

# to_csv does not create missing directories, so make sure the output folder exists
os.makedirs('data/output', exist_ok=True)
df_steam_games_clean.to_csv('data/output/steam_games.csv', index=False)

In [19]:
# Export the un-nested reviews (with sentiment labels) to CSV.
# to_csv does not create missing directories, so make sure the output folder exists
os.makedirs('data/output', exist_ok=True)
df_reviews_info.to_csv('data/output/reviews.csv', index=False)

In [12]:
# Export the un-nested (and truncated) items table to CSV.
# to_csv does not create missing directories, so make sure the output folder exists
os.makedirs('data/output', exist_ok=True)
df_items_info.to_csv('data/output/items.csv', index=False)