In [18]:
import json
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

pd.set_option('display.max_columns', None)


def is_nan(value):
    """ Comprueba si un valor es NaN. """
    try:
        return value != value
    except TypeError:
        return False

# Ruta al archivo JSON original
input_file_path = '../data/raw/steam_games.json'

# Ruta para el nuevo archivo JSON limpio
output_file_path = '../data/interim/cleaned_steam_games.json'

with open(input_file_path, 'r', encoding='utf-8') as input_file, \
     open(output_file_path, 'w', encoding='utf-8') as output_file:

    for line in input_file:
        record = json.loads(line)
        if not all(is_nan(value) for value in record.values()):
            json.dump(record, output_file)
            output_file.write('\n')

print("Archivo steam_games limpiado y guardado.")



Archivo steam_games limpiado y guardado.


In [20]:
# Cargar el archivo JSON en un DataFrame
file_path = '../data/interim/cleaned_steam_games.json'
data = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        try:
            record = json.loads(line)
            data.append(record)
        except json.JSONDecodeError:
            continue
df = pd.DataFrame(data)

# Manejar valores nulos en 'genres'
df['genres'] = df['genres'].apply(lambda x: x if isinstance(x, list) else [])

# One-Hot Encoding para la columna 'genres'
mlb_genres = MultiLabelBinarizer()
genres_encoded = mlb_genres.fit_transform(df['genres'])
genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_)
genres_df = genres_df.add_suffix('_genre')  # Agregar sufijo a las columnas de género

# Unir el DataFrame codificados con el original y eliminamos la columna genres
df_encoded = df.join(genres_df)
df_encoded = df_encoded.drop('genres', axis=1)

# Convertir la columna de fecha de lanzamiento a tipo de datos de fecha y extraer el año
df_encoded['release_date'] = pd.to_datetime(df_encoded['release_date'], errors='coerce')
df_encoded['release_year'] = df_encoded['release_date'].dt.year

# Mostrar las primeras filas del DataFrame resultante
df_encoded


Unnamed: 0,publisher,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer,Accounting_genre,Action_genre,Adventure_genre,Animation &amp; Modeling_genre,Audio Production_genre,Casual_genre,Design &amp; Illustration_genre,Early Access_genre,Education_genre,Free to Play_genre,Indie_genre,Massively Multiplayer_genre,Photo Editing_genre,RPG_genre,Racing_genre,Simulation_genre,Software Training_genre,Sports_genre,Strategy_genre,Utilities_genre,Video Production_genre,Web Publishing_genre,release_year
0,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,2018.0
1,"Making Fun, Inc.",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980,Secret Level SRL,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,2018.0
2,Poolians.com,Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,False,670290,Poolians.com,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,0,2017.0
3,彼岸领域,弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"[Action, Adventure, Casual]",http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,False,767400,彼岸领域,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2017.0
4,,Log Challenge,,http://store.steampowered.com/app/773570/Log_C...,NaT,"[Action, Indie, Casual, Sports]",http://steamcommunity.com/app/773570/reviews/?...,"[Single-player, Full controller support, HTC V...",2.99,False,773570,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32130,Ghost_RUS Games,Colony On Mars,Colony On Mars,http://store.steampowered.com/app/773640/Colon...,2018-01-04,"[Strategy, Indie, Casual, Simulation]",http://steamcommunity.com/app/773640/reviews/?...,"[Single-player, Steam Achievements]",1.99,False,773640,"Nikita ""Ghost_RUS""",0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,2018.0
32131,Sacada,LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/733530/LOGis...,2018-01-04,"[Strategy, Indie, Casual]",http://steamcommunity.com/app/733530/reviews/?...,"[Single-player, Steam Achievements, Steam Clou...",4.99,False,733530,Sacada,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,2018.0
32132,Laush Studio,Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,False,610660,Laush Dmitriy Sergeevich,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,2018.0
32133,SIXNAILS,EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,False,658870,"xropi,stev3ns",0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2017.0


User_reviews

In [None]:
def correct_json_format(line):
    # Intenta corregir claves sin comillas dobles
    corrected_line = re.sub(r'(?<!")(\w+)(?=:)', r'"\1"', line)
    return corrected_line

# Ruta del archivo JSON original y del nuevo archivo corregido
input_file_path = '../data/interim/sample_user_reviews.json'
output_file_path = '../data/interim/cleaned_user_reviews.json'

# Leer el archivo original y escribir el archivo corregido
with open(input_file_path, 'r', encoding='utf-8') as input_file, \
     open(output_file_path, 'w', encoding='utf-8') as output_file:
    for line in input_file:
        corrected_line = correct_json_format(line.strip())
        try:
            # Intenta decodificar la línea corregida para verificar su validez
            json.loads(corrected_line)
            output_file.write(corrected_line + '\n')
        except json.JSONDecodeError:
            # Si la línea corregida no es un JSON válido, escribe la línea original
            output_file.write(line)

print("Archivo JSON corregido y guardado.")

In [22]:
# Ruta al archivo JSON original
input_file_path = '../data/raw/user_reviews.json'

# Ruta para el nuevo archivo JSON limpio
output_file_path = '../data/interim/cleaned_user_reviews.json'

total_records_before = 0
nan_records_before = 0
total_records_after = 0

# Contar y limpiar los registros
with open(input_file_path, 'r', encoding='utf-8') as input_file, \
     open(output_file_path, 'w', encoding='utf-8') as output_file:

    for line in input_file:
        total_records_before += 1
        record = json.loads(line)
        if all(is_nan(value) for value in record.values()):
            nan_records_before += 1
        else:
            json.dump(record, output_file)
            output_file.write('\n')
            total_records_after += 1

print("Archivo user_reviews limpiado y guardado.")
print(f"Total registros antes de la limpieza: {total_records_before}")
print(f"Registros con todos los campos en NaN antes de la limpieza: {nan_records_before}")
print(f"Total registros después de la limpieza: {total_records_after}")


JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)