In [2]:
# importamos las librerías
import json
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import re

In [92]:
# Show every column when a DataFrame is rendered (None removes the column limit)
pd.options.display.max_columns = None


def is_nan(value):
    """Tell whether ``value`` is NaN.

    NaN is the value that does not compare equal to itself, so the check is
    a plain self-inequality. If that comparison raises ``TypeError`` the
    value is reported as not NaN.
    """
    try:
        differs_from_itself = value != value
    except TypeError:
        return False
    return differs_from_itself

# Location of the original JSON-lines file
input_file_path = '../data/raw/steam_games.json'

# Destination of the cleaned JSON-lines file
output_file_path = '../data/interim/cleaned_steam_games.json'

with open(input_file_path, 'r', encoding='utf-8') as input_file:
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for raw_line in input_file:
            record = json.loads(raw_line)
            # Records whose fields are all NaN carry no information: skip them
            if all(is_nan(field) for field in record.values()):
                continue
            # One JSON document per output line
            output_file.write(json.dumps(record))
            output_file.write('\n')

print("Archivo steam_games limpiado y guardado.")

Archivo steam_games limpiado y guardado.


In [93]:
# Load the cleaned JSON-lines file into a DataFrame
file_path = '../data/interim/cleaned_steam_games.json'
data = []
with open(file_path, 'r', encoding='utf-8') as file:
    for raw_line in file:
        try:
            parsed = json.loads(raw_line)
        except json.JSONDecodeError:
            # Lines that are not valid JSON are skipped
            continue
        data.append(parsed)
df = pd.DataFrame(data)

# Any 'genres' value that is not a list (e.g. a missing value) becomes an empty list
df['genres'] = df['genres'].apply(
    lambda genres: [] if not isinstance(genres, list) else genres
)

# Parse the release date (unparseable values become NaT) and derive the year
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
df['release_year'] = df['release_date'].dt.year

# One row per (game, genre): explode the genre lists and renumber the rows,
# keeping the remaining game columns on every row
df_games = df.explode('genres').reset_index(drop=True)

# Display the resulting DataFrame
df_games

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer,release_year
0,Kotoshiro,Action,Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,2018.0
1,Kotoshiro,Casual,Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,2018.0
2,Kotoshiro,Indie,Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,2018.0
3,Kotoshiro,Simulation,Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,2018.0
4,Kotoshiro,Strategy,Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,2018.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74832,Laush Studio,Racing,Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,False,610660,Laush Dmitriy Sergeevich,2018.0
74833,Laush Studio,Simulation,Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,False,610660,Laush Dmitriy Sergeevich,2018.0
74834,SIXNAILS,Casual,EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,False,658870,"xropi,stev3ns",2017.0
74835,SIXNAILS,Indie,EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,False,658870,"xropi,stev3ns",2017.0


In [94]:
# Keep only the columns used by the endpoint queries and by the ML model.
# The columns to keep are selected explicitly (instead of dropping the unused
# ones) so the cell is idempotent: running it again no longer raises KeyError
# for columns that a previous run already removed, and a missing required
# column still fails loudly.
GAMES_COLUMNS = ['genres', 'app_name', 'price', 'id', 'developer', 'release_year']
df_games = df_games[GAMES_COLUMNS]

# Preview the reduced DataFrame
df_games.head()

Unnamed: 0,genres,app_name,price,id,developer,release_year
0,Action,Lost Summoner Kitty,4.99,761140,Kotoshiro,2018.0
1,Casual,Lost Summoner Kitty,4.99,761140,Kotoshiro,2018.0
2,Indie,Lost Summoner Kitty,4.99,761140,Kotoshiro,2018.0
3,Simulation,Lost Summoner Kitty,4.99,761140,Kotoshiro,2018.0
4,Strategy,Lost Summoner Kitty,4.99,761140,Kotoshiro,2018.0


In [95]:
# Persist the processed games table (without the index) for the later analyses
output_file_path = '../data/processed/processed_steam_games.csv'
df_games.to_csv(path_or_buf=output_file_path, index=False)

# Echo the destination path as the cell output
output_file_path

'../data/processed/processed_steam_games.csv'

Preprocesamiento del archivo user_reviews.json

In [6]:

def convert_to_json_like(text):
    """
    Rewrite ``text`` so that it looks more like JSON.

    Every single quote becomes a double quote (including apostrophes inside
    free text), and the words ``True`` / ``False`` are lower-cased when they
    are preceded by a space. Returns the rewritten string.
    """
    substitutions = (
        ("\'", "\""),
        (" True", " true"),
        (" False", " false"),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text

def escape_internal_quotes(json_like_text):
    """
    Backslash-escape selected double quotes in the JSON-like text.

    A double quote is escaped when it is not already preceded by a backslash
    and is followed by: a run of non-quote characters, a double quote, another
    run of non-quote characters, and finally a double quote plus a colon.
    Returns the text with those quotes replaced by a backslash and a quote.
    """
    quote_before_key = re.compile(r'(?<!\\)"(?=[^"]*"[^"]*":)')
    return quote_before_key.sub('\\"', json_like_text)

def extract_data_line_by_line_trimmed(file_content):
    """
    Pull the review fields out of the text, one line at a time.

    Each line is searched for the fields ``user_id``, ``user_url``,
    ``item_id``, ``posted`` and ``review``. Matched values have trailing
    backslashes stripped and are accumulated in a pending record; the record
    is emitted (and a new one started) whenever a ``review`` is found on the
    line. Fields matched on lines without a review stay pending and are
    carried over to the next emitted record.

    Returns a list of dicts, one per emitted record.
    """
    # (field name, pattern) pairs. 'review' must stay last: it closes the record
    # after the other fields found on the same line have been stored.
    field_patterns = (
        ('user_id', re.compile(r'"user_id":\s*"([^"]+?)"(?=\s*,|\s*})')),
        ('user_url', re.compile(r'"user_url":\s*"([^"]+?)"(?=\s*,|\s*})')),
        ('item_id', re.compile(r'"item_id":\s*"([^"]+?)"(?=\s*,|\s*})')),
        ('posted', re.compile(r'"posted":\s*"([^"]+?)"(?=\s*,|\s*})')),
        ('review', re.compile(r'"review":\s*"([^"]+?)"(?=\s*,|\s*})')),
    )

    records = []
    pending = {}

    for text_line in file_content.split('\n'):
        review_found = False
        for field, pattern in field_patterns:
            # NOTE(review): search() yields only the first occurrence on the line,
            # so a line holding several reviews would contribute just the first
            # one - confirm against the input format.
            hit = pattern.search(text_line)
            if hit is None:
                continue
            pending[field] = hit.group(1).rstrip('\\')
            if field == 'review':
                review_found = True

        if review_found:
            records.append(pending)
            pending = {}

    return records

# Read the raw reviews file in one go. The encoding is passed explicitly so the
# decoding of non-ASCII review text does not depend on the platform default.
file_path = '../data/raw/user_reviews.json'  # Replace with your file path
with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()

# Make the text JSON-like, then escape the quotes that would break the field patterns
json_like_text = convert_to_json_like(content)
escaped_text = escape_internal_quotes(json_like_text)

# Extract the review records and load them into a DataFrame
extracted_data = extract_data_line_by_line_trimmed(escaped_text)
df_reviews = pd.DataFrame(extracted_data)

# Display the DataFrame
df_reviews


Unnamed: 0,user_id,user_url,item_id,posted,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,"Posted November 5, 2011.",Great atmosphere. The gunplay can be a bit chu...
1,js41637,http://steamcommunity.com/id/js41637,251610,"Posted June 24, 2014.",Very fun little game to play when your bored o...
2,evcentric,http://steamcommunity.com/id/evcentric,248820,Posted February 3.,"Elegant integration of gameplay, story, world ..."
3,doctr,http://steamcommunity.com/id/doctr,250320,"Posted October 14, 2013.",This game... is so fun. The fight sequences ha...
4,maplemage,http://steamcommunity.com/id/maplemage,211420,"Posted April 15, 2014.",Git gud
...,...,...,...,...,...
22339,JustMielThings,http://steamcommunity.com/id/JustMielThings,570,Posted May 20.,Good one
22340,Ghoustik,http://steamcommunity.com/id/Ghoustik,730,Posted June 17.,Gra naprawdę fajna.Ale jest kilka rzeczy do kt...
22341,76561198310819422,http://steamcommunity.com/profiles/76561198310...,570,Posted June 23.,Well Done
22342,76561198312638244,http://steamcommunity.com/profiles/76561198312...,233270,Posted July 21.,this is a very fun and nice 80s themed shooter...


In [7]:
from textblob import TextBlob

def classify_sentiment(review_text, threshold=0.1):
    """
    Classify the sentiment of a review with TextBlob's polarity score.

    Parameters
    ----------
    review_text : Any
        The review. Non-string values are converted with ``str``.
    threshold : float, default 0.1
        Half-width of the neutral band: a polarity below ``-threshold`` is
        negative, above ``threshold`` is positive, anything else is neutral.

    Returns
    -------
    int
        0 for negative, 1 for neutral, 2 for positive. Missing values
        (``None``, NaN) and empty or whitespace-only text are neutral.
    """
    # Missing review: decided before any string conversion
    if review_text is None:
        return 1

    text = str(review_text)  # Convert to string to handle non-string inputs
    # str(float('nan')) is 'nan'; blank text has nothing to analyse
    if text == 'nan' or not text.strip():
        return 1

    polarity = TextBlob(text).sentiment.polarity

    if polarity < -threshold:
        return 0  # Negative sentiment
    if polarity > threshold:
        return 2  # Positive sentiment
    return 1  # Neutral sentiment

# Score every review and store the label (0/1/2) in a new 'sentiment' column
sentiment_labels = df_reviews['review'].apply(classify_sentiment)
df_reviews['sentiment'] = sentiment_labels

# Display the DataFrame with the new column
df_reviews


Unnamed: 0,user_id,user_url,item_id,posted,review,sentiment
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,"Posted November 5, 2011.",Great atmosphere. The gunplay can be a bit chu...,1
1,js41637,http://steamcommunity.com/id/js41637,251610,"Posted June 24, 2014.",Very fun little game to play when your bored o...,1
2,evcentric,http://steamcommunity.com/id/evcentric,248820,Posted February 3.,"Elegant integration of gameplay, story, world ...",2
3,doctr,http://steamcommunity.com/id/doctr,250320,"Posted October 14, 2013.",This game... is so fun. The fight sequences ha...,2
4,maplemage,http://steamcommunity.com/id/maplemage,211420,"Posted April 15, 2014.",Git gud,1
...,...,...,...,...,...,...
22339,JustMielThings,http://steamcommunity.com/id/JustMielThings,570,Posted May 20.,Good one,2
22340,Ghoustik,http://steamcommunity.com/id/Ghoustik,730,Posted June 17.,Gra naprawdę fajna.Ale jest kilka rzeczy do kt...,1
22341,76561198310819422,http://steamcommunity.com/profiles/76561198310...,570,Posted June 23.,Well Done,1
22342,76561198312638244,http://steamcommunity.com/profiles/76561198312...,233270,Posted July 21.,this is a very fun and nice 80s themed shooter...,2


In [8]:
# Keep only the columns used by the analysis and the endpoints.
# Selecting the columns to keep (instead of dropping the unused ones) makes the
# cell idempotent: running it again no longer raises KeyError for columns that
# a previous run already removed.
REVIEWS_COLUMNS = ['user_id', 'item_id', 'sentiment']
df_reviews = df_reviews[REVIEWS_COLUMNS]

# Preview the reduced DataFrame
df_reviews.head()

Unnamed: 0,user_id,item_id,sentiment
0,76561197970982479,1250,1
1,js41637,251610,1
2,evcentric,248820,2
3,doctr,250320,2
4,maplemage,211420,1


In [9]:
# Persist the processed reviews table (without the index) for the later analyses
output_file_path = '../data/processed/processed_users_reviews.csv'
df_reviews.to_csv(path_or_buf=output_file_path, index=False)

# Echo the destination path as the cell output
output_file_path

'../data/processed/processed_users_reviews.csv'

Preprocesamiento del archivo users_items.json

In [77]:
import ast

def load_and_process_file(file_path):
    """
    Load a file holding one Python literal (a dict) per line into a DataFrame.

    Each non-blank line is parsed with ``ast.literal_eval``, which only accepts
    literals and does not execute code. Blank lines (for example a trailing
    empty line) are skipped instead of aborting the load.

    Parameters
    ----------
    file_path : str
        Path of the UTF-8 encoded input file.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line.

    Raises
    ------
    ValueError, SyntaxError
        If a non-blank line is not a valid Python literal.
    """
    data_list = []
    with open(file_path, encoding='utf-8') as file:
        # Iterate the handle directly so the file is never held in memory twice
        for line in file:
            stripped = line.strip()
            if not stripped:
                continue
            data_list.append(ast.literal_eval(stripped))
    return pd.DataFrame(data_list)

def explode_items_column(df):
    """Return a copy of ``df`` with one row per element of its 'items' lists."""
    return df.explode('items')

# Parse the raw users/items file into a DataFrame (one row per user)
file_path = '../data/raw/users_items.json'  # Replace with your actual file path
df = load_and_process_file(file_path)

# One row per (user, item)
exploded_df = explode_items_column(df)

# Spread each item dictionary over its own columns and put them next to the
# user-level columns
item_fields = exploded_df['items'].apply(pd.Series)
user_fields = exploded_df.drop(columns=['items'])
df_users = pd.concat([user_fields, item_fields], axis=1)

# Display the expanded DataFrame
df_users

Unnamed: 0,user_id,items_count,steam_id,user_url,item_id,item_name,playtime_forever,playtime_2weeks,0
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,10,Counter-Strike,6.0,0.0,
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,20,Team Fortress Classic,0.0,0.0,
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,30,Day of Defeat,7.0,0.0,
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,40,Deathmatch Classic,0.0,0.0,
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,50,Half-Life: Opposing Force,0.0,0.0,
...,...,...,...,...,...,...,...,...,...
88308,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,373330,All Is Dust,0.0,0.0,
88308,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,388490,One Way To Die: Steam Edition,3.0,3.0,
88308,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,521570,You Have 10 Seconds 2,4.0,4.0,
88308,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,519140,Minds Eyes,3.0,3.0,


In [82]:
# Keep only the columns used by the analysis and the endpoints.
# Selecting the columns to keep (instead of dropping the unused ones) makes the
# cell idempotent: running it again no longer raises KeyError for columns that
# a previous run already removed.
USERS_COLUMNS = ['user_id', 'items_count', 'item_id', 'playtime_forever']
df_users = df_users[USERS_COLUMNS]

# Preview the reduced DataFrame
df_users.head()

Unnamed: 0,user_id,items_count,item_id,playtime_forever
0,76561197970982479,277,10,6.0
0,76561197970982479,277,20,0.0
0,76561197970982479,277,30,7.0
0,76561197970982479,277,40,0.0
0,76561197970982479,277,50,0.0


In [88]:
# Persist the processed users/items table (without the index) for the later analyses
output_file_path = '../data/processed/processed_users_items.csv'
df_users.to_csv(path_or_buf=output_file_path, index=False)

# Echo the destination path as the cell output
output_file_path

'../data/processed/processed_users_items.csv'