# MLOps Steam


Buenas y bienvenidos a este Notebook donde haremos el proceso de ETL a 3 datasets brindados por la plataforma de juegos Steam donde nosotros podremos practicar y brindar una solucion al problema que estan teniendo. Una vez que tratemos los datos nuestro objetivo sera hacer un analisis exploratorio de los datos y a raiz de esto sacar un modelo funcional de inteligencia artificial, que podra ser consumida desde una api por Render.


Comenzemos con la lectura de los datos y la limpieza de los mismos.

In [510]:
#instalamos todas las librerias necesarias
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import ast

In [511]:
import json

data = []
with open('data/output_steam_games.json', 'r') as f:
    for line in f:
        try:
            obj = json.loads(line)
            data.append(obj)
        except json.JSONDecodeError as e:
            print("Error en línea:", line)

# Convierte la lista de objetos en un DataFrame
steam = pd.DataFrame(data)

# Imprime el DataFrame
print(steam.shape)
steam.head()


(120445, 13)


Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,


In [512]:
steam = steam.dropna(thresh=3)
print(steam.shape)
steam.head(3)

(32135, 13)


Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
88310,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro
88311,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980,Secret Level SRL
88312,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,False,670290,Poolians.com


In [513]:
rows = []
with open('data/australian_users_items.json', 'r', encoding='UTF-8') as f:
    for line in f.readlines():
        rows.append(ast.literal_eval(line))

In [514]:
user_items = pd.DataFrame(rows)

In [515]:
user_items = user_items.dropna(thresh=3)
print(user_items.shape)
user_items.head(3)

(88310, 5)


Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."


In [516]:
rows = []
with open('data/australian_user_reviews.json', 'r', encoding='UTF-8') as f:
    for line in f.readlines():
        rows.append(ast.literal_eval(line))

In [517]:
user_reviews = pd.DataFrame(rows)

In [518]:
user_reviews = user_reviews.dropna(thresh=3)
print(user_reviews.shape)
user_reviews.head(3)


(25799, 3)


Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."


In [519]:
data_desanidada = []

for index, row in user_items.iterrows():
    user_id = row['user_id']
    items_count = row['items_count']
    steam_id = row['steam_id']
    user_url = row['user_url']
    items = row['items']
    
    for i in items:   
        new_row = {
        'user_id': user_id,
        'items_count': items_count,
        'steam_id' : steam_id,
        'user_url' : user_url,
        'item_id': i.get('item_id', ''),
        'item_name': i.get('item_name', ''),
        'playtime_forever': i.get('playtime_forever', ''),
        'playtime_2weeks': i.get('playtime_2weeks', '')
        }
        
        data_desanidada.append(new_row)

user_items_completo = pd.DataFrame(data_desanidada)

In [520]:
data_desanidada = []

for index, row in user_reviews.iterrows():
    user_id = row['user_id']
    user_url = row['user_url']
    reviews = row['reviews']
    
    for i in reviews:   
        new_row = {
        'user_id': user_id,
        'user_url': user_url,
        'reviews' : reviews,
        'funny': i.get('funny', ''),
        'posted': i.get('posted', ''),
        'last_edited': i.get('last_edited', ''),
        'item_id': i.get('item_id', ''),
        'helpful': i.get('helpful', ''),
        'recommend': i.get('recommend', bool),
        'review': i.get('review', '')
        }
        
        data_desanidada.append(new_row)

user_reviews_completo = pd.DataFrame(data_desanidada)

### En este momento ya poseemos los 3 dataframes necesarios para comenzar a trabajar a responder las preguntas solicitadas, vamos a ello una por una


In [521]:
steam_reducido = steam[['price', 'id', 'title']]
steam_reducido['price'] = steam_reducido['price'].apply(pd.to_numeric, errors='coerce')
steam_reducido.price = steam_reducido.price.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  steam_reducido['price'] = steam_reducido['price'].apply(pd.to_numeric, errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  steam_reducido.price = steam_reducido.price.fillna(0)



### ENDPOINT1


In [522]:
# Combinamos dfs por item
merge1 = user_items_completo.merge(steam_reducido, left_on='item_id', right_on='id', how='inner')

# combinamos el df de reviews con el anterior por usuario y item
merge1 = merge1.merge(user_reviews_completo, on=['user_id', 'item_id'], how='inner')

# lo agrupamos por user_id
df_agrupado1 = merge1.groupby('user_id')

# Calcular la cantidad de items que posee cada usuario
items_ep_1 = df_agrupado1['item_id'].count()

# Calcular el porcentaje positivo de recommend para cada usuario
porcentaje_recommend = (df_agrupado1['recommend'].sum() / df_agrupado1['user_id'].count()) * 100

# Calcular la suma de precios por usuario
total_precio = df_agrupado1['price'].sum()

# Crear una nueva tabla con la información
endpoint1 = pd.DataFrame({'items': items_ep_1, 'porcentaje_recomendados': porcentaje_recommend, 'precio_total': total_precio})

endpoint1.to_csv('data_endpoints/endpoint1.csv')

In [574]:
def userdata(user_id):
    if user_id in endpoint1.index:
        items = endpoint1.loc[user_id]['items']
        precio = endpoint1.loc[user_id]['precio_total']
        recommend = endpoint1.loc[user_id]['porcentaje_recomendados']
        return items, precio, recommend
    else:
        return None

In [575]:
userdata('--000--')

(1.0, 19.99, 100.0)

### ENDPOINT 2

In [525]:
def convertir_fecha(fecha_texto):
    # Utilizar expresiones regulares para extraer el mes, día y año
    match = re.search(r'(\w+) (\d+), (\d+)', fecha_texto)
    if match:
        mes = match.group(1)
        dia = match.group(2)
        año = match.group(3)
        
        # Mapear nombres de meses a números de meses
        meses = {'January': '01', 'February': '02', 'March': '03', 'April': '04', 'May': '05', 'June': '06',
                 'July': '07', 'August': '08', 'September': '09', 'October': '10', 'November': '11', 'December': '12'}
        
        # Crear una cadena en formato 'YYYY-MM-DD' y convertirla a objeto de fecha
        fecha_str = f'{año}-{meses[mes]}-{dia}'
        return pd.to_datetime(fecha_str)
    else:
        return None

# Aplicar la función de conversión a la columna 'fecha_texto' y crear una nueva columna 'fecha'
user_reviews_completo['posted'] = user_reviews_completo['posted'].apply(convertir_fecha)

In [526]:
endpoint2 = user_reviews_completo[['user_id', 'posted', 'recommend']]
endpoint2.to_csv('data_endpoints/endpoint2.csv')

In [527]:
def countreviews(fecha_inicio, fecha_final):
    filtro_fechas = (endpoint2['posted'] >= fecha_inicio) & (endpoint2['posted'] <= fecha_final)
    df_fechas_filtrado = user_reviews_completo[filtro_fechas]
    cantidad_usuarios = df_fechas_filtrado['user_id'].nunique()
    cantidad_trues = df_fechas_filtrado[df_fechas_filtrado['recommend'] == True].shape[0]
    porcentaje = (cantidad_trues / cantidad_usuarios) * 100
    return f('Cantidad de usuarios: {cantidad_usuarios}, Porcentaje de Trues: {porcentaje}')

In [528]:
countreviews('2011-11-05', '2013-09-08')

(2167, 149.93077988001846)

### ENDPOINT 3

In [529]:
steam['genres'].fillna('[]')

88310         [Action, Casual, Indie, Simulation, Strategy]
88311                  [Free to Play, Indie, RPG, Strategy]
88312     [Casual, Free to Play, Indie, Simulation, Sports]
88313                           [Action, Adventure, Casual]
88314                                                    []
                                ...                        
120440                [Casual, Indie, Simulation, Strategy]
120441                            [Casual, Indie, Strategy]
120442                          [Indie, Racing, Simulation]
120443                                      [Casual, Indie]
120444                                                   []
Name: genres, Length: 32135, dtype: object

In [530]:
generos_unicos = set()  # Usamos un conjunto para asegurarnos de que no haya duplicados

for index, row in steam.iterrows():
    genres = row['genres']
    if isinstance(genres, list):
        generos_unicos.update(genres)

generos_unicos = list(generos_unicos)

In [531]:
ids_por_genero = {genero: [] for genero in generos_unicos}

for index, row in steam.iterrows():
    genres = row['genres']
    if isinstance(genres, list):
        for genero in genres:
            ids_por_genero[genero].append(row['id'])

In [532]:
resultados = []

# Iterar sobre cada género en el diccionario
for genero, ids_juegos in ids_por_genero.items():
    # Filtrar el DataFrame de tiempo jugado para incluir solo los IDs de juegos del género actual
    df_genero = user_items_completo[user_items_completo['item_id'].isin(ids_juegos)]
    
    # Calcular el tiempo total jugado para el género actual
    tiempo_total = df_genero['playtime_forever'].sum()
    
    # Agregar el resultado a la lista
    resultados.append({'genres': genero, 'TiempoTotal': tiempo_total})

# Crear un DataFrame a partir de la lista de resultados
df_resultados = pd.DataFrame(resultados)

# Ordenar el DataFrame por tiempo total jugado en orden descendente
endpoint3 = df_resultados.sort_values(by='TiempoTotal', ascending=False)

endpoint3['Puesto'] = endpoint3['TiempoTotal'].rank(ascending=False, method='min').astype(int)

In [577]:
endpoint3.to_csv('data_endpoints/endpoint3.csv')
endpoint3

Unnamed: 0,genres,TiempoTotal,Puesto
19,Action,3113562606,1
12,Indie,1494622404,2
16,RPG,1041022718,3
17,Adventure,909995120,4
5,Simulation,867646306,5
15,Strategy,659363841,6
3,Free to Play,610752945,7
21,Massively Multiplayer,446594080,8
18,Casual,252232854,9
1,Early Access,158701268,10


In [534]:
def genre(genero):
    resultado = endpoint3[endpoint3['genres'] == genero]
    valor = resultado['Puesto'].values
    return valor[0]

In [535]:
genre('Simulation')

5

### ENDPOINT 4

-----------


In [536]:
user_items_completo.head(1)

Unnamed: 0,user_id,items_count,steam_id,user_url,item_id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,10,Counter-Strike,6,0


In [537]:
steam.rename(columns={'id':'item_id'}, inplace=True)

In [538]:
steam.head(1)

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,item_id,developer
88310,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro


In [539]:
steam_ep4 = steam[['item_id', 'genres']]

In [540]:
merged2 = user_items_completo[['user_id', 'item_id', 'user_url', 'playtime_forever']].merge(steam_ep4.explode('genres'), on='item_id')
groupep4 = merged2.groupby(['user_id', 'genres'])['playtime_forever'].sum().reset_index()
sorted_ep4 = groupep4.sort_values(by=['genres', 'playtime_forever'], ascending=[True, False])
endpoint4 = sorted_ep4.groupby('genres').head(5)



In [541]:
endpoint4.to_csv('data_endpoints/endpoint4.csv')
endpoint4[endpoint4['genres'] == 'Action']

Unnamed: 0,user_id,genres,playtime_forever
486214,Sp3ctre,Action,1699307
635657,shinomegami,Action,1580428
472869,REBAS_AS_F-T,Action,1456212
492828,Terminally-Chill,Action,1065742
413434,DownSyndromeKid,Action,1061193


In [542]:
def userforgenre(genre):
    return endpoint4[endpoint4['genres'] == genre]

userforgenre('Simulation')

Unnamed: 0,user_id,genres,playtime_forever
578865,jimmynoe,Simulation,1062130
534994,clawbot44,Simulation,798416
418438,Evilutional,Simulation,684723
472883,REBAS_AS_F-T,Simulation,676540
656575,tsunamitad,Simulation,661309


------------------------------------


### ENDPOINT 5

----------------------------------------------------


In [548]:
steam

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,item_id,developer
88310,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro
88311,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",0.00,False,643980,Secret Level SRL
88312,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",0.00,False,670290,Poolians.com
88313,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"[Action, Adventure, Casual]",http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,False,767400,彼岸领域
88314,,,Log Challenge,,http://store.steampowered.com/app/773570/Log_C...,NaT,"[Action, Indie, Casual, Sports]",http://steamcommunity.com/app/773570/reviews/?...,"[Single-player, Full controller support, HTC V...",2.99,False,773570,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
120440,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,http://store.steampowered.com/app/773640/Colon...,2018-01-04,"[Strategy, Indie, Casual, Simulation]",http://steamcommunity.com/app/773640/reviews/?...,"[Single-player, Steam Achievements]",1.99,False,773640,"Nikita ""Ghost_RUS"""
120441,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/733530/LOGis...,2018-01-04,"[Strategy, Indie, Casual]",http://steamcommunity.com/app/733530/reviews/?...,"[Single-player, Steam Achievements, Steam Clou...",4.99,False,733530,Sacada
120442,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,False,610660,Laush Dmitriy Sergeevich
120443,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,False,658870,"xropi,stev3ns"


In [543]:
steam['release_date'] = pd.to_datetime(steam['release_date'], format='%Y-%m-%d', errors='coerce')

In [544]:
steam['price'] = steam['price'].apply(pd.to_numeric, errors='coerce')
steam.price = steam.price.fillna(0)

In [553]:
endpoint5 = steam[['release_date', 'developer', 'price']]
endpoint5.to_csv('data_endpoints/endpoint5.csv')

In [545]:

def developer(desarrollador):
     # Filtrar el DataFrame para obtener solo los juegos del desarrollador especificado
    juegos_del_desarrollador = steam[steam['developer'] == desarrollador]
    
    # Inicializar un diccionario para almacenar los porcentajes por año
    porcentajes_por_anio = {}
    
    juegos_del_desarrollador['release_date'] = juegos_del_desarrollador['release_date'].dt.year

    # Obtener la lista de años únicos
    años_unicos = juegos_del_desarrollador['release_date'].unique()
    
    # Calcular el porcentaje de juegos gratis para cada año
    for año in años_unicos:
        juegos_del_año = juegos_del_desarrollador[juegos_del_desarrollador['release_date'] == año]
        juegos_gratis_del_año = juegos_del_año[juegos_del_año['price'] == 0]
        
        porcentaje_juegos_gratis = (len(juegos_gratis_del_año) / len(juegos_del_año)) * 100
        
        porcentajes_por_anio[año] = porcentaje_juegos_gratis
    
    return porcentajes_por_anio




In [550]:
developer('Nikita "Ghost_RUS"')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  juegos_del_desarrollador['release_date'] = juegos_del_desarrollador['release_date'].dt.year


{2017: 0.0, 2018: 0.0}

### ENDPOINT 6

In [546]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')  # Descargar el léxico para SentimentIntensityAnalyzer

def analizar_sentimiento_por_año(año):
    sia = SentimentIntensityAnalyzer()

    # Inicializar contadores
    positivas = 0
    neutras = 0
    negativas = 0

    for index, row in user_reviews_completo.iterrows():
        if row['posted'].year == año:
            review = row['review']

            # Realizar análisis de sentimiento
            sentiment_score = sia.polarity_scores(review)['compound']

            # Clasificar la reseña en positiva, neutra o negativa
            if sentiment_score >= 0.05:
                positivas += 1
            elif sentiment_score <= -0.05:
                negativas += 1
            else:
                neutras += 1

    return positivas, neutras, negativas

año = 2014

positivas, neutras, negativas = analizar_sentimiento_por_año(año)
print(f'Reseñas positivas en {año}: {positivas}')
print(f'Reseñas neutras en {año}: {neutras}')
print(f'Reseñas negativas en {año}: {negativas}')



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\niko\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Reseñas positivas en 2014: 14252
Reseñas neutras en 2014: 4538
Reseñas negativas en 2014: 3321
