## Proyecto Migración API

## Importación `Libererías`

In [1]:
import datetime

import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import joblib

import nltk
nltk.download('vader_lexicon')
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.sentiment import SentimentIntensityAnalyzer
from pydantic import BaseModel
from typing import List, Union, Tuple , Optional
from fastapi import FastAPI, HTTPException, Depends, Query
# Assuming you have a pre-trained sentiment analysis model
sentiment_analyzer_model = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ozi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Testing `X_train`, post `ETL`

In [2]:
X_train = pd.read_csv("../datasets/processed/X_train.csv")
y_train = pd.read_csv("../datasets/processed/y_train.csv")

In [3]:
concat_df = pd.concat([X_train, y_train], axis=1)

In [6]:
concat_df.shape

(28605, 18)

In [7]:
sample_df = concat_df.sample(n=10000, random_state=42)

In [8]:
sample_df.to_csv('../server/data_train.csv')

In [41]:
concat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38140 entries, 0 to 38139
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   genres_str            38140 non-null  object 
 1   app_name              38140 non-null  object 
 2   release_date          38140 non-null  object 
 3   price                 38140 non-null  float64
 4   game_id               38140 non-null  int64  
 5   developer             38140 non-null  object 
 6   year                  38140 non-null  int64  
 7   user_id               38140 non-null  object 
 8   user_reviews_item_id  38140 non-null  int64  
 9   review                38140 non-null  object 
 10  posted_date           38140 non-null  object 
 11  items_count           38140 non-null  int64  
 12  steam_id              38140 non-null  int64  
 13  users_items_item_id   38140 non-null  int64  
 14  item_name             38140 non-null  object 
 15  playtime_forever   

In [42]:
concat_df.head(5)

Unnamed: 0,genres_str,app_name,release_date,price,game_id,developer,year,user_id,user_reviews_item_id,review,posted_date,items_count,steam_id,users_items_item_id,item_name,playtime_forever,playtime_2weeks,recommend
0,rpg,ORION: Prelude,2013-04-16,0.99,104900,"Trek Industries, Inc",2013,SPACEgamer,104900,Bascially left 4 dead 2 with dinos. The funnie...,2014-06-11,868,76561197971763633,104900,ORION: Prelude,3,0,1
1,strategy,Knights of Honor,2005-05-10,9.99,25830,Black Sea Studios Ltd,2005,NUZXD,25830,"Played as medieval Denmark, Got destroyed by N...",2014-11-28,336,76561198061406190,25830,Knights of Honor,0,0,1
2,massivelymultiplayer,PlanetSide 2,2012-11-20,-1.0,218230,Daybreak Game Company,2012,greenie1995,218230,Game.Of. The Year,2012-11-22,21,76561198059259810,218230,PlanetSide 2,0,0,1
3,action,Battlefield: Bad Company™ 2,2010-03-02,19.99,24960,DICE,2010,silentAggressor,24960,"""I really like the game, but I couldn t find t...",2011-12-28,66,76561198059260834,24960,Battlefield: Bad Company 2,252,0,1
4,rpg,Terraria,2011-05-16,9.99,105600,Re-Logic,2011,FeatInk,105600,Amazing game.Perfect for anyone bored of minec...,2014-02-20,33,76561198059257958,105600,Terraria,3950,0,1


## Testing API ML `SentimentAnalysisProcessor`

In [43]:
class SentimentAnalysisProcessor:
    def __init__(self, threshold_low=-0.5, threshold_high=0.5):
        self.threshold_low = threshold_low
        self.threshold_high = threshold_high
        self.analyzer = SentimentIntensityAnalyzer()

    def process_data(self, input_df):
        # Crear una copia del DataFrame de entrada para evitar modificaciones no deseadas
        allowed_genres = ['strategy', 'indie', 'rpg', 'action', 'simulation', 'adventure']
        df_a = input_df[input_df['genres_str'].isin(allowed_genres)]
        df = df_a.copy()

        # Aplicar el análisis de sentimientos a la columna 'review'
        df['scores_review'] = df['review'].apply(lambda x: self.analyzer.polarity_scores(x)['compound'])

        # Crear la nueva columna 'score_new' según las bandas definidas
        df['score_new'] = df['scores_review'].apply(lambda x: 0 if x < self.threshold_low else (2 if x > self.threshold_high else 1))

        # Clasificar las reseñas numéricamente
        df['sentiment_numeric'] = df['score_new']

        # Clasificar las reseñas como "Positive", "Negative" o "Neutral"
        df['sentiment_more_less'] = df['score_new'].apply(lambda x: 'Positive' if x == 2 else ('Negative' if x == 0 else 'Neutral'))

        # Eliminar las columnas 'review' y 'scores_review' si es necesario
        columns_to_drop = ['review', 'scores_review', 'release_date', 'posted_date', 'item_name', 'sentiment_numeric', 'game_id', 'steam_id']
        df = df.drop(columns=columns_to_drop)
        df['users_items_item_id'] = df['users_items_item_id'].astype('object')
        df['user_id'] = df['user_id'].astype('object')
        return df

# Crear una instancia de la clase
sentiment_processor = SentimentAnalysisProcessor()
# Pasar el DataFrame a la clase para el procesamiento
output_df = sentiment_processor.process_data(concat_df)


In [44]:
output_df.head(5)

Unnamed: 0,genres_str,app_name,price,developer,year,user_id,user_reviews_item_id,items_count,users_items_item_id,playtime_forever,playtime_2weeks,recommend,score_new,sentiment_more_less
0,rpg,ORION: Prelude,0.99,"Trek Industries, Inc",2013,SPACEgamer,104900,868,104900,3,0,1,0,Negative
1,strategy,Knights of Honor,9.99,Black Sea Studios Ltd,2005,NUZXD,25830,336,25830,0,0,1,1,Neutral
3,action,Battlefield: Bad Company™ 2,19.99,DICE,2010,silentAggressor,24960,66,24960,252,0,1,0,Negative
4,rpg,Terraria,9.99,Re-Logic,2011,FeatInk,105600,33,105600,3950,0,1,2,Positive
5,strategy,Medieval II: Total War™,19.99,"The Creative Assembly,Feral Interactive (Mac),...",2006,5400540054005400,4700,100,4700,0,0,1,2,Positive


In [45]:
output_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31080 entries, 0 to 38139
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   genres_str            31080 non-null  object 
 1   app_name              31080 non-null  object 
 2   price                 31080 non-null  float64
 3   developer             31080 non-null  object 
 4   year                  31080 non-null  int64  
 5   user_id               31080 non-null  object 
 6   user_reviews_item_id  31080 non-null  int64  
 7   items_count           31080 non-null  int64  
 8   users_items_item_id   31080 non-null  object 
 9   playtime_forever      31080 non-null  int64  
 10  playtime_2weeks       31080 non-null  int64  
 11  recommend             31080 non-null  int64  
 12  score_new             31080 non-null  int64  
 13  sentiment_more_less   31080 non-null  object 
dtypes: float64(1), int64(7), object(6)
memory usage: 3.6+ MB


## Testing API `Métricas`

In [46]:
from pydantic import BaseModel

# Pydantic Información de Juego
class GameInfo(BaseModel):
    genres_str: str
    app_name: str
    price: float
    game_id: int
    developer: str
    year: int
    recommend: int
    items_count: int
    user_id: int
    playtime_forever: int
    playtime_2weeks: int
    score_new: int
    sentiment_more_less: str

# Pydantic para parámetros y resultados desarrollador
class DeveloperParams(BaseModel):
    dev: str

class DeveloperResult(BaseModel):
    Año: int
    Cantidad_de_Items: int
    Contenido_Free: str


# Definir función para obtener información del desarrollador
def get_developer_info(params: DeveloperParams = Depends()):
    try:
        # Filtrar el DataFrame para obtener solo las filas del desarrollador especificado
        dev_df = output_df[output_df['developer'] == params.dev]

        if not dev_df.empty:
            # Agrupar por año y sumar la cantidad de items
            grouped = dev_df.groupby('year')['items_count'].sum().reset_index()

            result = []

            for row in grouped.itertuples(index=False):
                # Filtrar solo items de este desarrollador y año
                year_dev_df = dev_df[(dev_df['developer'] == params.dev) & (dev_df['year'] == row.year)]

                # Contar items gratuitos (precio <= 0)
                free_items = year_dev_df[year_dev_df['price'] <= 0]['items_count'].sum()

                # Calcular % de items gratuitos
                pct_free = (free_items / row.items_count) * 100 if row.items_count > 0 else 0

                # Redondear % a 2 decimales
                pct_free = round(pct_free, 2)

                result.append(DeveloperResult(
                    Año=int(row.year),
                    Cantidad_de_Items=int(row.items_count),
                    Contenido_Free=f"{pct_free}%"
                ))

            return result

        else:
            # Devolver una respuesta de error si no se encuentra información para el desarrollador
            raise HTTPException(
                status_code=404,
                detail=f"No se encontró información del desarrollador '{params.dev}'"
            )

    except Exception as e:
        # Manejar cualquier excepción y devolver una respuesta de error
        raise HTTPException(
            status_code=500,
            detail=f"Error interno: {str(e)}"
        )


def get_developer_info_endpoint(
    dev: str = "Valve",  # Puedes probar con diferentes valores cambiando este parámetro
    result: List[DeveloperResult] = Depends(get_developer_info)
):

    return result


# Pydantic Estadísticas de usuario
class UserStats(BaseModel):
    Usuario: str
    Dinero_gastado: str
    Porcentaje_recomendacion: str
    Cantidad_items: int

# Pydantic Parámetros de SteamId
class SteamIdParams(BaseModel):
    user_id: str

def convert_to_python_types(record):
    record_dict = record.to_dict(orient='records')[0]
    
    for key, value in record_dict.items():
        if isinstance(value, (np.int64, np.int32, np.float64, np.float32)):
            record_dict[key] = pd.to_numeric(value)  # Convertir a tipo nativo de Python
        elif isinstance(value, pd.Timestamp):
            record_dict[key] = value.strftime('%Y-%m-%dT%H:%M:%S.%fZ')  # Formatear fechas
        else:
            record_dict[key] = value
    
    return record_dict


def get_user_stats(params: SteamIdParams = Depends()):
    try:
        # Convertir 'params.user_id' a cadena para asegurar la comparación
        user_df = output_df[output_df['user_id'].astype(str) == params.user_id]

        # Verificar si hay datos para el usuario
        if user_df.empty:
            raise HTTPException(status_code=404, detail="No se encontraron datos para este usuario.")

        # Calcular el dinero gastado (suma de price * items_count)
        money_spent = (user_df['price'] * user_df['items_count']).sum()

        # Sumar las recomendaciones donde recommend es igual a 1
        total_recommendations = user_df[user_df['recommend'] == 1]['items_count'].sum()

        # Sumar la cantidad total de items
        total_items = user_df['items_count'].sum()

        # Calcular el porcentaje de recomendación basado en las sumas
        pct_recommended = (total_recommendations / total_items) * 100 if total_items > 0 else 0

        # Formatear el porcentaje de recomendación
        pct_recommended_str = '{:.2f}%'.format(pct_recommended)

        # Crear y devolver el objeto Pydantic con la información
        return UserStats(
            Usuario=params.user_id,
            Dinero_gastado='${:.2f} USD'.format(money_spent),
            Porcentaje_recomendacion=pct_recommended_str,
            Cantidad_items=total_items,
        )

    except Exception as e:
        # Manejar cualquier excepción y devolver una respuesta de error
        raise HTTPException(status_code=500, detail=f"Error interno: {str(e)}")

# 76561197971591953	Counter-Strike

def get_user_stats_endpoint(stats: UserStats = Depends(get_user_stats)):
    return stats


# Definir modelo Pydantic para parámetros de género
class GenreParams(BaseModel):
    genre: str

# Definir función para el nuevo endpoint
def get_user_playtime_by_genre(params: GenreParams = Depends()):
    try:
        # Filtrar el DataFrame para el género específico
        genre_df = output_df[output_df['genres_str'] == params.genre].copy()

        # Verificar si hay datos para el género
        if genre_df.empty:
            raise HTTPException(status_code=404, detail=f"No se encontraron datos para el género {params.genre}.")

        # Encontrar el usuario con más horas jugadas
        max_playtime_user = genre_df.loc[genre_df['playtime_forever'].idxmax()]['user_id']

        # Convertir las horas de minutos a horas y redondear a números enteros
        genre_df['playtime_hours'] = genre_df['playtime_forever'] // 60

        # Calcular la acumulación de horas jugadas por año
        playtime_by_year = genre_df.groupby('year')['playtime_hours'].sum().reset_index()
        playtime_list = [{"Año": int(year), "Horas": int(hours)} for year, hours in zip(playtime_by_year['year'], playtime_by_year['playtime_hours'])]

        # Crear y devolver el diccionario con la información
        return {
            f"Usuario con más horas jugadas para {params.genre}": max_playtime_user,
            "Horas jugadas": playtime_list,
        }

    except Exception as e:
        # Manejar cualquier excepción y devolver una respuesta de error
        raise HTTPException(status_code=500, detail=f"Error interno: {str(e)}")

# Agregar el nuevo endpoint a la aplicación
def user_playtime_by_genre_endpoint(result: dict = Depends(get_user_playtime_by_genre)):
    return result


# Definir modelo Pydantic para parámetros de año
class YearParams(BaseModel):
    year: int

# Definir función para el nuevo endpoint
def get_top_developers_by_year(params: YearParams = Depends()):
    try:
        # Filtrar el DataFrame para el año específico
        year_df = output_df[output_df['year'] == params.year]

        # Verificar si hay datos para el año
        if year_df.empty:
            raise HTTPException(status_code=404, detail=f"No se encontraron datos para el año {params.year}.")

        # Filtrar por juegos recomendados y con comentarios positivos
        recommended_df = year_df[(year_df['recommend'] == 1) & (year_df['score_new'] > 0)]

        # Agrupar por desarrollador y contar la cantidad de juegos recomendados
        developer_counts = recommended_df.groupby('developer')['recommend'].sum().reset_index()

        # Ordenar en orden descendente y obtener el top 3
        top_developers = developer_counts.sort_values(by='recommend', ascending=False).head(3)

        # Crear la lista de resultados en el formato deseado
        result_list = [{"Puesto {}: {}".format(i + 1, row['developer']): row['recommend']} for i, row in top_developers.iterrows()]

        # Devolver la lista de resultados
        return result_list

    except Exception as e:
        # Manejar cualquier excepción y devolver una respuesta de error
        raise HTTPException(status_code=500, detail=f"Error interno: {str(e)}")

# Agregar el nuevo endpoint a la aplicación

def top_developers_by_year_endpoint(result: List[dict] = Depends(get_top_developers_by_year)):
    return result


# Definir modelo Pydantic para parámetros de desarrollador
class DeveloperParams(BaseModel):
    dev: str

# Definir función para el nuevo endpoint
def get_developer_reviews_analysis(params: DeveloperParams = Depends()):
    try:
        # Filtrar el DataFrame para el desarrollador específico
        developer_df = output_df[output_df['developer'] == params.dev].copy()

        # Verificar si hay datos para el desarrollador
        if developer_df.empty:
            raise HTTPException(status_code=404, detail=f"No se encontraron datos para el desarrollador {params.dev}.")

        # Convertir tipos de datos de NumPy a tipos nativos de Python
        developer_df['sentiment_more_less'] = developer_df['sentiment_more_less'].astype(str)

        # Contar la cantidad de reseñas positivas, negativas y neutrales
        pos = int((developer_df['sentiment_more_less'] == 'Positive').sum())
        neg = int((developer_df['sentiment_more_less'] == 'Negative').sum())
        neu = int((developer_df['sentiment_more_less'] == 'Neutral').sum())

        # Crear el diccionario de resultados en el formato deseado
        results = {params.dev: [{'Positive': pos}, {'Negative': neg}, {'Neutral': neu}]}

        # Devolver el diccionario de resultados
        return results

    except Exception as e:
        # Manejar cualquier excepción y devolver una respuesta de error
        raise HTTPException(status_code=500, detail=f"Error interno: {str(e)}")
    
# Agregar el nuevo endpoint a la aplicación
def developer_reviews_analysis_endpoint(result: Union[dict, List[dict]] = Depends(get_developer_reviews_analysis)):
    return result

    

# TESTEO
# Prueba de get_developer_info
dev_params = DeveloperParams(dev="Valve")
print(get_developer_info(dev_params))
 
# Prueba de get_user_stats
steam_params = SteamIdParams(user_id="SPACEgamer")
print(get_user_stats(params=steam_params))

# Prueba de get_user_playtime_by_genre
genre_params = GenreParams(genre="simulation")
print(get_user_playtime_by_genre(params=genre_params))

# Prueba de get_top_developers_by_year
year_params = YearParams(year=2014)
print(get_top_developers_by_year(params=year_params))

# Prueba de get_developer_reviews_analysis
dev_params = DeveloperParams(dev="Valve")
print(get_developer_reviews_analysis(dev_params))


[DeveloperResult(Año=1998, Cantidad_de_Items=8379, Contenido_Free='0.0%'), DeveloperResult(Año=1999, Cantidad_de_Items=1881, Contenido_Free='0.0%'), DeveloperResult(Año=2000, Cantidad_de_Items=622, Contenido_Free='0.0%'), DeveloperResult(Año=2001, Cantidad_de_Items=9, Contenido_Free='0.0%'), DeveloperResult(Año=2003, Cantidad_de_Items=33, Contenido_Free='0.0%'), DeveloperResult(Año=2004, Cantidad_de_Items=16593, Contenido_Free='0.0%'), DeveloperResult(Año=2005, Cantidad_de_Items=252, Contenido_Free='100.0%'), DeveloperResult(Año=2006, Cantidad_de_Items=19868, Contenido_Free='0.0%'), DeveloperResult(Año=2007, Cantidad_de_Items=26957, Contenido_Free='0.0%'), DeveloperResult(Año=2008, Cantidad_de_Items=748, Contenido_Free='0.0%'), DeveloperResult(Año=2009, Cantidad_de_Items=60700, Contenido_Free='0.0%'), DeveloperResult(Año=2010, Cantidad_de_Items=588, Contenido_Free='0.0%'), DeveloperResult(Año=2011, Cantidad_de_Items=5943, Contenido_Free='0.0%'), DeveloperResult(Año=2012, Cantidad_de_It

## Testing API ML `cosine_similarity`

### LabelEncoder y HotEncoder a Varibles por Análisis

In [47]:
# Definir la clase MultiColumnLabelEncoder
class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns
        self.label_encoders = {}

    def fit(self, X, y=None):
        for col in self.columns:
            le = LabelEncoder()
            le.fit(X[col])
            self.label_encoders[col] = le
        return self

    def transform(self, X):
        for col in self.columns:
            le = self.label_encoders[col]
            X.loc[:, col] = le.transform(X[col])
        return X

class CustomPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.numeric_columns = []
        self.ordinal_columns = []
        self.categorical_columns = []

    def fit(self, X, y=None):
        # Obtener las columnas numéricas, ordinales y categóricas
        self.numeric_columns = X.select_dtypes(include=['int64', 'float64', 'int32']).columns
        self.ordinal_columns = ['recommend']  # Puedes añadir aquí tus columnas ordinales
        self.categorical_columns = X.select_dtypes(include=['object']).columns
        return self

    def transform(self, X):
        # Crear una nueva variable excluyendo las columnas ordinales
        categorical_col_excluded_ordinal = [col for col in self.categorical_columns if col not in self.ordinal_columns]
        numeric_col_excluded_ordinal = [col for col in self.numeric_columns if col not in self.ordinal_columns]

        # Convertir las columnas categóricas a tipo str
        X.loc[:, categorical_col_excluded_ordinal] = X[categorical_col_excluded_ordinal].astype(str)

        # Definir las transformaciones para las columnas numéricas
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ])

        # Definir las transformaciones para las columnas ordinales
        ordinal_transformer = MultiColumnLabelEncoder(columns=self.ordinal_columns)

        # Aplicar las transformaciones
        transformed_data = numeric_transformer.fit_transform(X[numeric_col_excluded_ordinal])
        transformed_data = np.concatenate([transformed_data, ordinal_transformer.fit_transform(X[self.ordinal_columns])], axis=1)

        # Obtener los nombres de las columnas después de la transformación
        numeric_feature_names = numeric_transformer.named_steps['scaler'].get_feature_names_out(input_features=numeric_col_excluded_ordinal)
        ordinal_feature_names = self.ordinal_columns
        column_names = np.concatenate([numeric_feature_names, ordinal_feature_names])

        return transformed_data, column_names



def process_dataframe(input_df: pd.DataFrame, columns_to_drop: list) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
    # Create an instance of the class
    label_processor = MultiColumnLabelEncoder() 
    custom_processor = CustomPreprocessor()

    # Drop specified columns
    processed_df = input_df.drop(columns=columns_to_drop)

    # Apply the pipeline to the DataFrame
    transformed_data, column_names = custom_processor.fit_transform(processed_df)

    return processed_df, pd.DataFrame(transformed_data, columns=column_names)

# Example usage
columns_to_drop = ['app_name', 'developer', 'user_id','user_reviews_item_id', 'users_items_item_id','sentiment_more_less']
processed_df, transformed_df = process_dataframe(output_df, columns_to_drop)


In [48]:
transformed_df.columns

Index(['price', 'year', 'items_count', 'playtime_forever', 'playtime_2weeks',
       'score_new', 'recommend'],
      dtype='object')

In [49]:
output_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31080 entries, 0 to 38139
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   genres_str            31080 non-null  object 
 1   app_name              31080 non-null  object 
 2   price                 31080 non-null  float64
 3   developer             31080 non-null  object 
 4   year                  31080 non-null  int64  
 5   user_id               31080 non-null  object 
 6   user_reviews_item_id  31080 non-null  int64  
 7   items_count           31080 non-null  int64  
 8   users_items_item_id   31080 non-null  object 
 9   playtime_forever      31080 non-null  int64  
 10  playtime_2weeks       31080 non-null  int64  
 11  recommend             31080 non-null  int64  
 12  score_new             31080 non-null  int64  
 13  sentiment_more_less   31080 non-null  object 
dtypes: float64(1), int64(7), object(6)
memory usage: 3.6+ MB


### Testing ML Ánalisis de Variables Escaladas

In [81]:
def calculate_similarity_matrix(df):
    # Seleccionar solo las columnas numéricas para el cálculo de similitud
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    df_numeric = df[numeric_columns]

    # Calcular la matriz de similitud coseno
    similarity_matrix = cosine_similarity(df_numeric)

    return similarity_matrix

# Ejemplo de uso con tu DataFrame
output_df_ml = pd.concat([output_df.loc[:, ['users_items_item_id', 'app_name']], transformed_df], axis=1)
modelo_item_id = output_df_ml.groupby(["users_items_item_id", "app_name"]).sum().reset_index()
# Llamar a la función con el DataFrame con índices personalizados
similarity_matrix = calculate_similarity_matrix(modelo_item_id)

In [82]:
# Establecer 'users_items_item_id' como el índice

modelo_item_id.head(5) #user_reviews_item_id

Unnamed: 0,users_items_item_id,app_name,price,year,items_count,playtime_forever,playtime_2weeks,score_new,recommend
0,10,Counter-Strike,12.092383,5.066938,-8.070884,-5.385209,-4.531121,9.7695,42.0
1,20,Team Fortress Classic,0.620937,1.558531,2.381138,-3.067845,-0.79505,2.821889,7.0
2,30,Day of Defeat,-1.437599,-0.307789,2.997905,-0.795541,-0.142843,0.316381,2.0
3,40,Deathmatch Classic,-0.357613,-1.548739,-0.446814,-0.378145,-0.099381,-0.619992,1.0
4,50,Half-Life: Opposing Force,-0.087784,-1.349312,-1.019752,2.232226,-0.298144,1.252754,3.0


In [83]:
modelo_item_id.shape

(2196, 9)

In [84]:
num_unique_indices = modelo_item_id.users_items_item_id.unique()
num_unique_indices

array([    10,     20,     30, ..., 521570, 521990, 527340], dtype=int64)

### Testing Función para API

In [87]:
class SimilarGamesRequest(BaseModel):
    reference_item_id: int
    num_similar_games: int

class SimilarGamesResponse(BaseModel):
    similar_games: List[str]

def get_similar_games(request: SimilarGamesRequest, similarity_matrix: np.ndarray, output_df: pd.DataFrame) -> SimilarGamesResponse:
    # Buscar el índice correspondiente al users_items_item_id en el DataFrame
    reference_index = output_df[output_df['users_items_item_id'] == request.reference_item_id].index[0]
    
    # Obtener las puntuaciones de similitud para el juego de referencia
    similarity_scores = similarity_matrix[reference_index]

    # Obtener los índices ordenados por similitud
    sorted_indices = similarity_scores.argsort()[::-1]

    # Obtener los índices de juegos similares (excluyendo el juego de referencia)
    similar_games_indices = sorted_indices[1:request.num_similar_games + 1]

    # Obtener los nombres de juegos similares
    similar_games = output_df['app_name'].iloc[similar_games_indices]

    return SimilarGamesResponse(similar_games=similar_games.to_list())

# Crear una solicitud de juegos similares
reference_index = SimilarGamesRequest(reference_item_id=521430, num_similar_games=10)

# Obtener juegos similares
similar_games_result = get_similar_games(reference_index, similarity_matrix, modelo_item_id)
print(similar_games_result)


similar_games=['Project Highrise', 'Age of Empires II HD: The African Kingdoms', 'Wurm Unlimited', 'Scott in Space', 'Picross Touch', 'Galactic Storm', 'Fortified', 'A Study in Steampunk: Choice by Gaslight', 'Domestic Dog', 'Cuties']


In [88]:
def calculate_similarity_matrix(df):
    # Seleccionar solo las columnas numéricas para el cálculo de similitud
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    df_numeric = df[numeric_columns]

    # Calcular la matriz de similitud coseno
    similarity_matrix = cosine_similarity(df_numeric)

    return similarity_matrix

# Ejemplo de uso con tu DataFrame
output_df_ml_2 = pd.concat([output_df.loc[:, ['user_id', 'app_name']], transformed_df], axis=1)
modelo_id_steam = output_df_ml_2.groupby(["user_id", "app_name"]).sum().reset_index()
similarity_matrix_steam = calculate_similarity_matrix(modelo_id_steam)

from typing import Union

class SimilarGamesRequest(BaseModel):
    reference_index: Union[str, int]
    num_similar_games: int

class SimilarGamesResponse(BaseModel):
    similar_games: List[str]

# Assuming that 'user_id' is the column in modelo_id_steam corresponding to the Steam IDs
reference_index = SimilarGamesRequest(reference_index="SPACEgamer", num_similar_games=10)

def get_similar_games(request: SimilarGamesRequest, similarity_matrix: np.ndarray, modelo_id_steam: pd.DataFrame) -> SimilarGamesResponse:
    # Convert the reference_index to string for case consistency
    str_reference_index = str(request.reference_index)

    # Find the index corresponding to the given Steam ID
    matching_rows = modelo_id_steam[
        modelo_id_steam['user_id'].astype(str).str.lower() == str_reference_index.lower()
    ]

    if matching_rows.empty:
        # Handle the case where no rows match the specified Steam ID
        return SimilarGamesResponse(similar_games=[])

    reference_index = matching_rows.index[0]

    # Get similarity scores for the reference game
    similarity_scores = similarity_matrix[reference_index]
    
    # Get indices ordered by similarity
    sorted_indices = similarity_scores.argsort()[::-1]
    
    # Get indices of similar games (excluding the reference game)
    similar_games_indices = sorted_indices[1:request.num_similar_games + 1]
    
    # Get names of similar games
    similar_games = modelo_id_steam['app_name'].iloc[similar_games_indices]
    
    return SimilarGamesResponse(similar_games=similar_games.to_list())

# Use the request with the Steam ID
similar_games_result = get_similar_games(reference_index, similarity_matrix_steam, modelo_id_steam)
print(similar_games_result)



similar_games=['Call of Duty: World at War', 'ORION: Prelude', 'Cargo Commander', 'Among the Sleep - Enhanced Edition', 'Counter-Strike: Global Offensive', 'Counter-Strike: Global Offensive', 'PAYDAY™ The Heist', 'Counter-Strike: Global Offensive', 'Hammerfight', 'Cry of Fear']
