In [12]:
%%capture
# Install third-party packages into the runtime; the %%capture cell magic above
# (which must stay on the first line of the cell) suppresses pip's console output.
# NOTE(review): only category_encoders is imported by the visible cells; lazypredict
# and plotly_express are presumably used elsewhere in the notebook — confirm.
!pip install lazypredict
!pip install plotly_express
!pip install category_encoders

In [13]:
#%%writefile C:/Users/gcruz_li35hm9/Desktop/Bootcamp UDD Ciencia de Datos/Proyecto 7/Categorizador/modelo_predictivo.py

from pandas import DataFrame, read_csv, to_numeric, read_pickle
from numpy import concatenate, nan
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pickle
from google.colab import files 



class ModeloPredictivo:
    """Cleaning + training pipeline that fits a linear regression on ``Rating``.

    The pipeline expects a DataFrame containing at least the columns
    ``Rating``, ``Reviews``, ``Size``, ``Installs``, ``Type``, ``Current Ver``
    and ``Android Ver`` (these are the columns the methods below touch).
    Call :meth:`ejecutar` to run every stage in order, or call the stages
    individually in the same order.

    Attributes:
        df: The raw input frame passed to the constructor.
        df_google_sin_duplicados: Working copy produced by
            :meth:`eliminar_duplicados`; every later stage edits this frame.
        df_x_train_transformed / df_x_test_transformed: Scaled/encoded feature
            matrices produced by :meth:`preparar_datos`.
        y_train / y_test: Single-column ``Rating`` frames matching the splits.
        model: The fitted ``LinearRegression`` (after :meth:`entrenar_modelo`).
        numerical_scaler: ``StandardScaler`` fitted on the numeric columns.
        object_scaler: ``TargetEncoder`` fitted on the object columns.
    """

    def __init__(self, df=None):
        self.df = df
        self.df_google_sin_duplicados = None
        self.df_x_train_transformed = None
        self.df_x_test_transformed = None
        self.y_train = None
        self.y_test = None
        self.model = None
        self.numerical_scaler = None
        self.object_scaler = None

    def eliminar_duplicados(self, filas_invalidas=(10472,)):
        """Drop known-bad rows and exact duplicate rows.

        Args:
            filas_invalidas: Index labels to discard before de-duplicating.
                Defaults to row 10472 (presumably a malformed record in the
                source CSV — confirm against the data).

        The caller's DataFrame is not modified: ``drop`` returns a new frame,
        and labels that are already absent are ignored, so the method can be
        re-run safely.
        """
        self.df = self.df.drop(index=list(filas_invalidas), errors='ignore')
        self.df_google_sin_duplicados = self.df.drop_duplicates().reset_index(drop=True)

    def convertir_reviews(self):
        """Convert ``Reviews`` to a numeric column, removing thousands separators.

        Values are cast to ``str`` first so the method also works when the
        column is already numeric (e.g. on a second run). Anything that cannot
        be parsed becomes NaN.
        """
        reviews_texto = (
            self.df_google_sin_duplicados['Reviews']
            .astype(str)
            .str.replace(',', '', regex=False)
        )
        self.df_google_sin_duplicados['Reviews'] = to_numeric(reviews_texto, errors='coerce')

    @staticmethod
    def convert_size(size):
        """Parse a size label into a float.

        ``'<n>M'`` is returned as ``n * 1000`` and ``'<n>k'`` as ``n``, so both
        end up in the same unit. ``'Varies with device'``, non-string input and
        any text that is not a valid number yield NaN.
        """
        if not isinstance(size, str):
            return nan

        texto = size.strip()
        if texto == 'Varies with device':
            return nan

        factor = 1.0
        if texto.endswith('M'):
            texto, factor = texto[:-1], 1_000.0
        elif texto.endswith('k'):
            texto = texto[:-1]

        # A single try covers every branch, so a malformed "M"/"k" value becomes
        # NaN instead of raising in the middle of a column-wide apply.
        try:
            return float(texto) * factor
        except ValueError:
            return nan

    def convertir_tamano(self):
        """Replace ``Size`` with its numeric form (see :meth:`convert_size`)."""
        tamanos = self.df_google_sin_duplicados['Size'].apply(self.convert_size)
        self.df_google_sin_duplicados['Size'] = tamanos.astype(float)

    @staticmethod
    def convert_installs(installs):
        """Parse an installs label such as ``'10,000+'`` into an int.

        Returns NaN for non-string input or when the text, once ``,`` and ``+``
        are removed, is not made only of digits.
        """
        if isinstance(installs, str):
            installs_clean = installs.replace(',', '').replace('+', '')
            if installs_clean.isdigit():
                return int(installs_clean)
        return nan

    def convertir_installs(self):
        """Replace ``Installs`` with its numeric form (see :meth:`convert_installs`)."""
        instalaciones = self.df_google_sin_duplicados['Installs'].apply(self.convert_installs)
        self.df_google_sin_duplicados['Installs'] = instalaciones.astype(float)

    def imputar_nans(self):
        """Fill missing values with fixed per-column defaults.

        ``Rating`` gets its median, ``Size`` its mean, and the three text
        columns get a constant label.
        """
        datos = self.df_google_sin_duplicados
        datos["Rating"] = datos["Rating"].fillna(datos["Rating"].median())
        datos["Size"] = datos["Size"].fillna(datos["Size"].mean())
        datos["Type"] = datos["Type"].fillna("Free")
        datos["Current Ver"] = datos["Current Ver"].fillna("Varies with device")
        datos["Android Ver"] = datos["Android Ver"].fillna("4.1 and up")

    @staticmethod
    def rango_intercuartilico(columna):
        """Return the ``(lower, upper)`` bounds ``Q1 - 1.5*IQR`` and ``Q3 + 1.5*IQR``."""
        Q1 = columna.quantile(0.25)
        Q3 = columna.quantile(0.75)
        IQR = Q3 - Q1
        valor_min = Q1 - (1.5 * IQR)
        valor_max = Q3 + (1.5 * IQR)
        return valor_min, valor_max

    def imputar_outliers(self, columns):
        """Clamp each listed column to its interquartile-range bounds.

        Values below the lower bound are set to it, values above the upper
        bound are set to it, and NaN values are left as NaN.
        """
        for col in columns:
            inf, sup = self.rango_intercuartilico(self.df_google_sin_duplicados[col])
            # Vectorized equivalent of the element-wise comparison.
            self.df_google_sin_duplicados[col] = self.df_google_sin_duplicados[col].clip(lower=inf, upper=sup)

    @staticmethod
    def _combinar_transformados(num_transformed, obj_transformed, num_cols, obj_cols):
        """Join the numeric and encoded blocks side by side with matching labels.

        The data is laid out as ``[numeric | categorical]``, so the column
        labels must follow that same order rather than the order of the
        original frame.
        """
        valores = concatenate((num_transformed, obj_transformed), axis=1)
        return DataFrame(valores, columns=list(num_cols) + list(obj_cols))

    def preparar_datos(self, test_size=0.33, random_state=42):
        """Split the data, then scale numeric and target-encode object columns.

        Args:
            test_size: Fraction of rows held out for testing.
            random_state: Seed for the shuffled split.

        Both transformers are fitted on the training split only and then
        applied to the test split. Only ``float64``/``int64`` and ``object``
        columns are used as features; a column of any other dtype is left out
        of the transformed matrices.
        """
        y = self.df_google_sin_duplicados[['Rating']]
        x = self.df_google_sin_duplicados.drop(columns="Rating")
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=test_size, random_state=random_state, shuffle=True
        )

        num_cols = list(x.select_dtypes(include=["float64", "int64"]).columns)
        obj_cols = list(x.select_dtypes(include="object").columns)

        self.object_scaler = TargetEncoder(cols=obj_cols)
        self.numerical_scaler = StandardScaler()

        x_train_num_transformed = self.numerical_scaler.fit_transform(x_train[num_cols])
        x_train_obj_transformed = self.object_scaler.fit_transform(x_train[obj_cols], y_train)

        x_test_num_transformed = self.numerical_scaler.transform(x_test[num_cols])
        x_test_obj_transformed = self.object_scaler.transform(x_test[obj_cols])

        self.df_x_train_transformed = self._combinar_transformados(
            x_train_num_transformed, x_train_obj_transformed, num_cols, obj_cols
        )
        self.df_x_test_transformed = self._combinar_transformados(
            x_test_num_transformed, x_test_obj_transformed, num_cols, obj_cols
        )
        self.y_train = y_train
        self.y_test = y_test

    def entrenar_modelo(self):
        """Fit a ``LinearRegression`` on the transformed training split."""
        self.model = LinearRegression().fit(self.df_x_train_transformed, self.y_train)

    def predecir(self):
        """Return the model's predictions for the transformed test split."""
        return self.model.predict(self.df_x_test_transformed)

    def guardar_modelo(self, modelo_path, scaler_path):
        """Pickle the model to ``modelo_path`` and both scalers to ``scaler_path``.

        The scalers are stored as one ``(numerical_scaler, object_scaler)`` tuple.
        """
        with open(modelo_path, 'wb') as modelo_file:
            pickle.dump(self.model, modelo_file)
        with open(scaler_path, 'wb') as scaler_file:
            pickle.dump((self.numerical_scaler, self.object_scaler), scaler_file)

    @staticmethod
    def cargar_modelo(modelo_path, scaler_path, df=None):
        """Build an instance from files written by :meth:`guardar_modelo`.

        Only ``model``, ``numerical_scaler`` and ``object_scaler`` are restored;
        the transformed train/test frames stay ``None``.

        SECURITY: ``pickle.load`` can execute arbitrary code. Only load files
        that come from a trusted source.
        """
        modelo_predictivo = ModeloPredictivo(df)
        with open(modelo_path, 'rb') as modelo_file:
            modelo_predictivo.model = pickle.load(modelo_file)
        with open(scaler_path, 'rb') as scaler_file:
            modelo_predictivo.numerical_scaler, modelo_predictivo.object_scaler = pickle.load(scaler_file)
        return modelo_predictivo

    def ejecutar(self):
        """Run every stage in order and return the test-split predictions."""
        self.eliminar_duplicados()
        self.convertir_reviews()
        self.convertir_tamano()
        self.convertir_installs()
        self.imputar_nans()
        self.imputar_outliers(['Reviews', 'Size', 'Installs'])
        self.preparar_datos()
        self.entrenar_modelo()
        return self.predecir()


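# Usage of the class

# df_google = read_csv("ruta_a_tu_csv.csv")  # Load your data
# modelo = ModeloPredictivo(df_google)
# predicciones = modelo.ejecutar()
# modelo.guardar_modelo('modelo_predictivo.pkl', 'escaladores.pkl')  # both paths are required

# To load the model and scalers and use them again
# modelo_cargado = ModeloPredictivo.cargar_modelo('modelo_predictivo.pkl', 'escaladores.pkl')
# NOTE: cargar_modelo restores only the model and the two scalers, so the
# transformed test matrix must be assigned before calling predecir().
# modelo_cargado.df_x_test_transformed = df_x_test_transformed
# predicciones = modelo_cargado.predecir()
# print(predicciones)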


In [None]:
# Save each prepared split as a pickle file and then download it through Colab.
# NOTE(review): the four variables listed below are not defined in the visible
# cells — they look like the attributes of a fitted ModeloPredictivo instance.
# Confirm they exist in the kernel before running this cell.
from google.colab import files

artefactos = [
    (df_x_train_transformed, 'df_x_train_transformed.pkl'),
    (df_x_test_transformed, 'df_x_test_transformed.pkl'),
    (y_train, 'y_train.pkl'),
    (y_test, 'y_test.pkl'),
]

for objeto, nombre_archivo in artefactos:
    objeto.to_pickle(nombre_archivo)  # write the pickle file
    files.download(nombre_archivo)    # download the pickle file

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>