In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

import plotly.express as px


In [None]:
# Load the dataset from the project's clean-data folder (path is relative to
# the notebook's working directory).
df = pd.read_csv(
    '../data/clean/video_game_sales_final_cleaned.csv',
)


In [None]:
# Quick structural overview of the loaded frame.
# A notebook cell only renders its *last* bare expression, so the intermediate
# summaries are passed to display() explicitly; otherwise describe() and head()
# would be computed and silently discarded.
df.info()               # prints column dtypes / non-null counts to stdout
display(df.describe())  # summary statistics (display() is provided by the IPython kernel)
display(df.head())      # first five rows
df.isnull().sum()       # missing-value count per column (rendered as the cell output)


In [None]:
# Keep only the int64 / float64 columns so the correlation is computed on
# numeric data exclusively.
numeric_df = df.select_dtypes(include=['int64', 'float64'])

# Pairwise correlation between all retained columns.
correlation_matrix = numeric_df.corr()

# Correlation of every numeric column with 'global_sales', strongest positive
# first, kept as a one-column DataFrame so it can be styled.
cor_target = (
    correlation_matrix['global_sales']
    .sort_values(ascending=False)
    .to_frame()
)

# Render as a colour-graded table with two decimals (last expression => cell output).
(
    cor_target.style
    .background_gradient(cmap='YlGnBu')
    .format("{:.2f}")
)


In [None]:
# Regression target.
y = df['global_sales']

# Predictor columns used by the models below.
X = df[['platform', 'genre', 'publisher', 'year_of_release']]


In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Column groups routed through the preprocessing step.
categorical_features = ['platform', 'genre', 'publisher']
numerical_features = ['year_of_release']

# Categorical columns are one-hot encoded (handle_unknown='ignore' so categories
# not seen during fit do not raise at transform time); the numeric column is
# forwarded unchanged.
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', 'passthrough', numerical_features),
])


In [None]:
# Baseline model: preprocessing followed by an ordinary linear regression.
model = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', LinearRegression())])

# Fit on the training split.
model.fit(X_train, y_train)

# Predict on the held-out split and score the predictions.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # RMSE is the square root of the MSE

print(f"✅ RMSE: {rmse:.2f}")
print(f"✅ R² Score: {r2:.2f}")


In [None]:
# RandomForestRegressor is already imported in the notebook's import cell, so
# it is not re-imported here (keeps all dependencies declared in one place).

# 1. Build a second pipeline that swaps the regressor for a random forest.
# NOTE(review): `preprocessor` is the same object used by the linear `model`
# pipeline; Pipeline appears to fit its steps in place rather than cloning
# them, so fitting here refits that shared transformer. Harmless while both
# pipelines are fitted on the same X_train -- confirm if that ever changes.
rf_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# 2. Fit on the training split.
rf_model.fit(X_train, y_train)

# 3. Predict on the held-out split and score the predictions.
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = mse_rf ** 0.5  # RMSE is the square root of the MSE
r2_rf = r2_score(y_test, y_pred_rf)

# 4. Report the metrics.
print(f"🌲 Random Forest RMSE: {rmse_rf:.2f}")
print(f"🌲 Random Forest R² Score: {r2_rf:.2f}")
