In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# 1. Load the dataset
# The CSV is expected to sit next to the notebook (relative path).
DATA_PATH = "Steam_2024_bestRevenue_1500.csv"
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    # Re-raise instead of calling exit(): inside a Jupyter kernel exit() asks
    # the kernel to shut down rather than reliably halting the cell, so the
    # code below could still run and fail on an undefined `df`.
    raise FileNotFoundError(
        f"Arquivo não encontrado. Certifique-se de que '{DATA_PATH}' está no mesmo diretório."
    ) from err

# 2. Cleaning and preprocessing

# Parse 'releaseDate' as datetime (unparseable values become NaT) and keep only the year.
df['releaseDate'] = pd.to_datetime(df['releaseDate'], errors='coerce')
df['Release Year'] = df['releaseDate'].dt.year

# Cast publisherClass to string, but keep missing entries as NaN.
# A plain astype(str) would turn NaN into the literal text 'nan', which the
# downstream most-frequent imputer would not recognise as missing.
publisher_missing = df['publisherClass'].isna()
df['publisherClass'] = df['publisherClass'].astype(str).mask(publisher_missing, np.nan)

# Keep only the columns used downstream and standardise the target column name.
# 'name' is carried along for reference; it is not listed among the model features.
# Chained rename (no inplace=True) so a fresh frame is produced from the selection.
df = (
    df[['name', 'revenue', 'price', 'copiesSold', 'avgPlaytime',
        'reviewScore', 'publisherClass', 'Release Year']]
    .rename(columns={'revenue': 'Revenue'})
)


# 3. Missing-value handling: median for numeric columns, most frequent value for categorical ones

# NOTE(review): 'price' and 'copiesSold' may together almost determine the
# revenue target (possible target leakage) — confirm these belong in the features.
numerical_features = [
    'price',
    'copiesSold',
    'avgPlaytime',
    'reviewScore',
    'Release Year',
]
categorical_features = ['publisherClass']

# Numeric branch: fill gaps with the median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the mode, then one-hot encode.
# handle_unknown='ignore' lets categories unseen during fit pass through transform.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)


# 4. Train/test split (done before any preprocessing is fitted)
y = df['Revenue']
X = df.drop(columns=['Revenue'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,     # hold out 20% of the rows for evaluation
    random_state=42,   # fixed seed so the split is repeatable
)


# 5. Column-wise preprocessing (defined after the train/test split)
# Columns not listed in either feature group are not routed to any transformer.
column_branches = [
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

# 6. Model pipeline: preprocessing followed by a random-forest regressor
regressor = RandomForestRegressor(random_state=42)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', regressor),
])

# 7. Fit the full pipeline (preprocessing + regressor) on the training split
model.fit(X_train, y_train)

# 8. Evaluate on the held-out test split
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # root of the MSE, so the error is in the target's units
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}\nR-squared: {r2}")

RMSE: 1177379.5072550725
R-squared: 0.9973243302364824


RMSE: 1177379.5072550725
R-squared: 0.9973243302364824

In [2]:
# Predict on the training split so predictions can be compared with the actual values.
y_train_pred = model.predict(X_train)

print("Dados de Treinamento (Amostra):")
train_comparison = pd.DataFrame({'Real': y_train, 'Previsto': y_train_pred})
# Show 10 random rows; seeded so re-running the cell prints the same rows.
print(train_comparison.sample(10, random_state=42))

# The header is printed on its own line so the test sample is labelled in the output.
print("\nDados de Teste (Amostra):")
test_comparison = pd.DataFrame({'Real': y_test, 'Previsto': y_pred})
print(test_comparison.sample(10, random_state=42))

Dados de Treinamento (Amostra):
           Real      Previsto
1275   623473.0  6.356204e+05
357     92662.0  8.830191e+04
498     65249.0  6.287006e+04
810     38560.0  3.856990e+04
569   2547970.0  2.506361e+06
445     72453.0  9.862705e+04
1072    21703.0  2.288240e+04
749     42896.0  6.278959e+05
1396   328150.0  3.356028e+05
1151  1269633.0  1.555629e+06
           Real      Previsto
596   1913709.0  2.314221e+06
618     57371.0  5.388385e+04
231    134579.0  1.160818e+05
1085    21168.0  2.382729e+04
430     74140.0  7.310437e+04
941     28483.0  2.536238e+04
1084    21260.0  2.361618e+04
1231   787339.0  7.291660e+05
1448   264829.0  2.360493e+05
1312   478599.0  4.012174e+05


In [3]:
# Actual vs. predicted values for the held-out test split.
print("\nDados de Teste (Amostra):")
test_comparison = pd.DataFrame({'Real': y_test, 'Previsto': y_pred})
# Seeded sample so the displayed rows are the same on every run.
print(test_comparison.sample(10, random_state=42))


Dados de Teste (Amostra):
            Real      Previsto
1065     22014.0  2.691105e+04
930      29094.0  2.878304e+04
1419    301311.0  2.745002e+05
1425    290153.0  4.548544e+05
432      73926.0  5.996273e+04
481      67473.0  5.848962e+04
1466  34530561.0  4.849152e+07
81      230628.0  2.336242e+05
1050     22581.0  2.301338e+04
1284    597802.0  5.431338e+05
