In [None]:
import pandas as pd
# Read the three raw tables. Only the requirements file is decoded as latin-1;
# the other two use the read_csv default encoding.
steam_requirements_data = pd.read_csv(
    'steam_requirements_data.csv',
    encoding='latin-1',
)
steamspy_tag_data = pd.read_csv('steamspy_tag_data.csv')
steam_data = pd.read_csv('steam.csv')

# Preview: first rows of every table, keyed by the table's variable name.
datasets_overview = dict(
    steam_requirements_data=steam_requirements_data.head(),
    steamspy_tag_data=steamspy_tag_data.head(),
    steam_data=steam_data.head(),
)
datasets_overview

In [None]:
# Column listing per table, so the available fields can be compared side by side.
column_names = {
    table_label: table.columns.tolist()
    for table_label, table in (
        ("steam_data", steam_data),
        ("steamspy_tag_data", steamspy_tag_data),
        ("steam_requirements_data", steam_requirements_data),
    )
}
column_names

In [None]:
# First merge: attach the SteamSpy tag columns to the base table.
# Left joins keep every row of the left-hand table; clashing column names
# coming from the right-hand table get the given suffix.
# NOTE(review): assumes `appid` is unique in both right-hand tables - a repeated
# key on the right would add rows to the result; confirm.
merged_df = pd.merge(steam_data, steamspy_tag_data, how='left', on='appid', suffixes=('', '_tags'))

# Second merge: add requirements data
final_merged_df = pd.merge(merged_df, steam_requirements_data, how='left', on='appid', suffixes=('', '_req'))

# Per-column null counts; only columns that actually have gaps are reported
missing_values = final_merged_df.isnull().sum()
print("\nMissing values in merged dataset:")
print(missing_values.loc[missing_values > 0])

# Persist the merged table (row index omitted) for the modelling cells
final_merged_df.to_csv('merged_steam_data.csv', index=False)
print("\nMerged dataset has been saved as 'merged_steam_data.csv'")

# Row count of the final table
print(f"\nFinal merged dataset rows: {len(final_merged_df)}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from datetime import datetime

# Load the merged dataset.
# 'merged_steam_data.csv' is written by DataFrame.to_csv without an explicit
# encoding, i.e. as UTF-8. It therefore has to be read back as UTF-8 (the
# read_csv default): decoding it as latin-1 never fails, but silently turns
# every non-ASCII character in the text columns into mojibake.
df = pd.read_csv('merged_steam_data.csv')

# Data Preprocessing
def prepare_features(df):
    """Build the modelling feature frame from the merged Steam table.

    All derived columns are added to an internal copy, so the frame passed in
    by the caller is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``release_date``, ``price``,
        ``positive_ratings``, ``negative_ratings``, ``required_age``,
        ``achievements``, ``average_playtime``, ``median_playtime``,
        ``developer`` and ``publisher``.

    Returns
    -------
    pandas.DataFrame
        The numeric columns ``price``, ``release_year``, ``release_month``,
        ``required_age``, ``achievements``, ``average_playtime``,
        ``median_playtime``, ``rating_ratio``, ``total_ratings`` followed by
        the categorical columns ``developer``, ``publisher`` and
        ``price_category``, in that order.

    Raises
    ------
    ValueError
        Propagated from ``pd.qcut`` when the five price quantile edges are
        not unique (pandas' default ``duplicates='raise'``).
    """
    # Work on a copy: the caller's frame must not silently gain columns or
    # have its release_date column converted.
    df = df.copy()

    # Convert release_date to datetime features
    release_date = pd.to_datetime(df['release_date'])
    df['release_date'] = release_date
    df['release_year'] = release_date.dt.year
    df['release_month'] = release_date.dt.month

    # Create price categories (quintiles of the price column)
    df['price_category'] = pd.qcut(df['price'], q=5, labels=['very_low', 'low', 'medium', 'high', 'very_high'])

    # Calculate total ratings and the share of positive ones.
    # NOTE(review): both columns are functions of `positive_ratings`; if that
    # column is the prediction target they leak it and should be dropped
    # before fitting.
    df['total_ratings'] = df['positive_ratings'] + df['negative_ratings']
    df['rating_ratio'] = df['positive_ratings'] / df['total_ratings']
    # A game without any rating gives 0 / 0 = NaN above; define its ratio as
    # 0.0 so downstream estimators do not receive NaN. Only the "no ratings"
    # rows are touched - NaN caused by missing input values is left visible.
    df.loc[df['total_ratings'] == 0, 'rating_ratio'] = 0.0

    # Select features for prediction
    features = [
        'price', 'release_year', 'release_month', 'required_age',
        'achievements', 'average_playtime', 'median_playtime',
        'rating_ratio', 'total_ratings'
    ]

    # Add categorical features
    categorical_features = ['developer', 'publisher', 'price_category']

    return df[features + categorical_features]

# Prepare the data
# `total_ratings` is positive_ratings + negative_ratings and `rating_ratio` is
# positive_ratings / total_ratings, so rating_ratio * total_ratings reproduces
# the target exactly. Keeping them as predictors is target leakage and makes
# every score below meaningless, so they are removed from X.
TARGET_DERIVED_FEATURES = ['rating_ratio', 'total_ratings']
X = prepare_features(df).drop(columns=TARGET_DERIVED_FEATURES)
y = df['positive_ratings']

# Split the data (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create preprocessing pipelines
# Route columns by kind instead of by exact dtype. ColumnTransformer drops any
# column not listed in a transformer, and the previous exact-dtype filters
# missed:
#   * `price_category`, which pd.qcut returns as `category` dtype (not object);
#   * `release_year` / `release_month`, which may be int32 on recent pandas.
# X contains no datetime columns, so "everything non-numeric" is categorical.
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(exclude='number').columns

preprocessor = ColumnTransformer(
    transformers=[
        # Standardise numeric inputs
        ('num', StandardScaler(), numeric_features),
        # One-hot encode categoricals; categories unseen at fit time are
        # encoded as all zeros instead of raising at predict time
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [None]:
# 1. Random Forest Model
# Shared preprocessing followed by a seeded forest of 100 trees.
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42, n_estimators=100)),
])

# fit() returns the fitted pipeline, so training and test-set prediction chain
rf_pred = rf_pipeline.fit(X_train, y_train).predict(X_test)

# Held-out metrics: RMSE (root of the mean squared error) and R²
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_r2 = r2_score(y_test, rf_pred)

In [None]:
# 2. Multiple Linear Regression
# Shared preprocessing followed by an ordinary least-squares regressor.
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# fit() returns the fitted pipeline, so training and test-set prediction chain
lr_pred = lr_pipeline.fit(X_train, y_train).predict(X_test)

# Held-out metrics: RMSE (root of the mean squared error) and R²
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_pred))
lr_r2 = r2_score(y_test, lr_pred)

In [None]:
# 3. Neural Network (Using MLPRegressor for simplicity)
# Shared preprocessing followed by a seeded MLP with hidden layers of 100 and
# 50 units, capped at 500 training iterations.
nn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', MLPRegressor(random_state=42, max_iter=500, hidden_layer_sizes=(100, 50))),
])

# fit() returns the fitted pipeline, so training and test-set prediction chain
nn_pred = nn_pipeline.fit(X_train, y_train).predict(X_test)

# Held-out metrics: RMSE (root of the mean squared error) and R²
nn_rmse = np.sqrt(mean_squared_error(y_test, nn_pred))
nn_r2 = r2_score(y_test, nn_pred)

In [None]:
# Print results
# Assemble the whole report first and emit it with a single print; each model
# gets a 50-dash rule followed by its R² (4 decimals) and RMSE (2 decimals).
separator = "-" * 50
report_lines = [
    "\nModel Performance Comparison:",
    separator,
    f"Random Forest R² Score: {rf_r2:.4f}",
    f"Random Forest RMSE: {rf_rmse:.2f}",
    separator,
    f"Linear Regression R² Score: {lr_r2:.4f}",
    f"Linear Regression RMSE: {lr_rmse:.2f}",
    separator,
    f"Neural Network R² Score: {nn_r2:.4f}",
    f"Neural Network RMSE: {nn_rmse:.2f}",
]
print("\n".join(report_lines))

In [None]:
import matplotlib.pyplot as plt
import numpy as np
models = ['Random Forest', 'Linear Regression', 'Neural Network']

# R² scores and RMSE values, listed in the same order as `models`
r2_scores = [rf_r2, lr_r2, nn_r2]
rmse_values = [rf_rmse, lr_rmse, nn_rmse]

# One colour per model, reused on both panels so bars can be matched by eye
bar_colors = ['blue', 'green', 'red']

# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))  # 1 row, 2 columns

# Plot R² scores on the first subplot
ax1.bar(models, r2_scores, color=bar_colors)
ax1.set_title('Model Performance Comparison: R² Score')
ax1.set_ylabel('R² Score')
# R² is capped at 1 but is NOT bounded below by 0: a model that does worse
# than predicting the mean scores negative on held-out data. A fixed (0, 1)
# range would push such a bar outside the axes, so the lower limit follows the
# worst score (with 5% headroom) and stays at 0 when every score is >= 0.
r2_lower_limit = min(0.0, min(r2_scores) * 1.05)
ax1.set_ylim(r2_lower_limit, 1)

# Plot RMSE values on the second subplot
ax2.bar(models, rmse_values, color=bar_colors)
ax2.set_title('Model Performance Comparison: RMSE')
ax2.set_ylabel('RMSE')

# Adjust layout
plt.tight_layout()

# Display the chart
plt.show()
#finish