C:\Users\tbont\Self_Anaconda


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the dataset
data = pd.read_csv('Steam/steam.csv')

# Convert release_date to datetime and extract the year
data['release_date'] = pd.to_datetime(data['release_date'])
data['release_year'] = data['release_date'].dt.year


def encode_multilabel(column, prefix):
    """Expand a ';'-separated multi-label string column into 0/1 indicator columns.

    Parameters
    ----------
    column : pd.Series
        String column whose cells hold one or more labels joined by ';'.
    prefix : str
        Text prepended (with an underscore) to every generated column name.

    Returns
    -------
    pd.DataFrame
        One indicator column per distinct label, named ``<prefix>_<label>``
        and indexed like ``column``.
    """
    # str.get_dummies already yields a 0/1 indicator matrix. Passing that
    # through OneHotEncoder as well would turn every indicator into two
    # complementary columns and discard the label names.
    return column.str.get_dummies(sep=';').add_prefix(f'{prefix}_')


# Multi-hot encoding for platforms, categories, genres, and steamspy_tags
platforms_enc = encode_multilabel(data['platforms'], 'platform')
categories_enc = encode_multilabel(data['categories'], 'category')
genres_enc = encode_multilabel(data['genres'], 'genre')
steamspy_tags_enc = encode_multilabel(data['steamspy_tags'], 'steamspy_tag')

# Combine the indicator columns with the original dataset. Every encoded frame
# shares data's index, so the concat aligns row-for-row; the index is reset
# afterwards so encoded_data keeps a clean 0..n-1 index.
encoded_data = pd.concat(
    [data, platforms_enc, categories_enc, genres_enc, steamspy_tags_enc],
    axis=1,
).reset_index(drop=True)

# Drop the original columns that have been encoded
encoded_data = encoded_data.drop(columns=['platforms', 'categories', 'genres', 'steamspy_tags'])

# Calculate the midpoint of the owners' range ("low-high" -> (low + high) / 2)
owners_bounds = encoded_data['owners'].str.split('-', expand=True)[[0, 1]].astype(int)
encoded_data['owners_midpoint'] = owners_bounds.mean(axis=1)

# Drop the original owners column
encoded_data = encoded_data.drop(columns=['owners'])

# Split the dataset into training and testing sets BEFORE scaling so that the
# scaler's mean/variance are learned from training rows only (no test leakage).
num_features = ['achievements', 'positive_ratings', 'negative_ratings', 'average_playtime', 'median_playtime', 'price']
train_data, test_data = train_test_split(encoded_data, test_size=0.3, random_state=42)

# Standardize numerical features using statistics from the training rows
scaler = StandardScaler().fit(train_data[num_features])
encoded_data[num_features] = scaler.transform(encoded_data[num_features])

# Re-derive the splits from the scaled frame so all three frames stay consistent
train_data = encoded_data.loc[train_data.index]
test_data = encoded_data.loc[test_data.index]

In [6]:
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

# Candidate feature matrix / target over the full dataset. Identifier, free-text
# and raw-date columns are excluded, as is the target itself.
non_feature_cols = ['appid', 'name', 'release_date', 'developer', 'publisher', 'owners_midpoint']
X = encoded_data.drop(columns=non_feature_cols)
y = encoded_data['owners_midpoint']

# Targets for the train / test splits
y_train = train_data['owners_midpoint']
y_test = test_data['owners_midpoint']

# Select important features using RFE.
# The selector is fitted on the TRAINING rows only: fitting it on the full
# dataset would let test-set targets influence which features are kept and
# make the test scores below optimistic.
# step=0.1 removes roughly 10% of the original feature count per iteration
# instead of a single feature, so the forest is refitted about ten times
# rather than once per candidate column; n_jobs=-1 uses all CPU cores.
estimator = RandomForestRegressor(random_state=42, n_jobs=-1)
selector = RFE(estimator, n_features_to_select=10, step=0.1)
selector = selector.fit(train_data[X.columns], y_train)

# Get the selected features
selected_features = X.columns[selector.support_]

# Restrict both splits to the selected features
X_train = train_data[selected_features]
X_test = test_data[selected_features]

# Define the models
rf = RandomForestRegressor(random_state=42)
svm = SVR()
xgb = XGBRegressor(random_state=42)
vr = VotingRegressor(estimators=[('rf', rf), ('svm', svm), ('xgb', xgb)])

models = [rf, svm, xgb, vr]
model_names = ['Random Forest', 'SVM', 'XGBoost', 'Voting Regressor']

# Evaluate models using cross-validation (training split only) and print results
for model, name in zip(models, model_names):
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    print(f'{name} R2 score (cross-validation): {np.mean(cv_scores):.2f}')

# Fit the models on the training set and evaluate on the held-out test set
for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'{name} - Test set: MSE = {mse:.2f}, R2 score = {r2:.2f}')




KeyboardInterrupt: 