In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import os

In [2]:
pd.options.display.float_format = '{:20,.2f}'.format

In [3]:
steam_data2 = pd.read_csv('../data/steam_clean2.csv', parse_dates=['release_date'], index_col=0)

In [4]:
steam_data2.shape

(26356, 43)

In [5]:
steam_data2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26356 entries, 0 to 27074
Data columns (total 43 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   appid             26356 non-null  int64         
 1   name              26356 non-null  object        
 2   release_date      26356 non-null  datetime64[ns]
 3   yrs_released      26356 non-null  int64         
 4   winter            26356 non-null  int64         
 5   spring            26356 non-null  int64         
 6   summer            26356 non-null  int64         
 7   fall              26356 non-null  int64         
 8   achievements      26356 non-null  int64         
 9   average_playtime  26356 non-null  int64         
 10  median_playtime   26356 non-null  int64         
 11  price             26356 non-null  float64       
 12  avg_owners        26356 non-null  float64       
 13  revenue           26356 non-null  float64       
 14  total_ratings     2635

# 1. Scale Data

In [9]:
# Define X and y
X = steam_data2.drop(columns=['revenue', 'appid', 'name', 'release_date'])
y = steam_data2.revenue

In [7]:
X.shape, y.shape

((26356, 40), (26356,))

In [11]:
# Initialize scaler
scaler = StandardScaler()

# Fit and transform data
X_scaled = scaler.fit_transform(X)

In [12]:
X_scaled

array([[ 7.68482511, -0.56256868, -0.59534657, ..., -0.28109458,
        -0.2809374 ,  3.74630451],
       [ 8.15076602, -0.56256868,  1.6796939 , ..., -0.28109458,
        -0.2809374 ,  3.74630451],
       [ 6.28700239, -0.56256868,  1.6796939 , ..., -0.28109458,
        -0.2809374 , -0.26692972],
       ...,
       [-1.16805212, -0.56256868,  1.6796939 , ..., -0.28109458,
        -0.2809374 , -0.26692972],
       [-1.16805212, -0.56256868,  1.6796939 , ..., -0.28109458,
        -0.2809374 , -0.26692972],
       [-1.16805212, -0.56256868,  1.6796939 , ..., -0.28109458,
        -0.2809374 , -0.26692972]])

# 2. Split into Train/Test

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3)

In [17]:
X_train.shape, X_test.shape

((18449, 39), (7907, 39))

In [18]:
X_train

array([[-0.23617031, -0.56256868, -0.59534657, ..., -0.28109458,
         3.55951183, -0.26692972],
       [ 0.69571151, -0.56256868,  1.6796939 , ..., -0.28109458,
        -0.2809374 , -0.26692972],
       [ 2.55947514, -0.56256868,  1.6796939 , ..., -0.28109458,
        -0.2809374 , -0.26692972],
       ...,
       [-0.23617031,  1.77756075, -0.59534657, ..., -0.28109458,
        -0.2809374 , -0.26692972],
       [-0.70211121, -0.56256868, -0.59534657, ..., -0.28109458,
        -0.2809374 , -0.26692972],
       [-0.70211121, -0.56256868,  1.6796939 , ..., -0.28109458,
        -0.2809374 , -0.26692972]])