In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d

## Preparation of the variables

In [25]:
# Load the observations
df = pd.read_csv('formated_data_new.csv')

# Columns used by the linear regressions
prices = df['Current price in €']
nb_reviews = df['Number of reviews']
positive_rate = df['Positive rate in reviews in %']
steamDB_score = df['SteamDB score in %']
conf_class = df['Min configuration class']
dates = df['Release date']
genres = df['Genres']

# The dependent variable is the log of the SteamDB score, which eases the
# interpretation of the coefficients in the linear regressions
l_steamDB_score = np.log(steamDB_score)

# Intercept column: one entry per observation
n = len(prices)
const = np.ones(n)

# Binary variables -- one flag per minimum-configuration class (0 to 3)
class0, class1, class2, class3 = (conf_class == level for level in range(4))

# Binary variables -- one flag per genre (substring match on the genres text)
action, adventure, rpg, indie, strategy, simulation, casual, sports = (
    genres.str.contains(tag)
    for tag in ("Action", "Adventure", "RPG", "Indie",
                "Strategy", "Simulation", "Casual", "Sports")
)

# Binary variables -- one flag per release year (substring match on the date text)
year_2016, year_2017, year_2018, year_2019, year_2020 = (
    dates.str.contains(year)
    for year in ("2016", "2017", "2018", "2019", "2020")
)

# Labels for the regressors of the binary-variable regression, in column order
variables_names = [
    'const', 'prices',
    'class1', 'class2', 'class3',
    'adventure', 'rpg', 'indie', 'strategy', 'simulation', 'casual', 'sports',
    'year_2017', 'year_2018', 'year_2019', 'year_2020',
]

## Linear regression with all observations

In [14]:
# First regression: log SteamDB score on an intercept, the price, the number
# of reviews, the positive-review rate and the raw configuration class.
X = np.column_stack((const, prices, nb_reviews, positive_rate, conf_class))
results = sm.OLS(l_steamDB_score, X).fit()
# X is a plain array without column labels, so the regressor names are passed
# explicitly (same order as the columns of X); otherwise the summary falls
# back to generic placeholder labels.
print(results.summary(xname=['const', 'prices', 'nb_reviews',
                             'positive_rate', 'conf_class']))

                            OLS Regression Results                            
Dep. Variable:     SteamDB score in %   R-squared:                       0.944
Model:                            OLS   Adj. R-squared:                  0.944
Method:                 Least Squares   F-statistic:                 7.175e+04
Date:                Mon, 18 Jan 2021   Prob (F-statistic):               0.00
Time:                        16:40:32   Log-Likelihood:                 25189.
No. Observations:               17130   AIC:                        -5.037e+04
Df Residuals:                   17125   BIC:                        -5.033e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          3.3348      0.002   2068.445      0.0

The SteamDB score depends on both the number of reviews and their positivity rate. Therefore, nb_reviews and positive_rate are correlated with the variable steamDB_score. We will therefore not use them in the rest of our study.

## Linear regression with binary variables

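We use 17 binary variables:
- 4 variables for the configuration class: class0, class1, class2 and class3
- 8 variables for the genres: action, adventure, rpg, indie, strategy, simulation, casual and sports
- 5 variables for the release year: year_2016, year_2017, year_2018, year_2019 and year_2020.

Of these, class0, action and year_2016 are left out of the regression below, which already contains a constant term.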

In [28]:
# Regressors grouped by category; the resulting column order must match
# variables_names, which labels the rows of the summary table.
class_dummies = (class1, class2, class3)
genre_dummies = (adventure, rpg, indie, strategy, simulation, casual, sports)
year_dummies = (year_2017, year_2018, year_2019, year_2020)

X_bin = np.column_stack((const, prices, *class_dummies, *genre_dummies, *year_dummies))

# Ordinary least squares of the log SteamDB score on the design matrix above
model_bin = sm.OLS(l_steamDB_score, X_bin)
results_bin = model_bin.fit()
print(results_bin.summary(xname=variables_names))

                            OLS Regression Results                            
Dep. Variable:     SteamDB score in %   R-squared:                       0.034
Model:                            OLS   Adj. R-squared:                  0.033
Method:                 Least Squares   F-statistic:                     40.03
Date:                Mon, 18 Jan 2021   Prob (F-statistic):          4.05e-116
Time:                        17:05:21   Log-Likelihood:                 842.91
No. Observations:               17130   AIC:                            -1654.
Df Residuals:                   17114   BIC:                            -1530.
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.1071      0.007    559.235      0.0