In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures

import os

pd.set_option('display.max_columns', 500)

1. ¿cuál es el PROBLEMA que queremos resolver? (de aquí saldrán las hipótesis y objetivos, entre más detalle mejor)

Se puede determinar cuanto tiempo va a jugar una persona con los valores de los datos?
Objetivo.- Visualizar los factores mas importantes para que un juego sea jugado por mucho tiempo

2. ¿cuál sería el RESULTADO ESPERADO del análisis? (aquí hay que pensar en ingeniería inversa; del resultado esperado, definiendo cada módulo que me hará llegar a ese resultado, hasta el planteamiento del problema, que es donde inicia el proyecto)

Esperamos observar varios juegos online o sin historia como aquellos con mas tiempo jugado. Ademas, estos juegos pertenecen a una empresa grande. 

3. ¿CÓMO pienso resolverlo? (QUÉ voy a ocupar y CÓMO pienso usarlo)

Mucha limpieza de datos
Regresion 

## Limpieza e ingenieria de datos

In [3]:
df_steam = pd.read_csv('steam-store-games/steam.csv')
# Dado que queremos predecir el tiempo de juego, solo tomaremos aquellos que tengo algun registro
df_steam = df_steam[df_steam['average_playtime'] > 0]
# Facilidad para manejar la columna posteriormente
df_steam.rename(columns={'steamspy_tags': 'tags'}, inplace=True)
# Columnas inutiles
df_steam.drop(['english', 'required_age'], axis=1, inplace=True)

In [4]:
dates = df_steam['release_date'].map(lambda x:x.split('-'))
df_steam['year'] = dates.map(lambda x:x[0])
df_steam['month'] = dates.map(lambda x:x[1])
df_steam.drop('release_date', axis=1, inplace=True)

In [5]:
# Obtener el porcentaje del publico que gusto del juego y cuantos votaron
df_steam['rating'] = df_steam['positive_ratings'] / (df_steam['positive_ratings'] + df_steam['negative_ratings']) * 100
df_steam['rat_count'] = df_steam['positive_ratings'] + df_steam['negative_ratings']
df_steam.drop(['positive_ratings', 'negative_ratings'], axis=1, inplace=True)

In [6]:
# Pasar a cubetas la cantidad de dueños del juego
owners = ['0-20000', '20000-50000', '50000-100000', '100000-200000', '200000-500000', '500000-1000000', '1000000-2000000', '2000000-5000000', '5000000-10000000', '10000000-20000000', '20000000-50000000', '50000000-100000000', '100000000-200000000']
bucket_own = dict([(a, 1) for a in owners[:5]] + [(a, 2) for a in owners[5:9]] + [(a, 3) for a in owners[9:]])
df_steam.owners = df_steam.owners.replace(bucket_own)

In [7]:
# Todos los juegos se puede jugar en windows, por lo que se descarta
# Nos interesa saber si se pueden jugar en linux o mac
df_steam['linux'] = df_steam.platforms.map(lambda x:x.find('linux') >= 0)
df_steam['mac'] = df_steam.platforms.map(lambda x:x.find('mac') >= 0)
df_steam.drop('platforms', axis=1, inplace=True)

In [8]:
# Esta funcion permite obtener todas las palabras utlizadas para clasificar los juegos
# Si esta tiene un minimo de apariciones, se considerara como una palabra destacada
def extract_relevant_words(feature, min_samples=5):
    all_words = []
    for word in df_steam[feature].map(lambda x:x.split(';')):
        all_words += word
    all_words = pd.Series(all_words).value_counts()
    all_words = all_words[all_words > min_samples]
    return all_words

In [9]:
# One hot encoding de las palabras que clasifican el juego
# Agrega todas las columnas posibles y resalta las categorias en las que entra
def add_nominal_features(df, feature, words):
    new_words = df_steam[feature].map(lambda x:[w for w in x.split(';') if w in words])
    for w in words:
        df[feature[:3] + '_' + w] = 0
    for idx, words in enumerate(new_words):
        for w in words:
            df[feature[:3] + '_' + w].iloc[idx] = 1
    df.drop(feature, axis=1, inplace=True)
    return df

In [10]:
# Se obtienen las palabras relevantes de las categorias
all_dev = extract_relevant_words('developer', 10)
all_pub = extract_relevant_words('publisher', 20)
all_cat = extract_relevant_words('categories', 100)
all_gen = extract_relevant_words('genres')
all_tag = extract_relevant_words('tags', 50)
len(all_dev), len(all_pub), len(all_cat), len(all_gen), len(all_tag)

(33, 38, 21, 20, 54)

In [11]:
# Se agregan las columnas de las categorias como one hot encoding
print(df_steam.shape)
df_steam = add_nominal_features(df_steam, 'developer', all_dev.index)
print(df_steam.shape)
df_steam = add_nominal_features(df_steam, 'publisher', all_pub.index)
print(df_steam.shape)
df_steam = add_nominal_features(df_steam, 'categories', all_cat.index)
print(df_steam.shape)
df_steam = add_nominal_features(df_steam, 'genres', all_gen.index)
print(df_steam.shape)
df_steam = add_nominal_features(df_steam, 'tags', all_tag.index)
print(df_steam.shape)

(6170, 18)
(6170, 50)
(6170, 87)
(6170, 107)
(6170, 126)
(6170, 179)


In [12]:
df_steam.head()

Unnamed: 0,appid,name,achievements,average_playtime,median_playtime,owners,price,year,month,rating,rat_count,linux,mac,dev_Feral Interactive (Mac),dev_Valve,dev_EnsenaSoft,dev_Feral Interactive (Linux),dev_Square Enix,dev_Ubisoft Montreal,dev_Daedalic Entertainment,"dev_KOEI TECMO GAMES CO., LTD.",dev_Winged Cloud,dev_Aspyr (Mac),dev_id Software,dev_LucasArts,dev_Team17 Digital Ltd,dev_Just1337 Studio,dev_Double Fine Productions,dev_Capcom,dev_Bohemia Interactive,"dev_CAPCOM Co., Ltd.",dev_Firaxis Games,dev_Quiet River,dev_Forever Entertainment S. A.,dev_Creobit,dev_Arc System Works,dev_Relic Entertainment,dev_Gearbox Software,dev_Traveller's Tales,dev_Rebellion,dev_Crystal Dynamics,dev_Paradox Development Studio,dev_Jetdogs Studios,dev_Amaterasu Software,dev_CREATIVE ASSEMBLY,dev_Nival,pub_Ubisoft,pub_Square Enix,pub_SEGA,pub_Devolver Digital,pub_KISS ltd,pub_THQ Nordic,pub_BANDAI NAMCO Entertainment,pub_Paradox Interactive,pub_Back To Basics Gaming,pub_Focus Home Interactive,pub_Feral Interactive (Mac),pub_Sometimes You,pub_Sekai Project,pub_2K,pub_Dagestan Technology,pub_Daedalic Entertainment,pub_Disney Interactive,pub_Bethesda Softworks,pub_New Reality Games,pub_Kalypso Media Digital,pub_Team17 Digital Ltd,pub_Valve,pub_Electronic Arts,pub_Forever Entertainment S. A.,pub_Activision,pub_LucasArts,pub_Deep Silver,pub_Lucasfilm,pub_AGM PLAYISM,pub_Atriagames,pub_Capcom,pub_EnsenaSoft,pub_Warner Bros. Interactive Entertainment,pub_Degica,pub_1C Entertainment,pub_tinyBuild,pub_Codemasters,pub_Feral Interactive (Linux),cat_Single-player,cat_Steam Achievements,cat_Steam Trading Cards,cat_Steam Cloud,cat_Full controller support,cat_Multi-player,cat_Partial Controller Support,cat_Steam Leaderboards,cat_Online Multi-Player,cat_Co-op,cat_Stats,cat_Shared/Split Screen,cat_Steam Workshop,cat_Cross-Platform Multiplayer,cat_Includes level editor,cat_Online Co-op,cat_In-App Purchases,cat_Local Multi-Player,cat_Captions available,cat_MMO,cat_Local Co-op,gen_Indie,gen_Action,gen_Adventure,gen_Casual,gen_Strategy,gen_RPG,gen_Simulation,gen_Free to Play,gen_Early Access,gen_Massively Multiplayer,gen_Sports,gen_Racing,gen_Violent,gen_Gore,gen_Nudity,gen_Sexual Content,gen_Utilities,gen_Design & Illustration,gen_Animation & Modeling,gen_Web Publishing,tag_Indie,tag_Action,tag_Adventure,tag_Casual,tag_Strategy,tag_RPG,tag_Free to Play,tag_Simulation,tag_Early Access,tag_Multiplayer,tag_Puzzle,tag_Anime,tag_FPS,tag_Platformer,tag_Open World,tag_Horror,tag_Visual Novel,tag_Racing,tag_Point & Click,tag_Survival,tag_Massively Multiplayer,tag_Sports,tag_Nudity,tag_Pixel Graphics,tag_Sexual Content,tag_Space,tag_RTS,tag_Zombies,tag_Story Rich,tag_Sci-fi,tag_Female Protagonist,tag_Co-op,tag_Classic,tag_VR,tag_Shoot 'Em Up,tag_Gore,tag_Tower Defense,tag_Turn-Based,tag_Great Soundtrack,tag_Violent,tag_Arcade,tag_Fantasy,tag_Fighting,tag_Stealth,tag_Card Game,tag_Management,tag_Singleplayer,tag_World War II,tag_Turn-Based Strategy,tag_Hack and Slash,tag_JRPG,tag_Rogue-like,tag_RPGMaker,tag_Sandbox
0,10,Counter-Strike,0,17612,317,3,7.19,2000,11,97.388815,127873,True,True,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,20,Team Fortress Classic,0,277,62,2,3.99,1999,4,83.97874,3951,True,True,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,30,Day of Defeat,0,187,34,2,3.99,2003,5,89.564761,3814,True,True,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,40,Deathmatch Classic,0,258,184,2,3.99,2001,6,82.662338,1540,True,True,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,50,Half-Life: Opposing Force,0,624,415,2,3.99,1999,11,94.799567,5538,True,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [13]:
df_steam.describe(percentiles=[.05, .25, .5, .75, .95])

Unnamed: 0,appid,achievements,average_playtime,median_playtime,owners,price,rating,rat_count,dev_Feral Interactive (Mac),dev_Valve,dev_EnsenaSoft,dev_Feral Interactive (Linux),dev_Square Enix,dev_Ubisoft Montreal,dev_Daedalic Entertainment,"dev_KOEI TECMO GAMES CO., LTD.",dev_Winged Cloud,dev_Aspyr (Mac),dev_id Software,dev_LucasArts,dev_Team17 Digital Ltd,dev_Just1337 Studio,dev_Double Fine Productions,dev_Capcom,dev_Bohemia Interactive,"dev_CAPCOM Co., Ltd.",dev_Firaxis Games,dev_Quiet River,dev_Forever Entertainment S. A.,dev_Creobit,dev_Arc System Works,dev_Relic Entertainment,dev_Gearbox Software,dev_Traveller's Tales,dev_Rebellion,dev_Crystal Dynamics,dev_Paradox Development Studio,dev_Jetdogs Studios,dev_Amaterasu Software,dev_CREATIVE ASSEMBLY,dev_Nival,pub_Ubisoft,pub_Square Enix,pub_SEGA,pub_Devolver Digital,pub_KISS ltd,pub_THQ Nordic,pub_BANDAI NAMCO Entertainment,pub_Paradox Interactive,pub_Back To Basics Gaming,pub_Focus Home Interactive,pub_Feral Interactive (Mac),pub_Sometimes You,pub_Sekai Project,pub_2K,pub_Dagestan Technology,pub_Daedalic Entertainment,pub_Disney Interactive,pub_Bethesda Softworks,pub_New Reality Games,pub_Kalypso Media Digital,pub_Team17 Digital Ltd,pub_Valve,pub_Electronic Arts,pub_Forever Entertainment S. A.,pub_Activision,pub_LucasArts,pub_Deep Silver,pub_Lucasfilm,pub_AGM PLAYISM,pub_Atriagames,pub_Capcom,pub_EnsenaSoft,pub_Warner Bros. Interactive Entertainment,pub_Degica,pub_1C Entertainment,pub_tinyBuild,pub_Codemasters,pub_Feral Interactive (Linux),cat_Single-player,cat_Steam Achievements,cat_Steam Trading Cards,cat_Steam Cloud,cat_Full controller support,cat_Multi-player,cat_Partial Controller Support,cat_Steam Leaderboards,cat_Online Multi-Player,cat_Co-op,cat_Stats,cat_Shared/Split Screen,cat_Steam Workshop,cat_Cross-Platform Multiplayer,cat_Includes level editor,cat_Online Co-op,cat_In-App Purchases,cat_Local Multi-Player,cat_Captions available,cat_MMO,cat_Local Co-op,gen_Indie,gen_Action,gen_Adventure,gen_Casual,gen_Strategy,gen_RPG,gen_Simulation,gen_Free to Play,gen_Early Access,gen_Massively Multiplayer,gen_Sports,gen_Racing,gen_Violent,gen_Gore,gen_Nudity,gen_Sexual Content,gen_Utilities,gen_Design & Illustration,gen_Animation & Modeling,gen_Web Publishing,tag_Indie,tag_Action,tag_Adventure,tag_Casual,tag_Strategy,tag_RPG,tag_Free to Play,tag_Simulation,tag_Early Access,tag_Multiplayer,tag_Puzzle,tag_Anime,tag_FPS,tag_Platformer,tag_Open World,tag_Horror,tag_Visual Novel,tag_Racing,tag_Point & Click,tag_Survival,tag_Massively Multiplayer,tag_Sports,tag_Nudity,tag_Pixel Graphics,tag_Sexual Content,tag_Space,tag_RTS,tag_Zombies,tag_Story Rich,tag_Sci-fi,tag_Female Protagonist,tag_Co-op,tag_Classic,tag_VR,tag_Shoot 'Em Up,tag_Gore,tag_Tower Defense,tag_Turn-Based,tag_Great Soundtrack,tag_Violent,tag_Arcade,tag_Fantasy,tag_Fighting,tag_Stealth,tag_Card Game,tag_Management,tag_Singleplayer,tag_World War II,tag_Turn-Based Strategy,tag_Hack and Slash,tag_JRPG,tag_Rogue-like,tag_RPGMaker,tag_Sandbox
count,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0,6170.0
mean,400043.0,46.662723,657.369368,640.918476,1.176013,7.47211,73.824814,5039.625,0.006807,0.004376,0.004214,0.003404,0.003404,0.003079,0.003079,0.002917,0.002755,0.002593,0.002593,0.002593,0.002593,0.002431,0.002269,0.002107,0.002107,0.002107,0.002107,0.001945,0.001945,0.001945,0.001945,0.001945,0.001783,0.001783,0.001783,0.001783,0.001783,0.001783,0.001783,0.001783,0.001783,0.014587,0.012966,0.011507,0.009724,0.009562,0.007942,0.007293,0.007131,0.006969,0.006807,0.006807,0.006483,0.006321,0.006321,0.005997,0.005835,0.005511,0.005511,0.005348,0.005186,0.005024,0.004862,0.0047,0.0047,0.004538,0.004538,0.004376,0.004376,0.004376,0.004376,0.004214,0.004214,0.003728,0.003728,0.003728,0.003566,0.003566,0.003404,0.927553,0.634846,0.622204,0.387196,0.267585,0.237277,0.18671,0.172285,0.120097,0.113128,0.09141,0.08752,0.070827,0.069044,0.063047,0.063047,0.052512,0.04684,0.036791,0.035981,0.033874,0.655592,0.497083,0.369206,0.283955,0.212318,0.193841,0.171313,0.108914,0.057536,0.050405,0.034846,0.033549,0.023015,0.015883,0.012642,0.010373,0.003566,0.002107,0.001783,0.001135,0.45235,0.35316,0.221718,0.191734,0.153971,0.118152,0.107942,0.094814,0.057536,0.049595,0.049109,0.044895,0.041977,0.039546,0.032253,0.032091,0.028525,0.027553,0.023501,0.023015,0.022366,0.020421,0.020421,0.019611,0.018639,0.015721,0.015559,0.015559,0.014749,0.014587,0.014425,0.014263,0.014263,0.014263,0.013938,0.013128,0.012804,0.01248,0.011669,0.011345,0.011021,0.010697,0.010373,0.010211,0.009887,0.009562,0.0094,0.0094,0.009076,0.008752,0.00859,0.00859,0.008266,0.008266
std,206413.9,276.124639,3783.66603,4898.936256,0.392185,8.503212,17.705728,46782.93,0.082231,0.066012,0.064783,0.058245,0.058245,0.055411,0.055411,0.053938,0.052423,0.050861,0.050861,0.050861,0.050861,0.04925,0.047584,0.045857,0.045857,0.045857,0.045857,0.044062,0.044062,0.044062,0.044062,0.044062,0.042189,0.042189,0.042189,0.042189,0.042189,0.042189,0.042189,0.042189,0.042189,0.119901,0.113137,0.106662,0.09814,0.097327,0.088769,0.085096,0.084152,0.083197,0.082231,0.082231,0.080262,0.079259,0.079259,0.077212,0.076168,0.074034,0.074034,0.072943,0.071835,0.07071,0.069566,0.068402,0.068402,0.067218,0.067218,0.066012,0.066012,0.066012,0.066012,0.064783,0.064783,0.060946,0.060946,0.060946,0.059611,0.059611,0.058245,0.259248,0.481512,0.484875,0.487149,0.442736,0.425448,0.38971,0.377659,0.325102,0.316775,0.288215,0.282619,0.256556,0.253549,0.243067,0.243067,0.223076,0.211312,0.188263,0.186257,0.180918,0.475213,0.500032,0.482629,0.450952,0.408982,0.395338,0.376813,0.311557,0.232884,0.218797,0.183405,0.180081,0.149962,0.125034,0.111732,0.101325,0.059611,0.045857,0.042189,0.033666,0.497765,0.477991,0.415436,0.393697,0.36095,0.322815,0.310332,0.292981,0.232884,0.217124,0.216112,0.20709,0.200554,0.194906,0.176685,0.176255,0.166481,0.163701,0.1515,0.149962,0.147884,0.141448,0.141448,0.138671,0.135256,0.124405,0.123772,0.123772,0.120555,0.119901,0.119243,0.118581,0.118581,0.118581,0.117245,0.113832,0.112437,0.111022,0.107401,0.105917,0.10441,0.10288,0.101325,0.100539,0.098946,0.097327,0.096506,0.096506,0.094843,0.09315,0.092291,0.092291,0.090547,0.090547
min,10.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5%,22189.0,0.0,4.0,4.0,1.0,0.0,39.034191,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,265950.0,0.0,77.0,77.0,1.0,1.59,63.165967,164.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,388705.0,15.0,222.0,228.0,1.0,4.99,77.777778,553.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,539595.0,35.0,365.0,371.0,1.0,10.99,87.953881,2142.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95%,760538.5,90.0,1931.1,1611.5,2.0,24.99,95.415466,15917.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1051310.0,5000.0,190625.0,190625.0,3.0,114.99,100.0,3046717.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Modelo

In [14]:
def get_train_test(scaler = 'StandardScaler'):
    X = df_steam.copy()
    name = X.pop('name')
    y = X.pop('average_playtime'); X.drop('median_playtime', axis=1, inplace=True)
    # y = X.pop('median_playtime'); X.drop('average_playtime', axis=1, inplace=True)
    
    Xt, Xv, yt, yv = train_test_split(X, y, random_state = 12)
    if scaler == 'StandardScaler':
        scaler = StandardScaler()
    elif scaler == 'MinMaxScaler':
        scaler = MinMaxScaler()    
    Xt = pd.DataFrame(scaler.fit_transform(Xt, yt), columns=Xt.columns)
    Xv = pd.DataFrame(scaler.transform(Xv), columns=Xv.columns)
    
    return Xt, Xv, yt, yv

## Feature selection

### Backward elimination

In [15]:
Xt, Xv, yt, yv =  get_train_test()

X_new = Xt.copy()
y_ = list(yt.copy())
backward_features = list(X_new.columns)

while(len(backward_features) > 0):
    X_ = X_new[backward_features]
    X_ = sm.add_constant(X_)
    # Ordinary Least Squares (Linear regression)
    model = sm.regression.linear_model.OLS(y_, X_).fit()
    # pvalues
    p = pd.Series(model.pvalues.values[1:], index=backward_features)
    if(max(p) > 0.05):
        backward_features.remove(p.idxmax())
    else:
        break

Xt = Xt[backward_features]
Xv = Xv[backward_features]
print(len(backward_features))
print(backward_features)

# poly_fea = PolynomialFeatures(degree=1)
# Xt = poly_fea.fit_transform(Xt, yt)
# Xv = poly_fea.transform(Xv)

lin_reg = LinearRegression().fit(Xt, yt)

np.sqrt(mean_squared_error(lin_reg.predict(Xt), yt)), np.sqrt(mean_squared_error(lin_reg.predict(Xv), yv))

14
['achievements', 'price', 'rat_count', 'mac', 'dev_Bohemia Interactive', 'pub_Square Enix', 'cat_Full controller support', 'cat_Cross-Platform Multiplayer', 'cat_In-App Purchases', 'tag_RPG', 'tag_Free to Play', 'tag_Open World', 'tag_Massively Multiplayer', 'tag_Sandbox']


(3759.999681270342, 3504.6789869136505)

In [16]:
pd.DataFrame({'prediction': lin_reg.predict(Xv[:5]), 'value': yv[:5].values})

Unnamed: 0,prediction,value
0,1076.711994,505
1,216.84872,951
2,615.140328,239
3,1433.232497,1069
4,335.908924,121


### SelectKBest

In [17]:
Xt, Xv, yt, yv =  get_train_test('MinMaxScaler')

X_ = Xt.copy()
y_ = yt.copy()
k = 15
# selector = SelectKBest(chi2, k=k).fit(X_, y_)
selector = SelectKBest(mutual_info_regression, k=k).fit(X_, y_)
cols = selector.get_support(indices=True)
best_features = X_.columns[cols]
# kbest = pd.Series(selector.scores_, index=X_.columns).sort_values(ascending=False)[:k]
# kbest.plot(kind="bar")

Xt = Xt[best_features]
Xv = Xv[best_features]
print(k)
print(best_features)

# poly_fea = PolynomialFeatures(degree=1)
# Xt = poly_fea.fit_transform(Xt, yt)
# Xv = poly_fea.transform(Xv)

lin_reg = LinearRegression().fit(Xt, yt)

np.sqrt(mean_squared_error(lin_reg.predict(Xt), yt)), np.sqrt(mean_squared_error(lin_reg.predict(Xv), yv))

15
Index(['appid', 'achievements', 'owners', 'price', 'year', 'rating',
       'rat_count', 'cat_Steam Trading Cards', 'cat_Online Multi-Player',
       'gen_Indie', 'gen_RPG', 'gen_Free to Play', 'tag_Indie',
       'tag_Free to Play', 'tag_Female Protagonist'],
      dtype='object')


(3781.375491800862, 3481.795137220516)

In [18]:
pd.DataFrame({'prediction': lin_reg.predict(Xv[:5]), 'value': yv[:5].values})

Unnamed: 0,prediction,value
0,1124.931557,505
1,351.418397,951
2,491.942297,239
3,1556.149121,1069
4,110.275777,121


### Recursive Feature Elimination

In [19]:
Xt, Xv, yt, yv =  get_train_test()

X_ = Xt.copy()
high_score = 0
index = 0
for ite in range(X_.shape[1]):
    model = LinearRegression()
    rfe = RFE(model, ite+1)
    model.fit(rfe.fit_transform(Xt, yt), yt)
    score = model.score(rfe.transform(Xv), yv)
    if high_score < score:
        high_score = score
        index = ite+1
high_score, index

(0.04360047996499228, 3)

In [20]:
model = LinearRegression()
rfe = RFE(model, index)
model.fit(rfe.fit_transform(Xt, yt), yt)
rfe_features = pd.Series(rfe.support_, index=X_.columns)
X_new = X_[rfe_features[rfe_features == True].index]

Xt = Xt[X_new.columns]
Xv = Xv[X_new.columns]

len(X_new.columns), np.sqrt(mean_squared_error(model.predict(Xt), yt)), np.sqrt(mean_squared_error(model.predict(Xv), yv))

(3, 3795.7428840458747, 3459.5870392404804)

In [21]:
X_new.head()

Unnamed: 0,price,rat_count,tag_Free to Play
0,-0.403502,-0.025368,-0.352349
1,0.882846,-0.056933,-0.352349
2,0.532024,-0.09356,-0.352349
3,-0.403502,-0.091566,-0.352349
4,-0.672465,-0.102394,-0.352349


In [22]:
pd.DataFrame({'prediction': model.predict(Xv[:5]), 'value': yv[:5].values})

Unnamed: 0,prediction,value
0,815.067039,505
1,497.008568,951
2,293.169339,239
3,1667.062408,1069
4,264.914982,121


### Embedded method

In [23]:
Xt, Xv, yt, yv =  get_train_test()

model = LinearRegression().fit(Xt, yt)
coef = pd.Series(model.coef_, index = Xt.columns)
sum(coef != 0), np.sqrt(mean_squared_error(model.predict(Xt), yt)), np.sqrt(mean_squared_error(model.predict(Xv), yv))

(176, 3734.174463339077, 3535.3143286174136)

In [24]:
pd.DataFrame({'prediction': model.predict(Xv[:5]), 'value': yv[:5].values})

Unnamed: 0,prediction,value
0,552.818173,505
1,-25.812442,951
2,553.567053,239
3,2133.873197,1069
4,480.547282,121
