# Прогнозирование успешности игры в следующем году.

**Цель исследования** — выявить закономерности, определяющие успешность игры, и составить прогноз на следующий год.

## 1. Обзор данных

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats as st

In [10]:
# Load the games dataset: look in the working directory first and fall back
# to the datasets/ subdirectory when the file is not there.
try:
    df = pd.read_csv('games.csv')
except FileNotFoundError:
    # Only a missing file triggers the fallback; any other failure
    # (malformed CSV, interrupt, ...) is left to propagate.
    df = pd.read_csv('datasets/games.csv')

In [12]:
# Preview the first 10 rows of the loaded table.
df.head(10)

Unnamed: 0,Name,Platform,Year_of_Release,Genre,NA_sales,EU_sales,JP_sales,Other_sales,Critic_Score,User_Score,Rating
0,Wii Sports,Wii,2006.0,Sports,41.36,28.96,3.77,8.45,76.0,8.0,E
1,Super Mario Bros.,NES,1985.0,Platform,29.08,3.58,6.81,0.77,,,
2,Mario Kart Wii,Wii,2008.0,Racing,15.68,12.76,3.79,3.29,82.0,8.3,E
3,Wii Sports Resort,Wii,2009.0,Sports,15.61,10.93,3.28,2.95,80.0,8.0,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,11.27,8.89,10.22,1.0,,,
5,Tetris,GB,1989.0,Puzzle,23.2,2.26,4.22,0.58,,,
6,New Super Mario Bros.,DS,2006.0,Platform,11.28,9.14,6.5,2.88,89.0,8.5,E
7,Wii Play,Wii,2006.0,Misc,13.96,9.18,2.93,2.84,58.0,6.6,E
8,New Super Mario Bros. Wii,Wii,2009.0,Platform,14.44,6.94,4.7,2.24,87.0,8.4,E
9,Duck Hunt,NES,1984.0,Shooter,26.93,0.63,0.28,0.47,,,


In [14]:
# Table dimensions as (rows, columns).
df.shape

(16715, 11)

## 2. Предобработка данных

#### 2.1. Приведём названия столбцов к нижнему регистру

In [17]:
# Column names as loaded, before renaming.
df.columns

Index(['Name', 'Platform', 'Year_of_Release', 'Genre', 'NA_sales', 'EU_sales',
       'JP_sales', 'Other_sales', 'Critic_Score', 'User_Score', 'Rating'],
      dtype='object')

In [19]:
# Normalize every column label to lower case.
df.columns = df.columns.map(str.lower)

In [21]:
# Confirm the column names are now lower-case.
df.columns

Index(['name', 'platform', 'year_of_release', 'genre', 'na_sales', 'eu_sales',
       'jp_sales', 'other_sales', 'critic_score', 'user_score', 'rating'],
      dtype='object')

#### 2.2. Избавление от дубликатов

In [23]:
# Count rows that exactly repeat an earlier row across all columns.
# NOTE(review): rows that match only on a subset of columns (e.g. the same
# name, platform and year) are not detected by this check.
df.duplicated().sum()

0

Явных дубликатов нет.

#### 2.3. Обработаем пропуски

In [25]:
# Number of missing values in each column.
df.isna().sum()

name                  2
platform              0
year_of_release     269
genre                 2
na_sales              0
eu_sales              0
jp_sales              0
other_sales           0
critic_score       8578
user_score         6701
rating             6766
dtype: int64

Строки с пропусками в `name`, `year_of_release` и `genre` можно удалить: их мало (2, 269 и 2 из 16715 строк).

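Пропуски в `critic_score`, `user_score` и `rating` заполним нулями, т. к. люди могут играть и не ставить оценки, но данные по этим играм значимые. Значение `tbd` в `user_score` тоже заменим на 0. Ноль здесь — заглушка «оценки нет», а не реальная оценка.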

In [27]:
# Drop the rows lacking a name, release year or genre. The explicit .copy()
# makes the result an independent frame rather than a view-like slice.
df = df.dropna(subset=['name', 'year_of_release', 'genre']).copy()

# Fill the score/rating gaps with a zero placeholder meaning "no value".
# This goes through DataFrame.fillna and rebinds df, because the chained form
# df[col].fillna(..., inplace=True) acts on a temporary column object and
# does not update df under pandas Copy-on-Write.
# user_score and rating get the string '0' because those columns still hold
# strings at this point.
df = df.fillna({'critic_score': 0, 'rating': '0', 'user_score': '0'})

# 'tbd' entries in user_score get the same placeholder so the column can
# later be converted to a numeric type.
df['user_score'] = df['user_score'].replace({'tbd': '0'})

#### 2.4. Приведение данных к правильным типам

In [29]:
# Inspect the dtype of each column to decide which ones need conversion.
df.dtypes

name                object
platform            object
year_of_release    float64
genre               object
na_sales           float64
eu_sales           float64
jp_sales           float64
other_sales        float64
critic_score       float64
user_score          object
rating              object
dtype: object

`year_of_release` приведем к int

`user_score` оценивается по 10-балльной шкале, приведём к float

In [31]:
# Convert both columns in a single call: release year to a 64-bit integer,
# user score (stored as strings up to here) to a 64-bit float.
df = df.astype({
    'year_of_release': 'int64',
    'user_score': 'float64',
})

#### 2.5. Добавим отдельный столбец — суммарные продажи во всех регионах

In [33]:
def summ(row):
    """Return the combined sales of one game across all four regions.

    ``row`` must provide the labels ``na_sales``, ``eu_sales``, ``jp_sales``
    and ``other_sales`` (for example a DataFrame row handed over by
    ``apply(..., axis=1)``).
    """
    regional_sales = row[['na_sales', 'eu_sales', 'jp_sales', 'other_sales']]
    return regional_sales.sum()

# Total sales per game: a vectorized row-wise sum over the four regional
# columns, instead of calling a Python function once per row through
# apply(axis=1).
df['all_sales'] = df[['na_sales', 'eu_sales', 'jp_sales', 'other_sales']].sum(axis=1)

In [35]:
# Check that the all_sales column was added.
df.columns

Index(['name', 'platform', 'year_of_release', 'genre', 'na_sales', 'eu_sales',
       'jp_sales', 'other_sales', 'critic_score', 'user_score', 'rating',
       'all_sales'],
      dtype='object')

In [36]:
# Reorder the columns so that all_sales sits right after the regional sales.
df = df.loc[:, [
    'name',
    'platform',
    'year_of_release',
    'genre',
    'na_sales',
    'eu_sales',
    'jp_sales',
    'other_sales',
    'all_sales',
    'critic_score',
    'user_score',
    'rating',
]]

In [38]:
# Show 10 random rows as a sanity check of the preprocessing.
# No random_state is passed, so the selection differs between runs.
df.sample(10)

Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,all_sales,critic_score,user_score,rating
12997,Pachinko Wars II,SNES,1993,Misc,0.0,0.0,0.05,0.0,0.05,0.0,0.0,0
1887,WWE SmackDown vs. Raw 2011,PS3,2010,Fighting,0.44,0.46,0.01,0.18,1.09,74.0,6.9,T
3908,Iron Man,DS,2008,Action,0.35,0.12,0.0,0.05,0.52,56.0,7.0,E10+
2726,LEGO Batman 2: DC Super Heroes,3DS,2012,Action,0.41,0.28,0.0,0.06,0.75,72.0,5.2,E10+
13316,Pachitte Chonmage Tatsujin 15: Pachinko Fuyu n...,PS2,2008,Misc,0.0,0.0,0.05,0.0,0.05,0.0,0.0,0
3905,Rugrats: Scavenger Hunt,N64,1999,Misc,0.41,0.09,0.0,0.01,0.51,0.0,0.0,0
4029,Power Rangers: Dino Thunder,GBA,2003,Action,0.35,0.13,0.0,0.01,0.49,49.0,0.0,E
3930,River Raid II,2600,1988,Shooter,0.47,0.03,0.0,0.01,0.51,0.0,0.0,0
7894,College Slam,PS,1995,Sports,0.1,0.07,0.0,0.01,0.18,0.0,0.0,0
3651,SD Gundam G Generation Zero,PS,1999,Strategy,0.0,0.0,0.51,0.04,0.55,0.0,0.0,0
