# Задача "Выход из он-лайн игры"

В этой задаче необходимо научиться предсказывать, остается ли участник в он-лайн игре или уходит из нее. Уходом считается отсутствие его в игре в течение недели.

 

Всего используется 12 признаков, вычисленных за 2 предыдущие недели:

- maxPlayerLevel - максимальный уровень игры, который прошел игрок
- numberOfAttemptedLevels - количество уровней, которые попытался пройти игрок
- attemptsOnTheHighestLevel - число попыток, сделанных на самом высоком уровне
- totalNumOfAttempts - общее число попыток
- averageNumOfTurnsPerCompletedLevel - среднее количество ходов, выполненных на успешно пройденных уровнях
- doReturnOnLowerLevels - делал ли игрок возвраты к игре на уже пройденных уровнях
- numberOfBoostersUsed - количество использованных бустеров
- fractionOfUsefullBoosters - доля бустеров, использованных во время успешных попыток (игрок прошел уровень)
- totalScore - общее количество набранных очков
- totalBonusScore - общее количество набранных бонусных очков
- totalStarsCount - общее количество набранных звезд
- numberOfDaysActuallyPlayed - количество дней, когда пользователь играл в игру

В качестве ответа для данной задачи принимается текстовый файл, каждая строка которого соответствует строке в файле x_test.csv и содержит значение от 0 до 1 (вероятность того, что пользователь останется в игре). В качестве критерия качества решения задачи используется <b>логарифмическая функция потерь (<i>logloss</i>)</b>.

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
import datetime

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Both feature tables are ';'-separated. The target file is read as header-less,
# so its single column is named explicitly.
x_train, x_test = (pd.read_csv(path, sep=';') for path in ('x_train.csv', 'x_test.csv'))
y_train = pd.read_csv('y_train.csv', names=['result'])

In [3]:
# Preview the first two rows of the training features.
x_train.head(2)

Unnamed: 0,maxPlayerLevel,numberOfAttemptedLevels,attemptsOnTheHighestLevel,totalNumOfAttempts,averageNumOfTurnsPerCompletedLevel,doReturnOnLowerLevels,numberOfBoostersUsed,fractionOfUsefullBoosters,totalScore,totalBonusScore,totalStarsCount,numberOfDaysActuallyPlayed
0,39,10,3,17,24.444444,1,5,0.4,2650000,1375,21,2
1,21,22,19,55,17.045455,1,6,0.333333,5614000,3825,51,4


In [4]:
# Preview the first two rows of the target frame.
y_train.head(2)

Unnamed: 0,result
0,0
1,1


In [5]:
# (rows, columns) of the train and test feature frames.
x_train.shape, x_test.shape

((25289, 12), (25289, 12))

In [6]:
# Sum of the target column (the number of 1-labels when the target is 0/1) and the target frame shape.
y_train.result.sum(),y_train.shape

(7288, (25289, 1))

In [7]:
# Summary statistics of every training feature.
x_train.describe()

Unnamed: 0,maxPlayerLevel,numberOfAttemptedLevels,attemptsOnTheHighestLevel,totalNumOfAttempts,averageNumOfTurnsPerCompletedLevel,doReturnOnLowerLevels,numberOfBoostersUsed,fractionOfUsefullBoosters,totalScore,totalBonusScore,totalStarsCount,numberOfDaysActuallyPlayed
count,25289.0,25289.0,25289.0,25289.0,25289.0,25289.0,25289.0,25289.0,25289.0,25289.0,25289.0,25289.0
mean,18.424888,12.415991,3.507612,26.841393,13.330451,0.152438,4.901222,0.534841,2884268.0,2138.215232,26.884495,2.725058
std,20.172964,12.991894,6.766648,39.993029,6.657224,0.359452,6.630469,0.402091,3415374.0,2169.987109,29.403565,2.76535
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,4.0,3.0,1.0,4.0,8.4,0.0,0.0,0.0,483000.0,575.0,6.0,1.0
50%,11.0,8.0,1.0,12.0,14.555556,0.0,2.0,0.619048,1700000.0,1500.0,18.0,1.0
75%,27.0,16.0,3.0,31.0,17.72,0.0,6.0,1.0,3890000.0,2875.0,37.0,3.0
max,146.0,142.0,186.0,563.0,40.0,1.0,81.0,1.0,31338000.0,24275.0,319.0,14.0


In [8]:
# Summary statistics of every test feature.
x_test.describe()

Unnamed: 0,maxPlayerLevel,numberOfAttemptedLevels,attemptsOnTheHighestLevel,totalNumOfAttempts,averageNumOfTurnsPerCompletedLevel,doReturnOnLowerLevels,numberOfBoostersUsed,fractionOfUsefullBoosters,totalScore,totalBonusScore,totalStarsCount,numberOfDaysActuallyPlayed
count,25289.0,25289.0,25289.0,25289.0,25289.0,25289.0,25289.0,25289.0,25289.0,25289.0,25289.0,25289.0
mean,18.301119,12.41176,3.37633,26.587251,13.351319,0.1505,4.860295,0.539779,2881780.0,2139.874056,26.903634,2.695994
std,19.855935,12.905114,6.51735,39.384819,6.636256,0.357568,6.544325,0.401052,3376233.0,2159.860948,29.255567,2.735989
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,4.0,3.0,1.0,4.0,8.5,0.0,0.0,0.0,524000.0,650.0,6.0,1.0
50%,11.0,8.0,1.0,12.0,14.538462,0.0,2.0,0.625,1734000.0,1500.0,18.0,1.0
75%,27.0,16.0,3.0,31.0,17.7,0.0,6.0,1.0,3898000.0,2850.0,37.0,3.0
max,144.0,145.0,365.0,480.0,48.0,1.0,86.0,1.0,34767000.0,26125.0,344.0,14.0


In [9]:
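# Disabled: attaches the target to x_train for the pairplot in the next cell.
# Keep it disabled for modelling: the later cells use all x_train columns as features,
# so an attached 'result' column would leak the label.
#x_train['result'] = y_train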

In [10]:
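# Disabled exploratory plot: pairwise plots of the features coloured by the target.
# It needs the 'result' column that the (also disabled) previous cell would add.
#g = sns.pairplot(x_train,hue='result')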

In [11]:
# Pairwise ratio features over the first 11 columns: for every pair in which the
# numerator column comes after the denominator column, add '<num>_div_<den>'.
# The 0.01 offset keeps the division finite when the denominator is 0.
# NOTE(review): the 12th column (numberOfDaysActuallyPlayed) is left out of the
# pairs - confirm this is intended.
base_cols = list(x_train.columns[:11])
for pos, denominator in enumerate(base_cols):
    for numerator in base_cols[pos + 1:]:
        ratio_name = numerator + '_div_' + denominator
        x_train[ratio_name] = x_train[numerator] / (x_train[denominator] + 0.01)

In [12]:
# Same pairwise ratio features for the test frame: for every pair of the first 11
# columns in which the numerator column comes after the denominator column,
# add '<num>_div_<den>'. The 0.01 offset keeps the division finite for zeros.
base_cols = list(x_test.columns[:11])
for pos, denominator in enumerate(base_cols):
    for numerator in base_cols[pos + 1:]:
        ratio_name = numerator + '_div_' + denominator
        x_test[ratio_name] = x_test[numerator] / (x_test[denominator] + 0.01)

In [13]:
# Add a log-transformed copy ('log_<name>') of every column currently in x_train.
# The 0.01 offset keeps log() finite for zero values.
# Columns that are already log features are skipped, so re-running this cell does
# not stack 'log_log_*' columns (their inputs can be negative, which would yield NaN
# and break the model fits further down).
source_cols = [col for col in x_train.columns if not str(col).startswith('log_')]
for col in source_cols:
    x_train['log_' + str(col)] = np.log(x_train[col] + 0.01)

In [14]:
# Add a log-transformed copy ('log_<name>') of every column currently in x_test.
# The 0.01 offset keeps log() finite for zero values.
# Columns that are already log features are skipped, so re-running this cell does
# not stack 'log_log_*' columns (their inputs can be negative, which would yield NaN
# and break the predictions further down).
source_cols = [col for col in x_test.columns if not str(col).startswith('log_')]
for col in source_cols:
    x_test['log_' + str(col)] = np.log(x_test[col] + 0.01)

In [15]:
# Degree-2 polynomial expansion; the output width grows quadratically with the
# number of input columns.
poly = PolynomialFeatures(degree=2)

In [21]:
# Baseline: scan the random-forest depth and print the mean cross-validated
# negative log-loss for each depth (closer to 0 is better).
# The polynomial expansion does not depend on the depth, so it is built once
# instead of on every iteration.
x_train_poly = poly.fit_transform(x_train)
for depth in range(3, 12):
    # print(...) with a single argument behaves the same on Python 2 and Python 3.
    print("for depth={}".format(depth))
    # Alternatives tried earlier:
    #clf = GradientBoostingClassifier(n_estimators=100,random_state=depth)
    #clf = RandomForestClassifier(n_estimators=210,max_depth=7,min_samples_leaf=24)
    clf = RandomForestClassifier(n_estimators=230, max_depth=depth, min_samples_leaf=38, n_jobs=-1)
    cv_scores = cross_val_score(clf, x_train_poly, y_train.result,
                                n_jobs=-1, scoring='neg_log_loss')
    print("scored {}".format(cv_scores.mean()))
# Release the expanded matrix; later cells rebuild it when they need it.
del x_train_poly

for depth=3
scored -0.386980830402
for depth=4
scored -0.383901866548
for depth=5
scored -0.382354514354
for depth=6
scored -0.381869693638
for depth=7
scored -0.38179435619
for depth=8
scored -0.382229795329
for depth=9


Process PoolWorker-296:
Traceback (most recent call last):
  File "/home/ubuntu/anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/ubuntu/anaconda/lib/python2.7/multiprocessing/process.py", line 114, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ubuntu/anaconda/lib/python2.7/multiprocessing/pool.py", line 102, in worker
    task = get()
  File "/home/ubuntu/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/pool.py", line 362, in get
    return recv()
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
# Fit a forest on the polynomial expansion of the training frame.
forest_params = dict(n_estimators=240, max_depth=7, min_samples_leaf=41)
clf = RandomForestClassifier(**forest_params)
clf.fit(poly.fit_transform(x_train), y_train.result)

In [None]:
# Pair every polynomial feature with its importance in the fitted forest.
# Passing the training column names gives readable labels
# (e.g. 'maxPlayerLevel totalScore') instead of the default 'x0 x8' placeholders.
# Building the frame column-wise keeps 'importance' numeric rather than object dtype.
# NOTE: newer scikit-learn releases replace this method with get_feature_names_out().
feat = pd.DataFrame(
    {
        'importance': clf.feature_importances_,
        'feature': poly.get_feature_names(list(x_train.columns)),
    },
    columns=['importance', 'feature'],
)[1:]  # row 0 is the constant bias term produced by PolynomialFeatures

In [None]:
# Ten most important features, highest importance first.
feat.sort_values(by='importance', ascending=False).head(10)

In [None]:
# Score noted for this run: 0.3846047 (presumably the submission logloss).
# Without the division (ratio) features.
# NOTE(review): x_train / x_test are extended in place by the feature cells above, so
# on a top-to-bottom run this cell does see the ratio and log columns - confirm.
# Resets `preds`, so only the models fitted here are averaged by the export cell.
preds = []
for seed in [10, 90, 140]:
    # print(...) with a single argument behaves the same on Python 2 and Python 3.
    print("for random state={}".format(seed))
    clf = GradientBoostingClassifier(n_estimators=100, random_state=seed)
    clf.fit(x_train, y_train.result)
    preds.append(clf.predict_proba(x_test))

In [None]:
# Score noted for this run: 0.3841466 (presumably the submission logloss).
# Without the division (ratio) features.
# NOTE(review): x_train / x_test are extended in place by the feature cells above, so
# on a top-to-bottom run this cell does see the ratio and log columns - confirm.
# Resets `preds`, so only the models fitted here are averaged by the export cell.
preds = []
for seed in [100, 130]:
    # print(...) with a single argument behaves the same on Python 2 and Python 3.
    print("for random state={}".format(seed))
    clf = RandomForestClassifier(n_estimators=140, max_depth=8, random_state=seed)
    clf.fit(x_train, y_train.result)
    preds.append(clf.predict_proba(x_test))

In [None]:
# Score noted for this run: 0.3842144 (presumably the submission logloss).
# Without the division (ratio) features.
# NOTE(review): x_train / x_test are extended in place by the feature cells above, so
# on a top-to-bottom run this cell does see the ratio and log columns - confirm.
# Resets `preds`, so only the models fitted here are averaged by the export cell.
preds = []
for seed in [50, 60, 100, 130]:
    # print(...) with a single argument behaves the same on Python 2 and Python 3.
    print("for random state={}".format(seed))
    clf = RandomForestClassifier(n_estimators=140, max_depth=8, random_state=seed)
    clf.fit(x_train, y_train.result)
    preds.append(clf.predict_proba(x_test))

In [None]:
# Score noted for this run: 0.3838713 - overfitting
# (CV is not working correctly: it reported 0.381624540358).
# Without the division (ratio) features.
clf = RandomForestClassifier(n_estimators=240, max_depth=7, min_samples_leaf=41)
clf.fit(poly.fit_transform(x_train), y_train.result)

# The expansion fitted on the training frame is reused for the test frame.
test_proba = clf.predict_proba(poly.transform(x_test))
preds = [test_proba]

In [24]:
# Score noted for this run: 0.3832731 - overfitting
# (CV is not working correctly: it reported 0.38179435619).
# With the division (ratio) features.
clf = RandomForestClassifier(n_estimators=240, max_depth=7, min_samples_leaf=38)
clf.fit(poly.fit_transform(x_train), y_train.result)

# The expansion fitted on the training frame is reused for the test frame.
test_proba = clf.predict_proba(poly.transform(x_test))
preds = [test_proba]

In [25]:
# Average the predict_proba arrays collected in `preds` and write column 1
# (the probability of the second class in clf.classes_) as the submission,
# one value per x_test row.
# header=False is explicit because the Series.to_csv default differs between pandas
# versions, and a header line would add an extra row to the submission file.
# The timestamp is formatted without spaces or colons so the file name is valid on
# every platform.
submission = pd.DataFrame(np.mean(preds, axis=0))[1]
stamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')
submission.to_csv('result_' + stamp + '.csv', index=False, header=False)