In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.utils import shuffle
from collections import defaultdict
from tqdm import tqdm

In [2]:
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

In [3]:
df_train.shape, df_test.shape

((3633005, 24), (908252, 23))

# current_next_prev

In [4]:
df_train.X1 = df_train.X1.astype(int)
df_test.X1 = df_test.X1.astype(int)
df_train.X2 = df_train.X2.astype(int)
df_test.X2 = df_test.X2.astype(int)
df_train.X3 = df_train.X3.astype(int)
df_test.X3 = df_test.X3.astype(int)
df_train.X4 = df_train.X4.astype(int)
df_test.X4 = df_test.X4.astype(int)
df_train.X21 = df_train.X21.astype(int)
df_test.X21 = df_test.X21.astype(int)

In [5]:
player_to_games_and_scores = defaultdict(lambda: defaultdict(list))
for ind, line in tqdm(df_train.append(df_test).iterrows()):  
    player_to_games_and_scores[line['X1']]['scores'].append(int(line['X2']))
    player_to_games_and_scores[line['X1']]['games'].append(int(line['X21']))
    player_to_games_and_scores[line['X3']]['scores'].append(int(line['X4']))
    player_to_games_and_scores[line['X3']]['games'].append(int(line['X21']))

4541257it [06:02, 12516.93it/s]


In [6]:
for player in tqdm(player_to_games_and_scores):
    scores = player_to_games_and_scores[player]['scores']
    games = player_to_games_and_scores[player]['games']
    scores = np.array([score for game, score in sorted(zip(games, scores))])
    games = np.array(sorted(games))
    player_to_games_and_scores[player]['scores'] = scores
    player_to_games_and_scores[player]['games'] = games

100%|██████████| 970949/970949 [00:09<00:00, 100918.41it/s]


In [7]:
player_and_game_to_next_score_mean = {}
player_and_game_to_prev_score_mean = {}
player_and_game_to_current_score_mean = {}

player_and_game_to_next_score_median = {}
player_and_game_to_prev_score_median = {}
player_and_game_to_current_score_median = {}

player_and_game_to_next_score_max = {}
player_and_game_to_prev_score_max = {}
player_and_game_to_current_score_max = {}


player_and_game_to_next_score_min = {}
player_and_game_to_prev_score_min = {}
player_and_game_to_current_score_min = {}

player_and_game_to_next_game = {}
player_and_game_to_prev_game = {}
player_and_game_to_current_game = {}
for player in tqdm(player_to_games_and_scores):
    for game in set(player_to_games_and_scores[player]['games']):
        games = player_to_games_and_scores[player]['games']
        scores = player_to_games_and_scores[player]['scores']
        #current
        current_scores = scores[games==game]
        player_and_game_to_current_score_mean[(player, game)] = current_scores.mean()
        player_and_game_to_current_score_median[(player, game)] = np.median(current_scores)
        player_and_game_to_current_score_max[(player, game)] = current_scores.max()
        player_and_game_to_current_score_min[(player, game)] = current_scores.min()
        player_and_game_to_current_game[(player, game)] = game
        #next
        next_games_ind = games > game
        if next_games_ind.sum() > 0:
            next_game = games[games > game][0]
            next_score = scores[games==next_game]
        else:
            # Можно попробовать заменить на get в след ячейках без усреднения
            next_game = game
            next_score = scores[games==game]
            
        player_and_game_to_next_score_mean[(player, game)] = next_score.mean()
        player_and_game_to_next_score_median[(player, game)] = np.median(next_score)
        player_and_game_to_next_score_max[(player, game)] = next_score.max()
        player_and_game_to_next_score_min[(player, game)] = next_score.min()    
    
        player_and_game_to_next_game[(player, game)] = next_game
        #prev
        prev_games_ind = games < game
        if prev_games_ind.sum() > 0:
            prev_game = games[games < game][-1]
            prev_score = scores[games==prev_game]
        else:
            # Можно попробовать заменить на get в след ячейках без усреднения
            prev_game = game
            prev_score = scores[games==game]
        player_and_game_to_prev_score_mean[(player, game)] = prev_score.mean()
        player_and_game_to_prev_score_median[(player, game)] = np.median(prev_score)
        player_and_game_to_prev_score_max[(player, game)] = prev_score.max()
        player_and_game_to_prev_score_min[(player, game)] = prev_score.min() 
        player_and_game_to_prev_game[(player, game)] = prev_game
        

100%|██████████| 970949/970949 [12:06<00:00, 1335.76it/s] 


In [8]:
X2_next_mean = []
X4_next_mean = []
X2_prev_mean = []
X4_prev_mean = []
X2_current_mean = []
X4_current_mean = []

X2_next_median = []
X4_next_median = []
X2_prev_median = []
X4_prev_median = []
X2_current_median = []
X4_current_median = []

X2_next_max = []
X4_next_max = []
X2_prev_max = []
X4_prev_max = []
X2_current_max = []
X4_current_max = []

X2_next_min = []
X4_next_min = []
X2_prev_min = []
X4_prev_min = []
X2_current_min = []
X4_current_min = []

to_next_game_1 = []
to_next_game_2 = []
to_prev_game_1 = []
to_prev_game_2 = []

for ind, line in tqdm(df_train.iterrows()):
    player_1 = line['X1']
    player_2 = line['X3']
    game = line['X21']
    
    to_next_game_1.append(player_and_game_to_next_game[(player_1, game)] - player_and_game_to_current_game[(player_1, game)])
    to_next_game_2.append(player_and_game_to_next_game[(player_2, game)] - player_and_game_to_current_game[(player_2, game)])
    
    to_prev_game_1.append(player_and_game_to_current_game[(player_1, game)] - player_and_game_to_prev_game[(player_1, game)])
    to_prev_game_2.append(player_and_game_to_current_game[(player_2, game)] - player_and_game_to_prev_game[(player_2, game)])
    
    
    X2_next_mean.append(player_and_game_to_next_score_mean[(player_1, game)])
    X4_next_mean.append(player_and_game_to_next_score_mean[(player_2, game)])
    X2_prev_mean.append(player_and_game_to_prev_score_mean[(player_1, game)])
    X4_prev_mean.append(player_and_game_to_prev_score_mean[(player_2, game)])
    X2_current_mean.append(player_and_game_to_current_score_mean[(player_1, game)])
    X4_current_mean.append(player_and_game_to_current_score_mean[(player_2, game)])
    
    X2_next_median.append(player_and_game_to_next_score_median[(player_1, game)])
    X4_next_median.append(player_and_game_to_next_score_median[(player_2, game)])
    X2_prev_median.append(player_and_game_to_prev_score_median[(player_1, game)])
    X4_prev_median.append(player_and_game_to_prev_score_median[(player_2, game)])
    X2_current_median.append(player_and_game_to_current_score_median[(player_1, game)])
    X4_current_median.append(player_and_game_to_current_score_median[(player_2, game)])
    
    X2_next_max.append(player_and_game_to_next_score_max[(player_1, game)])
    X4_next_max.append(player_and_game_to_next_score_max[(player_2, game)])
    X2_prev_max.append(player_and_game_to_prev_score_max[(player_1, game)])
    X4_prev_max.append(player_and_game_to_prev_score_max[(player_2, game)])
    X2_current_max.append(player_and_game_to_current_score_max[(player_1, game)])
    X4_current_max.append(player_and_game_to_current_score_max[(player_2, game)])
    
    X2_next_min.append(player_and_game_to_next_score_min[(player_1, game)])
    X4_next_min.append(player_and_game_to_next_score_min[(player_2, game)])
    X2_prev_min.append(player_and_game_to_prev_score_min[(player_1, game)])
    X4_prev_min.append(player_and_game_to_prev_score_min[(player_2, game)])
    X2_current_min.append(player_and_game_to_current_score_min[(player_1, game)])
    X4_current_min.append(player_and_game_to_current_score_min[(player_2, game)])
    
    

df_train['X2_next_mean'] = X2_next_mean
df_train['X4_next_mean'] = X4_next_mean
df_train['X2_prev_mean'] = X2_prev_mean
df_train['X4_prev_mean'] = X4_prev_mean
df_train['X2_current_mean'] = X2_current_mean
df_train['X4_current_mean'] = X4_current_mean

df_train['X2_next_median'] = X2_next_median
df_train['X4_next_median'] = X4_next_median
df_train['X2_prev_median'] = X2_prev_median
df_train['X4_prev_median'] = X4_prev_median
df_train['X2_current_median'] = X2_current_median
df_train['X4_current_median'] = X4_current_median


df_train['X2_next_max'] = X2_next_max
df_train['X4_next_max'] = X4_next_max
df_train['X2_prev_max'] = X2_prev_max
df_train['X4_prev_max'] = X4_prev_max
df_train['X2_current_max'] = X2_current_max
df_train['X4_current_max'] = X4_current_max

df_train['X2_next_min'] = X2_next_min
df_train['X4_next_min'] = X4_next_min
df_train['X2_prev_min'] = X2_prev_min
df_train['X4_prev_min'] = X4_prev_min
df_train['X2_current_min'] = X2_current_min
df_train['X4_current_min'] = X4_current_min


df_train['to_next_game_1'] = to_next_game_1
df_train['to_next_game_2'] = to_next_game_2
df_train['to_prev_game_1'] = to_prev_game_1
df_train['to_prev_game_2'] = to_prev_game_2


3633005it [06:33, 9237.76it/s] 


In [9]:
X2_next_mean = []
X4_next_mean = []
X2_prev_mean = []
X4_prev_mean = []
X2_current_mean = []
X4_current_mean = []

X2_next_median = []
X4_next_median = []
X2_prev_median = []
X4_prev_median = []
X2_current_median = []
X4_current_median = []

X2_next_max = []
X4_next_max = []
X2_prev_max = []
X4_prev_max = []
X2_current_max = []
X4_current_max = []

X2_next_min = []
X4_next_min = []
X2_prev_min = []
X4_prev_min = []
X2_current_min = []
X4_current_min = []

to_next_game_1 = []
to_next_game_2 = []
to_prev_game_1 = []
to_prev_game_2 = []

for ind, line in tqdm(df_test.iterrows()):
    player_1 = line['X1']
    player_2 = line['X3']
    game = line['X21']
    
    to_next_game_1.append(player_and_game_to_next_game[(player_1, game)] - player_and_game_to_current_game[(player_1, game)])
    to_next_game_2.append(player_and_game_to_next_game[(player_2, game)] - player_and_game_to_current_game[(player_2, game)])
    
    to_prev_game_1.append(player_and_game_to_current_game[(player_1, game)] - player_and_game_to_prev_game[(player_1, game)])
    to_prev_game_2.append(player_and_game_to_current_game[(player_2, game)] - player_and_game_to_prev_game[(player_2, game)])
    
    
    X2_next_mean.append(player_and_game_to_next_score_mean[(player_1, game)])
    X4_next_mean.append(player_and_game_to_next_score_mean[(player_2, game)])
    X2_prev_mean.append(player_and_game_to_prev_score_mean[(player_1, game)])
    X4_prev_mean.append(player_and_game_to_prev_score_mean[(player_2, game)])
    X2_current_mean.append(player_and_game_to_current_score_mean[(player_1, game)])
    X4_current_mean.append(player_and_game_to_current_score_mean[(player_2, game)])
    
    X2_next_median.append(player_and_game_to_next_score_median[(player_1, game)])
    X4_next_median.append(player_and_game_to_next_score_median[(player_2, game)])
    X2_prev_median.append(player_and_game_to_prev_score_median[(player_1, game)])
    X4_prev_median.append(player_and_game_to_prev_score_median[(player_2, game)])
    X2_current_median.append(player_and_game_to_current_score_median[(player_1, game)])
    X4_current_median.append(player_and_game_to_current_score_median[(player_2, game)])
    
    X2_next_max.append(player_and_game_to_next_score_max[(player_1, game)])
    X4_next_max.append(player_and_game_to_next_score_max[(player_2, game)])
    X2_prev_max.append(player_and_game_to_prev_score_max[(player_1, game)])
    X4_prev_max.append(player_and_game_to_prev_score_max[(player_2, game)])
    X2_current_max.append(player_and_game_to_current_score_max[(player_1, game)])
    X4_current_max.append(player_and_game_to_current_score_max[(player_2, game)])
    
    X2_next_min.append(player_and_game_to_next_score_min[(player_1, game)])
    X4_next_min.append(player_and_game_to_next_score_min[(player_2, game)])
    X2_prev_min.append(player_and_game_to_prev_score_min[(player_1, game)])
    X4_prev_min.append(player_and_game_to_prev_score_min[(player_2, game)])
    X2_current_min.append(player_and_game_to_current_score_min[(player_1, game)])
    X4_current_min.append(player_and_game_to_current_score_min[(player_2, game)])
    
    

df_test['X2_next_mean'] = X2_next_mean
df_test['X4_next_mean'] = X4_next_mean
df_test['X2_prev_mean'] = X2_prev_mean
df_test['X4_prev_mean'] = X4_prev_mean
df_test['X2_current_mean'] = X2_current_mean
df_test['X4_current_mean'] = X4_current_mean

df_test['X2_next_median'] = X2_next_median
df_test['X4_next_median'] = X4_next_median
df_test['X2_prev_median'] = X2_prev_median
df_test['X4_prev_median'] = X4_prev_median
df_test['X2_current_median'] = X2_current_median
df_test['X4_current_median'] = X4_current_median


df_test['X2_next_max'] = X2_next_max
df_test['X4_next_max'] = X4_next_max
df_test['X2_prev_max'] = X2_prev_max
df_test['X4_prev_max'] = X4_prev_max
df_test['X2_current_max'] = X2_current_max
df_test['X4_current_max'] = X4_current_max

df_test['X2_next_min'] = X2_next_min
df_test['X4_next_min'] = X4_next_min
df_test['X2_prev_min'] = X2_prev_min
df_test['X4_prev_min'] = X4_prev_min
df_test['X2_current_min'] = X2_current_min
df_test['X4_current_min'] = X4_current_min


df_test['to_next_game_1'] = to_next_game_1
df_test['to_next_game_2'] = to_next_game_2
df_test['to_prev_game_1'] = to_prev_game_1
df_test['to_prev_game_2'] = to_prev_game_2



908252it [01:45, 8605.99it/s]


In [10]:
df_train.to_csv('./data/train_result_mean_median_max_min_current_next_prev.csv',index=False)
df_test.to_csv('./data/test_result_mean_median_max_min_current_next_prev.csv',index=False)

# next

In [11]:
X = df_train[['X2','X4','X2_next_mean','X4_next_mean']]
y = df_train['target']
X, y = shuffle(X, y, random_state=42)

In [12]:
model = LogisticRegression(random_state=42)

In [13]:
y_pred = cross_val_predict(model, X, y, cv=5, n_jobs=-1, verbose=45, method='predict_proba')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   22.0s remaining:   33.0s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   22.3s remaining:   14.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   23.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   23.2s finished


In [14]:
round(log_loss(y, y_pred), 5)

0.60008

# current

In [15]:
X = df_train[['X2_current_mean','X4_current_mean']]
y = df_train['target']
X, y = shuffle(X, y, random_state=42)

In [16]:
model = LogisticRegression(random_state=42)

In [17]:
y_pred = cross_val_predict(model, X, y, cv=5, n_jobs=-1, verbose=45, method='predict_proba')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   12.5s remaining:   18.7s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   13.3s remaining:    8.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   13.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   13.7s finished


In [18]:
round(log_loss(y, y_pred), 5)

0.61316

# next current 

In [19]:
X = df_train[['X2_current_mean','X4_current_mean','X2_next_mean','X4_next_mean']]
y = df_train['target']
X, y = shuffle(X, y, random_state=42)

In [20]:
model = LogisticRegression(random_state=42)

In [21]:
y_pred = cross_val_predict(model, X, y, cv=5, n_jobs=-1, verbose=45, method='predict_proba')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   21.8s remaining:   32.6s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   23.0s remaining:   15.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   23.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   23.9s finished


In [22]:
round(log_loss(y, y_pred), 5)

0.57541

# prev current 

In [23]:
X = df_train[['X2_current_mean','X4_current_mean','X2_prev_mean','X4_prev_mean']]
y = df_train['target']
X, y = shuffle(X, y, random_state=42)

In [24]:
model = LogisticRegression(random_state=42)

In [25]:
y_pred = cross_val_predict(model, X, y, cv=5, n_jobs=-1, verbose=45, method='predict_proba')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   13.7s
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   14.3s remaining:   21.4s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   14.6s remaining:    9.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   17.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   17.9s finished


In [26]:
round(log_loss(y, y_pred), 5)

0.61439

# prev current next

In [27]:
X = df_train[['X2_current_mean','X4_current_mean','X2_prev_mean','X4_prev_mean','X2_next_mean','X4_next_mean']]
y = df_train['target']
X, y = shuffle(X, y, random_state=42)

In [28]:
model = LogisticRegression(random_state=42)

In [29]:
y_pred = cross_val_predict(model, X, y, cv=5, n_jobs=-1, verbose=45, method='predict_proba')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   20.6s
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   31.9s remaining:   47.8s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   32.1s remaining:   21.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   36.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   36.6s finished


In [30]:
round(log_loss(y, y_pred), 5)

0.57466

# prev current next X2 x4

In [31]:
X = df_train[['X2_current_mean','X4_current_mean','X2_prev_mean','X4_prev_mean','X2_next_mean','X4_next_mean', 'X2','X4']]
y = df_train['target']
X, y = shuffle(X, y, random_state=42)

In [32]:
model = LogisticRegression(random_state=42)

In [33]:
y_pred = cross_val_predict(model, X, y, cv=5, n_jobs=-1, verbose=45, method='predict_proba')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   22.0s remaining:   33.0s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   27.8s remaining:   18.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   31.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   31.5s finished


In [34]:
round(log_loss(y, y_pred), 5)

0.55814

# diffs

In [35]:
df_train['current_diff'] = df_train['X4_current_mean'] - df_train['X2_current_mean']
df_train['next_diff_X4'] = df_train['X4_next_mean'] - df_train['X4_current_mean']
df_train['next_diff_X2'] = df_train['X2_next_mean'] - df_train['X2_current_mean']
df_train['prev_diff_X4'] = df_train['X4_current_mean'] - df_train['X4_prev_mean']
df_train['prev_diff_X2'] = df_train['X2_current_mean'] - df_train['X2_prev_mean']

In [36]:
X = df_train[['current_diff','next_diff_X4','next_diff_X2','prev_diff_X4','prev_diff_X2']]
y = df_train['target']
X, y = shuffle(X, y, random_state=42)

In [37]:
model = LogisticRegression(random_state=42)

In [38]:
y_pred = cross_val_predict(model, X, y, cv=5, n_jobs=-1, verbose=45, method='predict_proba')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   20.2s
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   20.7s remaining:   31.1s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   20.8s remaining:   13.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   21.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   21.1s finished


In [39]:
round(log_loss(y, y_pred), 5)

0.57421

### only next diffs

In [40]:
X = df_train[['next_diff_X2','next_diff_X4']]
y = df_train['target']
X, y = shuffle(X, y, random_state=42)

In [41]:
model = LogisticRegression(random_state=42)

In [42]:
y_pred = cross_val_predict(model, X, y, cv=5, n_jobs=-1, verbose=45, method='predict_proba')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    8.6s remaining:   12.9s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    8.8s remaining:    5.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.1s finished


In [43]:
round(log_loss(y, y_pred), 5)

0.62687

# all

In [44]:
features = ['X2_next_mean', 'X4_next_mean', 'X2_prev_mean',
       'X4_prev_mean', 'X2_current_mean', 'X4_current_mean', 'X2_next_median',
       'X4_next_median', 'X2_prev_median', 'X4_prev_median',
       'X2_current_median', 'X4_current_median', 'X2_next_max', 'X4_next_max',
       'X2_prev_max', 'X4_prev_max', 'X2_current_max', 'X4_current_max',
       'X2_next_min', 'X4_next_min', 'X2_prev_min', 'X4_prev_min',
       'X2_current_min', 'X4_current_min', 'to_next_game_1', 'to_next_game_2',
       'to_prev_game_1', 'to_prev_game_2', 'X2', 'X4']

In [45]:
X = df_train[features]
y = df_train['target']
X, y = shuffle(X, y, random_state=42)

In [46]:
model = LogisticRegression(random_state=42)

In [47]:
y_pred = cross_val_predict(model, X, y, cv=5, n_jobs=-1, verbose=45, method='predict_proba')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.9min remaining:  2.8min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  1.9min remaining:  1.2min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.9min finished


In [48]:
round(log_loss(y, y_pred), 5)

0.55667

# add 1/dist

In [49]:
df_train['1/to_next_game_1'] = df_train['to_next_game_1'].apply(lambda x: x if x==0 else 1/x)
df_train['1/to_next_game_2'] = df_train['to_next_game_2'].apply(lambda x: x if x==0 else 1/x)
df_train['1/to_prev_game_1'] = df_train['to_prev_game_1'].apply(lambda x: x if x==0 else 1/x)
df_train['1/to_prev_game_2'] = df_train['to_prev_game_2'].apply(lambda x: x if x==0 else 1/x)

In [50]:
features = ['X2_next_mean', 'X4_next_mean', 'X2_prev_mean',
       'X4_prev_mean', 'X2_current_mean', 'X4_current_mean', 'X2_next_median',
       'X4_next_median', 'X2_prev_median', 'X4_prev_median',
       'X2_current_median', 'X4_current_median', 'X2_next_max', 'X4_next_max',
       'X2_prev_max', 'X4_prev_max', 'X2_current_max', 'X4_current_max',
       'X2_next_min', 'X4_next_min', 'X2_prev_min', 'X4_prev_min',
       'X2_current_min', 'X4_current_min', 'to_next_game_1', 'to_next_game_2',
       'to_prev_game_1', 'to_prev_game_2', 'X2', 'X4',
           '1/to_next_game_1',
           '1/to_next_game_2',
           '1/to_prev_game_1',
           '1/to_prev_game_2']

In [51]:
X = df_train[features]
y = df_train['target']
X, y = shuffle(X, y, random_state=42)

In [52]:
model = LogisticRegression(random_state=42)

In [53]:
y_pred = cross_val_predict(model, X, y, cv=3, n_jobs=-1, verbose=45, method='predict_proba')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.0min finished


In [54]:
round(log_loss(y, y_pred), 5)

0.55605

In [55]:
y_pred = [0.38]*len(y)
round(log_loss(y, y_pred), 5)

0.6641

In [56]:
y.value_counts(normalize=True)

0    0.619917
1    0.380083
Name: target, dtype: float64