In [33]:
import pickle
import warnings
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from collections import defaultdict
from datetime import datetime
from tqdm import tqdm
from scipy.stats import spearmanr, kendalltau
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import OneHotEncoder

# Silence every warning issued through the `warnings` module for the rest of the session.
# NOTE(review): this blanket filter also hides numerical and convergence warnings —
# consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [34]:
%%time
def _load_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object."""
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# The three raw inputs of the notebook, loaded in the same order as before.
tournaments = _load_pickle("tournaments.pkl")
results = _load_pickle("results.pkl")
players = _load_pickle("players.pkl")

CPU times: user 39.9 s, sys: 1min 41s, total: 2min 21s
Wall time: 3min 3s


# 1

In [35]:
def msk(tournament):
    """Tell whether every team entry of a tournament is usable.

    A team entry is usable when it has a non-empty ``'teamMembers'`` value and
    a ``'mask'`` value that is not ``None``.

    Args:
        tournament: iterable of dict-like team entries.

    Returns:
        bool: ``True`` if all entries are usable (also for an empty iterable),
        ``False`` otherwise.
    """
    # `.get` treats a missing or None roster as "invalid" instead of raising,
    # and `all` stops at the first bad entry instead of scanning the rest.
    return all(
        team.get('teamMembers') and team.get('mask') is not None
        for team in tournament
    )

In [36]:
# Ids of the tournaments whose team entries all pass the `msk` check.
results_corrected = [
    tournament_id
    for tournament_id, teams in results.items()
    if msk(teams)
]

In [37]:
# Tournament ids split by the year prefix of 'dateStart':
# '2019' goes to the training list, '2020' to the test list, anything else is skipped.
train_tournaments_labels = []
test_tournaments_labels = []

# A set makes each membership test O(1); scanning the list for every
# tournament made this loop quadratic.
valid_ids = set(results_corrected)

for tournament_id, tournament_info in tournaments.items():
    if tournament_id not in valid_ids:
        continue
    start_year = tournament_info['dateStart'][:4]
    if start_year == '2019':
        train_tournaments_labels.append(tournament_id)
    elif start_year == '2020':
        test_tournaments_labels.append(tournament_id)

In [38]:
# Number of tournament ids selected for training (dateStart year '2019').
len(train_tournaments_labels)

657

In [39]:
# Number of tournament ids selected for testing (dateStart year '2020').
len(test_tournaments_labels)

386

In [40]:
# Column-oriented accumulator for the training frame: one empty list per column,
# filled row by row in the loop below.
data = {
    column: []
    for column in (
        'tournament_id',
        'tournament_name',
        'team_id',
        'player_id',
        'player_name',
        'player_surname',
        'question_num',
        'question_result',
    )
}

def to_int(x):
    """Convert ``x`` to ``int``, falling back to 0 when it is not convertible.

    Args:
        x: any value accepted by ``int()``; here, a single mask symbol.

    Returns:
        int: ``int(x)``, or 0 if the conversion raises ``TypeError`` or
        ``ValueError`` (e.g. a non-digit symbol or ``None``).
    """
    try:
        return int(x)
    # Only conversion failures map to 0; a bare `except:` would also swallow
    # KeyboardInterrupt / SystemExit and hide unrelated errors.
    except (TypeError, ValueError):
        return 0

# One row per (tournament, team, player, question), skipping mask symbols equal to 'X'.
for i in train_tournaments_labels:
    for team in results[i]:
        # (question index, integer outcome) pairs for this team, computed once
        # and reused for every player on the roster.
        played = [
            (position_in_mask, to_int(symbol))
            for position_in_mask, symbol in enumerate(team['mask'])
            if symbol != 'X'
        ]
        for player in team['teamMembers']:
            person = player['player']
            for question_num, outcome in played:
                data['tournament_id'].append(tournaments[i]['id'])
                data['tournament_name'].append(tournaments[i]['name'])
                data['team_id'].append(team['team']['id'])
                data['player_id'].append(person['id'])
                data['player_name'].append(person['name'])
                data['player_surname'].append(person['surname'])
                data['question_num'].append(question_num)
                data['question_result'].append(outcome)


data_train = pd.DataFrame.from_dict(data)

# 2

In [41]:
# Per-row aggregate features, computed with groupby().transform so the result is
# already aligned with `data_train` rows. Unlike chained merges, re-running this
# cell overwrites the four columns instead of creating `_x` / `_y` duplicates.
#   teams_qnt_rate           - distinct teams in the tournament / distinct teams overall
#   mean_tournament_answered - mean question_result within the tournament
#   question_dificulty       - mean question_result for the (tournament, question) pair
#   player_strength          - mean question_result for the (tournament, player) pair
total_team_count = data_train['team_id'].nunique()

data_train = data_train.assign(
    teams_qnt_rate=(
        data_train.groupby('tournament_id')['team_id'].transform('nunique')
        / total_team_count
    ),
    mean_tournament_answered=(
        data_train.groupby('tournament_id')['question_result'].transform('mean')
    ),
    question_dificulty=(
        data_train.groupby(['tournament_id', 'question_num'])['question_result']
        .transform('mean')
    ),
    player_strength=(
        data_train.groupby(['tournament_id', 'player_id'])['question_result']
        .transform('mean')
    ),
)

In [42]:
# Single combined feature: the row-wise sum of the four aggregate columns plus
# the global mean of question_result (a constant shift).
combined_feature = (
    data_train['teams_qnt_rate']
    + data_train['mean_tournament_answered']
    + data_train['question_dificulty']
    + data_train['player_strength']
    + data_train['question_result'].mean()
).values

# Design matrix: column 0 is the combined feature, column 1 is a column of ones.
X_train = np.append(
    combined_feature.reshape(-1, 1),
    np.ones((combined_feature.shape[0], 1)),
    axis=1,
)

# Target as an (n, 1) column vector.
y_train = data_train['question_result'].values.reshape(-1, 1)

# The model-fitting cell and the EM loop read `X_train` / `y_train`, which were
# never defined; the old names are kept as aliases for backward compatibility.
A_train = X_train
B_train = y_train

In [43]:
# Baseline: sklearn logistic regression on the first column of X_train only,
# kept two-dimensional as sklearn expects.
model = LogisticRegression(solver='lbfgs')
model.fit(
    X_train[:, :1],
    y_train,
)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [44]:
def calc_rating(prediction, player_rating_table=None, rating_name='rating', player_ids=None):
    """Average per-row predictions into one rating per player.

    Args:
        prediction: per-row values, one for each entry of ``player_ids``.
        player_rating_table: optional DataFrame with a ``'player_id'`` column to
            write the rating into. ``None`` means "return a fresh table".
        rating_name: name of the rating column to create or overwrite.
        player_ids: optional per-row player ids matching ``prediction``.
            Defaults to the global ``data_train['player_id']``.

    Returns:
        pandas.DataFrame: ``player_id`` plus the rating column. When
        ``player_rating_table`` already has ``rating_name``, that same object is
        updated in place and returned; otherwise a new frame is returned.
    """
    if player_ids is None:
        player_ids = data_train['player_id']

    per_row = pd.Series(player_ids, name='player_id').to_frame().copy()
    per_row['probability'] = prediction
    fresh_rating = (
        per_row.groupby('player_id')['probability']
        .mean()
        .to_frame(rating_name)
        .reset_index()
    )

    if player_rating_table is None:
        return fresh_rating

    if rating_name in player_rating_table.columns:
        # Match on player_id, not row position: assigning the column directly
        # aligns on the index and silently misassigns ratings whenever the two
        # tables are ordered differently.
        rating_by_player = fresh_rating.set_index('player_id')[rating_name]
        player_rating_table[rating_name] = player_rating_table['player_id'].map(rating_by_player)
        return player_rating_table

    return player_rating_table.merge(fresh_rating, how='left', on='player_id')

# 3

In [45]:
# Column-oriented accumulator for the test frame: one row per (tournament, team, player).
data = {
    column: []
    for column in ('tournament_id', 'tournament_name', 'team_id', 'player_id', 'position')
}

for i in test_tournaments_labels:
    for team in results[i]:
        for player in team['teamMembers']:
            # NOTE(review): int() truncates any fractional 'position' value —
            # confirm positions are always whole numbers in the source data.
            row = (
                tournaments[i]['id'],
                tournaments[i]['name'],
                team['team']['id'],
                player['player']['id'],
                int(team['position']),
            )
            for column, value in zip(data, row):
                data[column].append(value)

data_test = pd.DataFrame.from_dict(data)

In [46]:
def check_quality(test_df, rating_df, rating_name):
    """Average rank correlation between team positions and summed player ratings.

    Each player row of ``test_df`` gets its rating from ``rating_df`` (players
    without a rating get 0). Ratings are summed per (tournament, team, position),
    and Spearman / Kendall correlations between ``position`` and the summed
    rating are computed per tournament with at least two teams.

    Args:
        test_df: DataFrame with ``tournament_id``, ``team_id``, ``player_id``
            and ``position`` columns.
        rating_df: DataFrame with ``player_id`` and the ``rating_name`` column.
        rating_name: name of the rating column to evaluate.

    Returns:
        tuple: ``(mean Spearman, mean Kendall)``. Tournaments whose correlation
        is undefined (NaN, e.g. all teams share one rating) are left out of the
        mean; if nothing remains, the corresponding value is NaN.
    """
    scored = test_df.merge(rating_df, on='player_id', how='left')
    scored[rating_name] = scored[rating_name].fillna(0)
    team_scores = (
        scored.groupby(['tournament_id', 'team_id', 'position'])[rating_name]
        .sum()
        .to_frame()
        .reset_index()
    )

    spearman_values = []
    kendall_values = []

    # One groupby pass instead of re-filtering the whole frame per tournament.
    for _, teams in team_scores.groupby('tournament_id'):
        if teams.shape[0] <= 1:
            continue
        rho, _ = spearmanr(teams['position'], teams[rating_name])
        tau, _ = kendalltau(teams['position'], teams[rating_name])
        spearman_values.append(rho)
        kendall_values.append(tau)

    return _mean_ignoring_nan(spearman_values), _mean_ignoring_nan(kendall_values)


def _mean_ignoring_nan(values):
    """Mean of the non-NaN entries of ``values``; NaN if there are none."""
    arr = np.asarray(values, dtype=float)
    arr = arr[~np.isnan(arr)]
    # A plain mean lets one undefined correlation poison the whole metric.
    return arr.mean() if arr.size else np.nan

In [47]:
# Score the baseline rating column on the test frame and print both correlations.
# NOTE(review): `player_rating` is not defined in the visible cells before this
# point — presumably produced by `calc_rating`; confirm it exists on a fresh run.
spearman, kendal = check_quality(
    data_test.copy(),
    player_rating.copy(),
    'rating',
)
print('Корреляция Спирмана:', spearman)
print('Корреляция Кендала:', kendal)

Корреляция Спирмана: -0.5929472671744298
Корреляция Кендала: -0.4469306480998754


# 4

In [48]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, applied element-wise via numpy."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def net_input(theta, x):
    """Linear part of the model: the dot product of ``x`` with ``theta``."""
    linear_term = np.dot(x, theta)
    return linear_term

def probability(theta, x):
    """Model output: ``sigmoid`` applied to ``net_input(theta, x)``."""
    linear_term = net_input(theta, x)
    return sigmoid(linear_term)

def cost_function(theta, x, y):
    """Binary cross-entropy of the logistic model, divided by the row count of ``x``.

    Computes ``-(1/m) * sum(y*log(p) + (1-y)*log(1-p))`` with
    ``p = sigmoid(x . theta)`` and ``m = x.shape[0]``, in the algebraically
    equivalent form ``log(1 + exp(z)) - y*z``.

    Args:
        theta: weight vector.
        x: feature matrix with one row per sample.
        y: 0/1 targets, same shape as ``np.dot(x, theta)``.

    Returns:
        float: the total cost.
    """
    m = x.shape[0]
    z = np.dot(x, theta)
    # Taking log() of a saturated sigmoid gives log(0) = -inf and then
    # 0 * -inf = nan; logaddexp(0, z) = log(1 + e^z) stays finite for any z.
    per_sample = np.logaddexp(0, z) - y * z
    return (1 / m) * np.sum(per_sample)

def gradient(theta, x, y):
    """Gradient of the cost w.r.t. ``theta``: ``(1/m) * x^T (sigmoid(x . theta) - y)``."""
    sample_count = x.shape[0]
    residual = sigmoid(net_input(theta, x)) - y
    return np.dot(x.T, residual) * (1 / sample_count)

def fit(x, y, theta):
    """Minimise ``cost_function`` with scipy's truncated Newton solver.

    Args:
        x: feature matrix passed through to ``cost_function`` / ``gradient``.
        y: array-like targets; flattened to 1-D before optimisation.
        theta: initial weight vector.

    Returns:
        numpy.ndarray: the first element of the ``fmin_tnc`` result, i.e. the
        optimised weights.
    """
    # `fmin_tnc` was never imported in this notebook, so the original call
    # raised NameError; import it where it is used.
    from scipy.optimize import fmin_tnc

    targets = np.asarray(y).flatten()
    opt_weights = fmin_tnc(func=cost_function, x0=theta,
                           fprime=gradient, args=(x, targets))
    return opt_weights[0]

In [49]:
pi = 0.1 # p(z==1)
# Vectorised sum computed once; builtin sum() iterates the Series element by
# element in Python and was evaluated twice.
# n_u: total of question_result (the count of 1s if the column is strictly 0/1 — TODO confirm).
n_u = data_train['question_result'].sum()
# n_0: number of rows minus that total.
n_0 = len(data_train['question_result']) - n_u

In [50]:
# Run N rounds of `em_step`, re-scoring the 'rating_new' column on the test
# frame after every round.
# NOTE(review): `em_step` is not defined in the visible cells — confirm it is
# defined before this cell runs.
N = 3
for step_number in range(N):
    # The first round has no previous z to pass; later rounds feed back a copy
    # of the z returned by the preceding round.
    z_new, coefs, player_rating = em_step(
        X_train=X_train.copy(),
        y_train=y_train.copy(),
        z_train=None if step_number == 0 else z_new.copy(),
        write_rating_to=player_rating,
        rating_name='rating_new',
    )

    spearman, kendal = check_quality(
        test_df=data_test.copy(),
        rating_df=player_rating.copy(),
        rating_name='rating_new',
    )
    print('Шаг', step_number+1)
    print('Корреляция Спирмана:', spearman)
    print('Корреляция Кендала:', kendal)

Шаг 1
Корреляция Спирмана: -0.678335498462195
Корреляция Кендала: -0.5217955726725476

Шаг 2
Корреляция Спирмана: -0.6849846342261009
Корреляция Кендала: -0.528061876457905

Шаг 3
Корреляция Спирмана: -0.6852120172712344
Корреляция Кендала: -0.5285588821488211



# 5

In [51]:
# Attach each player's 'rating_new', sum it within every
# (tournament, team, question, result) group, then average those sums per tournament.
temp_data_train = (
    data_train
    .merge(player_rating[['player_id', 'rating_new']], how='left', on='player_id')
    .groupby(['tournament_id', 'tournament_name', 'team_id', 'question_num', 'question_result'])
    ['rating_new'].sum()
    .reset_index()
    .groupby(['tournament_id', 'tournament_name'])
    ['rating_new'].mean()
    .reset_index()
)

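Самые лёгкие турниры — с наименьшим средним значением `rating_new` (сумма рейтингов игроков команды, усреднённая по турниру):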

In [52]:
# Ten tournaments with the smallest mean 'rating_new'.
(
    temp_data_train[['tournament_name', 'rating_new']]
    .sort_values('rating_new')
    .head(10)
)

Unnamed: 0,tournament_name,rating_new
619,One ring - async,0.128201
376,Чемпионат Таджикистана,0.136663
633,ДР Земцовского,0.168761
69,Парный асинхронный турнир ChGK is...,0.209183
557,Открытый Студенческий чемпионат Краснодарского...,0.218041
140,Зимник,0.227262
174,Зимние игры,0.237733
286,Чемпионат Туркменистана,0.238974
630,Открытый кубок МВУТ,0.245495
321,Чемпионат Кыргызстана,0.247279


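Самые сложные турниры — с наибольшим средним значением `rating_new` (сумма рейтингов игроков команды, усреднённая по турниру):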

In [53]:
# Ten tournaments with the largest mean 'rating_new'.
(
    temp_data_train[['tournament_name', 'rating_new']]
    .sort_values('rating_new', ascending=False)
    .head(10)
)

Unnamed: 0,tournament_name,rating_new
643,Чемпионат Санкт-Петербурга. Высшая лига,0.815263
541,Чемпионат Мира. Финал. Группа А,0.783616
538,Чемпионат Мира. Этап 3. Группа А,0.779232
534,Чемпионат Мира. Этап 2. Группа А,0.778732
531,Чемпионат Мира. Этап 1. Группа А,0.734404
452,Шестой киевский марафон. Асинхрон,0.728915
175,Чемпионат России,0.691498
535,Чемпионат Мира. Этап 2. Группа В,0.68615
542,Чемпионат Мира. Финал. Группа В,0.685547
539,Чемпионат Мира. Этап 3. Группа В,0.685066
