## part 1 - 3

In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

import os
from collections import defaultdict

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from scipy.sparse import coo_array
from scipy import stats
from copy import deepcopy

In [2]:
# Player directory; later cells index it as players[player_id] and read
# the 'name', 'patronymic' and 'surname' fields.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted dumps.
with open('./chgk/players.pkl', 'rb') as players_file:
    players = pickle.load(players_file)

In [3]:
# Tournament metadata; later cells iterate it with .items() and read the
# 'dateStart', 'questionQty' and 'name' fields of each entry.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted dumps.
with open('./chgk/tournaments.pkl', 'rb') as tournaments_file:
    tournaments = pickle.load(tournaments_file)

In [4]:
# Per-tournament team results; later cells index it by tournament id and read
# the 'mask' and 'teamMembers' fields of each team record.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted dumps.
with open('./chgk/results.pkl', 'rb') as results_file:
    results = pickle.load(results_file)

In [5]:
# Keep only 2019/2020 tournaments that come with usable per-question results.
# 2019 tournaments go to the training split, 2020 tournaments to the test split.

train_tournaments = []
test_tournaments = []

tournament_teams = defaultdict(list)

for tournament_id, info in tournaments.items():
    season = info['dateStart'][:4]
    if season not in ('2019', '2020'):
        continue

    n_questions = sum(info['questionQty'].values())

    # A team is usable when its answer mask is present, has exactly one
    # character per tournament question and passes the digit pattern below.
    usable_teams = []
    for team in results[tournament_id]:
        if 'mask' not in team or team['mask'] is None:
            continue
        if len(team['mask']) != n_questions:
            continue
        if not re.match("[0-9]+$", team['mask']):
            continue
        usable_teams.append(team)

    # Tournaments without a single usable team are dropped entirely.
    if not usable_teams:
        continue

    tournament_teams[tournament_id] = usable_teams
    split = train_tournaments if season == '2019' else test_tournaments
    split.append(tournament_id)

In [58]:
# Collect the ids of every player who appears in at least one training tournament.

players_train = {
    member['player']['id']
    for tournament_id in train_tournaments
    for team in tournament_teams[tournament_id]
    for member in team['teamMembers']
}

In [7]:
# Map each training player id to a dense column index (iteration order of the set).
player_to_idx = dict(zip(players_train, range(len(players_train))))

In [86]:
# Flatten the training tournaments into one (player, question) observation per row.
# Column layout used later for the design matrix: player columns occupy
# [0, len(players_train)), question columns start at len(players_train).
players_encoded = []       # player column index of each observation
questions_encoded = []     # question column index of each observation
answers = []               # the team's mask digit for that question, repeated for every member
players_rating = {}        # player id -> 'rating' field of the last membership record seen
touraments_questions = {}  # tournament id -> list of its question column indices

question_counter = 0
n_players = len(players_train)

for i in train_tournaments:
    teams = tournament_teams[i]
    # Every kept team of a tournament has a mask of the same length
    # (enforced by the filtering cell), so the first team defines it.
    n_questions = len(teams[0]['mask'])
    start = n_players + question_counter
    question_columns = list(range(start, start + n_questions))
    touraments_questions[i] = question_columns

    for team in teams:
        team_answers = list(map(int, team['mask']))
        members = team['teamMembers']
        member_columns = [player_to_idx[member['player']['id']] for member in members]

        # Ratings do not depend on the question, so record them once per member
        # instead of once per (member, question) pair.
        for member in members:
            players_rating[member['player']['id']] = member['rating']

        # Row order is preserved: questions in mask order, members within each question.
        for question_column, answer in zip(question_columns, team_answers):
            players_encoded.extend(member_columns)
            questions_encoded.extend([question_column] * len(member_columns))
            answers.extend([answer] * len(member_columns))

    # Advance by the tournament's question count explicitly rather than by a
    # variable left over from the last inner-loop iteration.
    question_counter += n_questions

In [87]:
# COO triplets (value, row, column) for the two one-hot halves of the design
# matrix. Both halves share the same width: player columns plus question columns.
n_columns = len(players_train) + question_counter

n_player_rows = len(players_encoded)
data_players = [1] * n_player_rows
row_players = list(range(n_player_rows))
column_players = players_encoded
shape_players = (len(row_players), n_columns)

n_question_rows = len(questions_encoded)
data_questions = [1] * n_question_rows
row_questions = list(range(n_question_rows))
column_questions = questions_encoded
shape_questions = (len(row_questions), n_columns)

In [88]:
# Design matrix: one row per (player, question) observation; the columns are the
# player columns followed by the question columns. Every row has a 1 in its
# player's column and a 1 in its question's column, so fitting learns one
# coefficient per player and one per question.

player_block = coo_array(
    (data_players, (row_players, column_players)),
    shape=shape_players,
)
question_block = coo_array(
    (data_questions, (row_questions, column_questions)),
    shape=shape_questions,
)
X = player_block + question_block

# The two halves are no longer needed once summed; free them.
del player_block, question_block

In [89]:
# Target vector: one mask digit per observation, in the same order as the rows of X.
y = np.array(answers)

In [90]:
# Logistic regression over the one-hot player + question columns.
# The saga solver shuffles the data, so the seed is pinned to make the fitted
# coefficients (and every ranking derived from them) repeatable across runs.
model = LogisticRegression(solver='saga', n_jobs=-1, random_state=42)
model.fit(X, y)

In [13]:
# model.coef_ has shape (1, n_columns): take the single row first, then keep only
# the leading player columns. (Slicing before indexing the row would cut the row
# axis and return the whole vector, question coefficients included.)
players_ranks = model.coef_[0][:len(players_train)]

In [14]:
# One record per training player: learned coefficient, rating taken from the
# results dump, and the name fields from the player directory.
players_ranking_pred = []

for player_id in players_train:
    profile = players[player_id]
    record = {
        'player_id': player_id,
        'rank_pred': players_ranks[player_to_idx[player_id]],
        'rank_true': players_rating[player_id],
    }
    for field in ('name', 'patronymic', 'surname'):
        record[field] = profile[field]
    players_ranking_pred.append(record)

In [60]:
# Tabular view of the per-player records (one row per training player).
players_ranking_pred_df = pd.DataFrame(players_ranking_pred)

In [16]:
# Order by the rating value stored in 'rank_true' (highest first) so that the
# next cell can replace it with a 1-based position.
# NOTE(review): not idempotent - re-running after 'rank_true' has been replaced
# by positions reverses the order.
players_ranking_pred_df = players_ranking_pred_df.sort_values(by='rank_true', ascending=False)

In [17]:
# Replace the rating value with its 1-based position in the current row order.
# Plain column assignment swaps in the integer array regardless of the old
# column's dtype; `.loc[:, col] = ...` writes into the existing column, so the
# resulting dtype depends on the pandas version.
players_ranking_pred_df['rank_true'] = np.arange(1, len(players_ranking_pred_df) + 1)

In [18]:
# Order by the learned player coefficient (largest first) so that the next cell
# can replace it with a 1-based position.
# NOTE(review): not idempotent - re-running after 'rank_pred' has been replaced
# by positions reverses the order.
players_ranking_pred_df = players_ranking_pred_df.sort_values(by='rank_pred', ascending=False)

In [19]:
# Replace the float coefficient with its 1-based position in the current row order.
# Plain column assignment swaps in the integer array; `.loc[:, col] = ints` on a
# float column triggers a pandas warning and, depending on the pandas version,
# keeps the float dtype.
players_ranking_pred_df['rank_pred'] = np.arange(1, len(players_ranking_pred_df) + 1)

  players_ranking_pred_df.loc[:, 'rank_pred'] = np.arange(1, len(players_ranking_pred_df) + 1)


In [20]:
# Ten players the model ranks highest, shown next to their rating-based position.
players_ranking_pred_df.head(10)

Unnamed: 0,player_id,rank_pred,rank_true,name,patronymic,surname
9361,27403,1,3,Максим,Михайлович,Руссо
1409,4270,2,6,Александра,Владимировна,Брутер
9874,28751,3,5,Иван,Николаевич,Семушин
9541,27822,4,2,Михаил,Владимирович,Савченков
10380,30270,5,4,Сергей,Леонидович,Спешков
10340,30152,6,1,Артём,Сергеевич,Сорожкин
6000,18036,7,28,Михаил,Ильич,Левандовский
6917,20691,8,66,Станислав,Григорьевич,Мереминский
49576,87637,9,183,Антон,Владимирович,Саксонов
7681,22799,10,17,Сергей,Игоревич,Николенко


In [61]:
def predict_ranking(model, team, player_index=None):
    """Score a team by the probability that at least one member answers correctly.

    Each member becomes one input row with a single 1 in that player's column;
    no question column is set. The score is ``1 - prod(P(member answers wrong))``.

    Parameters
    ----------
    model : fitted classifier exposing ``coef_`` and ``predict_proba``.
    team : mapping with a ``'teamMembers'`` list whose items carry
        ``member['player']['id']``.
    player_index : mapping of player id -> column index, optional.
        Defaults to the notebook-level ``player_to_idx``.

    Returns
    -------
    float
        Team score in [0, 1]; ``0.0`` for a team without members
        (the empty product of "wrong" probabilities is 1).

    Raises
    ------
    KeyError
        If a member's id is missing from the player index.
    """
    index = player_to_idx if player_index is None else player_index
    member_columns = [index[member['player']['id']] for member in team['teamMembers']]

    if not member_columns:
        # Nothing to predict on; also avoids passing a zero-row matrix to the model.
        return 0.0

    n_members = len(member_columns)
    X = coo_array(
        ([1] * n_members, (list(range(n_members)), member_columns)),
        shape=(n_members, len(model.coef_[0])),
    )

    # Probability of a correct answer: 1 - probability that every player answers wrong.
    return 1 - model.predict_proba(X)[:, 0].prod()

In [68]:
# For every test tournament collect, per team, the number of correctly answered
# questions (sum of the mask digits) and the model's team score. Only members
# seen during training are passed to the model; teams with no such member and
# tournaments with no scored team are skipped.
test_tournaments_rating = []
test_tournaments_rating_pred = []

for i in test_tournaments:
    tournament_rating = []
    tournament_rating_pred = []

    for team in tournament_teams[i]:
        known_members = [player for player in team['teamMembers']
                         if player['player']['id'] in players_train]
        if not known_members:
            continue

        # A shallow copy with the member list swapped is enough: the original
        # record stays untouched and no deep copy of the nested data is needed.
        team_filtered = {**team, 'teamMembers': known_members}

        rating_true = sum(map(int, team_filtered['mask']))
        rating_pred = predict_ranking(model, team_filtered)

        tournament_rating.append(rating_true)
        tournament_rating_pred.append(rating_pred)

    if tournament_rating:
        test_tournaments_rating.append(tournament_rating)
        test_tournaments_rating_pred.append(tournament_rating_pred)

In [69]:
def rank_correlation(test_tournaments_rating, test_tournaments_rating_pred):
    """Average per-tournament rank correlation between true and predicted scores.

    Both arguments are parallel sequences with one inner sequence per tournament.
    Returns a ``(mean Spearman, mean Kendall tau)`` tuple.
    """
    spearman_scores = []
    kendall_scores = []

    for observed, predicted in zip(test_tournaments_rating, test_tournaments_rating_pred):
        spearman_scores.append(stats.spearmanr(observed, predicted).correlation)
        kendall_scores.append(stats.kendalltau(observed, predicted).correlation)

    return np.mean(spearman_scores), np.mean(kendall_scores)

In [70]:
# Mean Spearman and Kendall correlations over the test tournaments.
rank_correlation(test_tournaments_rating, test_tournaments_rating_pred)

(0.7920070096960519, 0.6353767154753027)

## part 5

In [96]:
# Full coefficient vector (player columns first, question columns after them).
# It is indexed below with the absolute question column indices stored in
# touraments_questions, so despite the name it is not restricted to questions.
questions_ranking = model.coef_[0]

In [97]:
# Per-tournament score: the mean coefficient of the tournament's questions.
tournament_names = []
tournament_scores = []

for tournament_id in train_tournaments:
    question_coefs = questions_ranking[touraments_questions[tournament_id]]
    tournament_names.append(tournaments[tournament_id]['name'])
    tournament_scores.append(question_coefs.sum() / len(question_coefs))

tournaments_ranks = {'name': tournament_names, 'score': tournament_scores}

In [103]:
# One row per training tournament: its name and mean question coefficient.
top_questions = pd.DataFrame(tournaments_ranks)

In [105]:
# Hardest tournaments: lowest mean question coefficient first.
top_questions.sort_values(by='score').head(15) 

Unnamed: 0,name,score
603,Чемпионат Санкт-Петербурга. Первая лига,-4.437388
488,Угрюмый Ёрш,-2.303805
35,Первенство правого полушария,-2.076394
579,Воображаемый музей,-1.984692
254,Записки охотника,-1.802301
12,Кубок городов,-1.732424
22,Ускользающая сова,-1.693091
342,Знание – Сила VI,-1.682342
200,Чемпионат Минска. Лига А. Тур четвёртый,-1.567977
157,Чемпионат России,-1.567505


In [106]:
# Easiest tournaments: the highest mean question coefficients (ascending order, so the easiest is last).
top_questions.sort_values(by='score').tail(15) 

Unnamed: 0,name,score
265,Межфакультетский кубок МГУ. Отбор №4,1.709038
542,Второй тематический турнир имени Джоуи Триббиани,1.733045
545,Малый кубок Физтеха,1.767997
351,(а)Синхрон-lite. Лига старта. Эпизод X,1.855258
510,Школьная лига. III тур.,1.862519
613,Школьная лига,1.87239
62,(а)Синхрон-lite. Лига старта. Эпизод VI,1.904319
493,Школьная лига. I тур.,1.916568
8,(а)Синхрон-lite. Лига старта. Эпизод IV,1.947398
150,Студенческий чемпионат Калининградской области,1.976759


Топы турниров соответствуют интуиции: самые сложные — чемпионаты мира, стран и крупных городов, самые лёгкие — синхроны и школьные лиги.

## part 4

In [114]:
from scipy.special import logit

In [128]:
# Initial per-observation targets: the model's predicted probability of class 1,
# multiplied by the observed label (so rows with y == 0 start at 0).
# NOTE(review): assumes y only contains 0/1 mask digits - confirm.
eps = 1e-5
init_y = model.predict_proba(X)[:, 1] * y
# Keep values strictly inside (0, 1); presumably so that the logit imported
# above stays finite - confirm against the cells that follow.
init_y = np.clip(init_y, eps, 1-eps)