## 傾向スコアによる後攻の場合の勝率の算出
---

In [1]:
%matplotlib inline
import pandas as pd
import os
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np

  from pandas.core import datetools


### パラメータ

In [2]:
# 入力ファイル
input_dir_path = u"../../Study/data/"
input_file_name = u"result_score_data.csv"
input_file_path = os.path.join(input_dir_path, input_file_name)

### データの読み込み

In [3]:
result_score_df = pd.read_csv(input_file_path)
result_score_df

Unnamed: 0,draw,sheet,team,hammer,end1,end2,end3,end4,end5,end6,...,end8,end9,end10,extra1,extra2,score(total),result,year,gender,game_ID
0,Draw #1,A,Sweden,1,0,0,0,0,0,0,...,1,0,X,,,1,-1,2010,Men,1
1,Draw #1,A,France,0,0,1,1,0,2,0,...,0,0,X,,,5,1,2010,Men,1
2,Draw #1,B,China,0,0,0,0,0,0,1,...,0,2,0,,,4,-1,2010,Men,2
3,Draw #1,B,Germany,1,0,0,1,1,1,0,...,1,0,1,,,5,1,2010,Men,2
4,Draw #1,C,Japan,0,0,1,0,1,1,0,...,0,0,1,0.0,,6,-1,2010,Men,3
5,Draw #1,C,Italy,1,1,0,1,0,0,1,...,2,1,0,1.0,,7,1,2010,Men,3
6,Draw #1,D,Norway,0,0,0,0,0,2,0,...,1,1,0,,,4,-1,2010,Men,4
7,Draw #1,D,Scotland,1,0,0,0,2,0,0,...,0,0,2,,,5,1,2010,Men,4
8,Draw #2,A,Italy,0,0,2,0,2,0,1,...,0,0,1,0.0,,6,-1,2010,Men,5
9,Draw #2,A,China,1,1,0,2,0,0,0,...,0,1,0,1.0,,7,1,2010,Men,5


### チームの強さとして、各大会でのチーム別の勝率を算出（ラウンドロビンのみ）

In [4]:
round_robin_result_df = result_score_df[result_score_df["draw"].str.contains("Draw")]

# 勝ち数
win_game_num_df = round_robin_result_df[round_robin_result_df["result"]==1].groupby(["year", "gender", "team"], as_index=False)["draw"].count()
win_game_num_df.rename(columns={"draw":"win"}, inplace=True)
# 試合数
game_num_df = round_robin_result_df.groupby(["year", "gender", "team"], as_index=False)["draw"].count()
game_num_df.rename(columns={"draw":"total"}, inplace=True)

team_df = round_robin_result_df[["year", "gender", "team"]].drop_duplicates()
team_df = pd.merge(left=team_df, right=game_num_df, on=["year", "gender", "team"], how="left")
team_df = pd.merge(left=team_df, right=win_game_num_df, on=["year", "gender", "team"], how="left")

# データがない場合は０
team_df.fillna(value=0, inplace=True)

# チームの強さ（予選での勝率）
team_df["strength"] = team_df["win"] / team_df["total"]

# 不要なカラムは除く
team_df = team_df[["year", "gender", "team", "strength"]]

team_df

Unnamed: 0,year,gender,team,strength
0,2010,Men,Sweden,0.363636
1,2010,Men,France,0.272727
2,2010,Men,China,0.272727
3,2010,Men,Germany,0.454545
4,2010,Men,Japan,0.090909
5,2010,Men,Italy,0.272727
6,2010,Men,Norway,0.909091
7,2010,Men,Scotland,0.727273
8,2010,Men,Canada,0.818182
9,2010,Men,United States of America,0.727273


### 試合毎の形式に変換

In [5]:
# 先攻・後攻がわからないゲームは除く
print("--- data size ---")
print("data size: (%d, %d)" % result_score_df.shape)
filtered_result_score_df = result_score_df[result_score_df["hammer"] != -1]
print("data size(filetered): (%d, %d)" % filtered_result_score_df.shape)

# 試合毎に変換
hammer_team_result_score_df = filtered_result_score_df[filtered_result_score_df["hammer"]==1][["year", "gender", "game_ID", "team", "result"]]
hammer_team_result_score_df = pd.merge(left=hammer_team_result_score_df, right=team_df, on=["year", "gender", "team"])
hammer_team_result_score_df.rename(columns={"team":"hammer_team", "result":"hammer_result", "strength":"hammer_strength"}, inplace=True)
not_hammer_team_result_score_df = filtered_result_score_df[filtered_result_score_df["hammer"]==0][["year", "gender", "game_ID", "team", "result"]]
not_hammer_team_result_score_df = pd.merge(left=not_hammer_team_result_score_df, right=team_df, on=["year", "gender", "team"])
not_hammer_team_result_score_df.rename(columns={"team":"not_hammer_team", "result":"not_hammer_result", "strength":"not_hammer_strength"}, inplace=True)

game_result_df = pd.merge(left=hammer_team_result_score_df, right=not_hammer_team_result_score_df, on=["year", "gender", "game_ID"])
game_result_df

--- data size ---
data size: (3746, 21)
data size(filetered): (3728, 21)


Unnamed: 0,year,gender,game_ID,hammer_team,hammer_result,hammer_strength,not_hammer_team,not_hammer_result,not_hammer_strength
0,2010,Men,1,Sweden,-1,0.363636,France,1,0.272727
1,2010,Men,18,Sweden,-1,0.363636,Norway,1,0.909091
2,2010,Men,27,Sweden,-1,0.363636,Canada,1,0.818182
3,2010,Men,32,Sweden,-1,0.363636,Denmark,1,0.636364
4,2010,Men,38,Sweden,1,0.363636,Japan,-1,0.090909
5,2010,Men,60,Sweden,1,0.363636,Germany,-1,0.454545
6,2010,Men,2,Germany,1,0.454545,China,-1,0.272727
7,2010,Men,8,Germany,1,0.454545,Japan,-1,0.090909
8,2010,Men,12,Germany,-1,0.454545,Italy,1,0.272727
9,2010,Men,22,Germany,1,0.454545,France,-1,0.272727


### 先攻チーム・後攻チームの勝率

In [6]:
# 全試合数
game_num = game_result_df.shape[0]

# 勝ち試合数：先攻チーム
not_hammer_win_game_num = game_result_df[game_result_df["not_hammer_result"]==1].shape[0]
# 勝ち試合数：後攻チーム
hammer_win_game_num = game_result_df[game_result_df["hammer_result"]==1].shape[0]

print(u"先攻チーム勝率： %f" % (not_hammer_win_game_num / game_num))
print(u"後攻チーム勝率： %f" % (hammer_win_game_num / game_num))

先攻チーム勝率： 0.431330
後攻チーム勝率： 0.568670


### 先攻チーム・後攻チームの強さの平均

In [7]:
print("先攻チーム強さ（平均）： %f" % game_result_df["not_hammer_strength"].mean())
print("後攻チーム強さ（平均）： %f" % game_result_df["hammer_strength"].mean())

先攻チーム強さ（平均）： 0.502130
後攻チーム強さ（平均）： 0.534243


### 傾向スコアの算出

In [8]:
# 傾向スコア算出用に変換
column_list = ["result", "team", "strength", "opponent_team", "opponent_strength", "hammer"]
# 先攻チーム観点での整理
not_hammer_df = game_result_df[["not_hammer_result", "not_hammer_team", "not_hammer_strength", "hammer_team", "hammer_strength"]].copy()
not_hammer_df["hammer"] = [0] * not_hammer_df.shape[0]
not_hammer_df.columns = column_list
# 後攻チーム観点での整理
hammer_df = game_result_df[["hammer_result", "hammer_team", "hammer_strength", "not_hammer_team", "not_hammer_strength"]].copy()
hammer_df["hammer"] = [1] * not_hammer_df.shape[0]
hammer_df.columns = column_list

calc_target_df = pd.concat([not_hammer_df, hammer_df])
calc_target_df["result"] = calc_target_df["result"].apply(lambda x: 0 if x == -1 else 1)
calc_target_df

Unnamed: 0,result,team,strength,opponent_team,opponent_strength,hammer
0,1,France,0.272727,Sweden,0.363636,0
1,1,Norway,0.909091,Sweden,0.363636,0
2,1,Canada,0.818182,Sweden,0.363636,0
3,1,Denmark,0.636364,Sweden,0.363636,0
4,0,Japan,0.090909,Sweden,0.363636,0
5,0,Germany,0.454545,Sweden,0.363636,0
6,0,China,0.272727,Germany,0.454545,0
7,0,Japan,0.090909,Germany,0.454545,0
8,1,Italy,0.272727,Germany,0.454545,0
9,0,France,0.272727,Germany,0.454545,0


In [9]:
# 傾向スコアの算出
y = calc_target_df["hammer"]
X = calc_target_df[["strength", "opponent_strength"]]
lr_model = sm.Logit(y, X)
lr_result = lr_model.fit()
lr_result.summary()

Optimization terminated successfully.
         Current function value: 0.688536
         Iterations 4


0,1,2,3
Dep. Variable:,hammer,No. Observations:,3728.0
Model:,Logit,Df Residuals:,3726.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 01 Jan 2018",Pseudo R-squ.:,0.006652
Time:,19:23:27,Log-Likelihood:,-2566.9
converged:,True,LL-Null:,-2584.1
,,LLR p-value:,4.539e-09

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
strength,0.5766,0.103,5.574,0.000,0.374,0.779
opponent_strength,-0.5766,0.103,-5.574,0.000,-0.779,-0.374


In [10]:
calc_target_df["PS"] = lr_result.predict(X)
calc_target_df

Unnamed: 0,result,team,strength,opponent_team,opponent_strength,hammer,PS
0,1,France,0.272727,Sweden,0.363636,0,0.486898
1,1,Norway,0.909091,Sweden,0.363636,0,0.577987
2,1,Canada,0.818182,Sweden,0.363636,0,0.565152
3,1,Denmark,0.636364,Sweden,0.363636,0,0.539234
4,0,Japan,0.090909,Sweden,0.363636,0,0.460766
5,0,Germany,0.454545,Sweden,0.363636,0,0.513102
6,0,China,0.272727,Germany,0.454545,0,0.473814
7,0,Japan,0.090909,Germany,0.454545,0,0.447772
8,1,Italy,0.272727,Germany,0.454545,0,0.473814
9,0,France,0.272727,Germany,0.454545,0,0.473814


### 傾向スコアと真値との比較

In [11]:
tmp_df = calc_target_df[["hammer", "PS"]].copy()
tmp_df["pred_hammer"] = tmp_df["PS"].apply(lambda x: 1 if x > 0.5 else 0)
tmp_df.pivot_table(index="hammer", columns="pred_hammer", aggfunc="count")

Unnamed: 0_level_0,PS,PS
pred_hammer,0,1
hammer,Unnamed: 1_level_2,Unnamed: 2_level_2
0,1130,734
1,966,898


### 傾向スコアマッチング

In [15]:
y = calc_target_df["result"]
z1 = calc_target_df["hammer"]
ps = calc_target_df["PS"]
table = pd.concat([ps, z1, y], axis=1)

# 0.05刻みでマッチング
interval = np.arange(0, 1.05, 0.05)

match_list = []
for i in range(0, len(interval)-1):
    # 先攻（非介入群）
    not_hammer_table = table[(table['hammer']==0) & (interval[i] <= table['PS']) & (table['PS'] < interval[i+1])]
    # 後攻（介入群）
    hammer_table = table[(table['hammer']==1) & (interval[i] <= table['PS']) & (table['PS'] < interval[i+1])]
    
    # 先攻・後攻ともデータがある場合
    if (len(not_hammer_table) > 0) & (len(hammer_table) > 0):
        # 介入効果を算出
        match_list.append(hammer_table['result'].mean() - not_hammer_table['result'].mean())

print("後攻をとった場合に勝率に与える効果：")
print(np.mean(match_list))

後攻をとった場合に勝率に与える効果：
0.0433670012741


### IPW推定量の算出

In [16]:
y = calc_target_df["result"]
z1 = calc_target_df["hammer"]
ps = calc_target_df["PS"]

ipwe1 = sum((z1 * y) / ps) / sum(z1 / ps)
ipwe0 = sum(((1 - z1) * y) / (1 - ps)) / sum((1 - z1) / (1 - ps))
ipwe1 - ipwe0

0.078584030354893919

### 元データを使って層別解析

In [33]:
# 0.05刻み
interval1 = np.arange(0, 1.05, 0.05)
interval2 = np.arange(0, 1.05, 0.05)

match_list = []
for i1 in range(0, len(interval1)-1):
    for i2 in range(0, len(interval2)-1):
        # 先攻
        not_hammer_df = calc_target_df[(calc_target_df["hammer"]==0)
                                        & (interval1[i1] <= calc_target_df["strength"])
                                        & (calc_target_df["strength"] < interval1[i1+1])
                                        & (interval2[i2] <= calc_target_df["opponent_strength"])
                                        & (calc_target_df["opponent_strength"] < interval2[i2+1])]
        # 後攻
        hammer_df = calc_target_df[(calc_target_df["hammer"]==1)
                                    & (interval1[i1] <= calc_target_df["strength"])
                                    & (calc_target_df["strength"] < interval1[i1+1])
                                    & (interval2[i2] <= calc_target_df["opponent_strength"])
                                    & (calc_target_df["opponent_strength"] < interval2[i2+1])]
        # 先攻・後攻ともデータがある場合
        if (not_hammer_df.shape[0] > 0) & (hammer_df.shape[0] > 0):
            # 介入効果を算出
            match_list.append(hammer_df['result'].mean() - not_hammer_df['result'].mean())

print("後攻をとった場合に勝率に与える効果：")
print(np.mean(match_list))

後攻をとった場合に勝率に与える効果：
0.0534425086696


### ロジスティック回帰

In [37]:
y = calc_target_df["result"]
X = calc_target_df[["hammer", "strength", "opponent_strength"]]

# モデル作成
lr_model = sm.Logit(y, X)
lr_result = lr_model.fit()
lr_result.summary()

Optimization terminated successfully.
         Current function value: 0.457195
         Iterations 7


0,1,2,3
Dep. Variable:,result,No. Observations:,3728.0
Model:,Logit,Df Residuals:,3725.0
Method:,MLE,Df Model:,2.0
Date:,"Mon, 01 Jan 2018",Pseudo R-squ.:,0.3404
Time:,23:08:15,Log-Likelihood:,-1704.4
converged:,True,LL-Null:,-2584.1
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
hammer,0.4398,0.081,5.435,0.000,0.281,0.598
strength,5.6344,0.200,28.134,0.000,5.242,6.027
opponent_strength,-6.0077,0.202,-29.792,0.000,-6.403,-5.612
