In [1]:
import sqlite3
import pandas as pd
import matplotlib as rc
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [2]:
#------------------------------------------------------------------------------
# accept a dataframe, remove outliers, return cleaned data in a new dataframe
# see http://www.itl.nist.gov/div898/handbook/prc/section1/prc16.htm
#------------------------------------------------------------------------------
def remove_outlier(df_in, col_name):
    q1 = df_in[col_name].quantile(0.25)
    q3 = df_in[col_name].quantile(0.75)
    iqr = q3-q1 #Interquartile range
    fence_low  = q1-1.5*iqr
    fence_high = q3+1.5*iqr
    df_out = df_in.loc[(df_in[col_name] > fence_low) & (df_in[col_name] < fence_high)]
    return df_out

# set style for seaborn
sns.set_style("whitegrid")
sns.set_context("paper")
rc.use("pgf")
rc.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
})
#plt.rc('text', usetex=True)
#plt.rc('font', family='serif')


In [3]:
# connect to database
connection = sqlite3.connect("../output/db/bulk_analysis_2019-11-08_1000ms.db")


In [4]:
# load games and moves into dataframes 
df_game_dirty = pd.read_sql("SELECT * FROM game", connection)
df_move_dirty = pd.read_sql("SELECT * FROM move", connection)

# remove games that are either too short or too long
df_game = remove_outlier(df_game_dirty, "length")
df_move = df_move_dirty[df_move_dirty["game_id"].isin(df_game["id"])]
df_move.head()


Unnamed: 0,id,fullmove_number,ply_number,turn,san,lan,score,score_change,score_change_category,move_count,...,guarded_pieces_centipawn_all,attacked_guarded_pieces_centipawn_all,unopposed_threats_centipawn_all,threats_centipawn_all,attack_defense_relation1,attack_defense_relation2,material,pawn_ending,rook_ending,game_id
0,1,1,1,1,d4,d2-d4,-10,10,0.2,20,...,5800,0,0,0,0,-600,0,0,0,1
1,2,1,2,0,g6,g7-g6,118,128,2.56,20,...,5800,0,0,0,0,-500,0,0,0,1
2,3,2,3,1,c4,c2-c4,37,81,1.62,28,...,5700,0,0,0,0,-1500,0,0,0,1
3,4,2,4,0,Bg7,Bf8-g7,106,69,1.38,21,...,5900,100,0,0,600,-2100,0,0,0,1
4,5,3,5,1,Nf3,Ng1-f3,43,63,1.26,30,...,5900,100,0,0,900,-1300,0,0,0,1


In [5]:
# box_dirty = sns.boxplot(y=df_move["best_move_score_diff"])

In [6]:
df_move_mistake_clean = remove_outlier(df_move, "best_move_score_diff")
#
plt.tight_layout()
f, axes = plt.subplots(1, 2, figsize=(8, 1.5))
box_dirty = sns.boxplot(x=df_move["best_move_score_diff"], ax=axes[0])
axes[0].set_title("Best move score difference, including outliers")
axes[0].set_xlabel("centipawn")
box_clean = sns.boxplot(x=df_move_mistake_clean["best_move_score_diff"], ax=axes[1])
axes[1].set_title("Best move score difference, excluding outliers")
axes[1].set_xlabel("centipawn")


Text(0.5, 0, 'centipawn')

In [7]:
f.tight_layout()
f.savefig("boxplot_mistakes.pgf")
f.savefig("boxplot_mistakes.pdf")


In [8]:
#scatter_mistake = sns.scatterplot(x="ply_number", y="best_move_score_diff", data=df_move)

In [9]:
#scatter_mistake = sns.scatterplot(x="ply_number", y="best_move_score_diff", data=df_move_mistake_clean)

In [10]:

#sns.distplot(df_move["best_move_score_diff"])


In [11]:
#sns.distplot(df_move_mistake_clean["best_move_score_diff"])

In [12]:
#sns.jointplot(x="ply_number", y="best_move_score_diff", data=df_move_mistake_clean, kind="kde");

In [13]:

df_move_2040_dirty = df_move[df_move["game_id"].isin(df_game.query('length >= 20 & length <= 40')["id"])]
df_move_2040 = remove_outlier(df_move_2040_dirty, "best_move_score_diff")
df_move_4060_dirty = df_move[df_move["game_id"].isin(df_game.query('length >= 40 & length <= 60')["id"])]
df_move_4060 = remove_outlier(df_move_4060_dirty, "best_move_score_diff")
df_move_6080_dirty = df_move[df_move["game_id"].isin(df_game.query('length >= 60 & length <= 80')["id"])]
df_move_6080 = remove_outlier(df_move_6080_dirty, "best_move_score_diff")
df_move_80100_dirty = df_move[df_move["game_id"].isin(df_game.query('length >= 80 & length <= 100')["id"])]
df_move_80100 = remove_outlier(df_move_80100_dirty, "best_move_score_diff")
df_move_100120_dirty = df_move[df_move["game_id"].isin(df_game.query('length >= 100 & length <= 120')["id"])]
df_move_100120 = remove_outlier(df_move_100120_dirty, "best_move_score_diff")

In [14]:
#sns.jointplot(x="ply_number", y="best_move_score_diff", data=df_move_100120, kind="kde");

In [15]:

f2, axes = plt.subplots(2, 2, figsize=(8, 8))
sns.jointplot(x="ply_number", y="best_move_score_diff", data=df_move_2040, kind="kde", ax=axes[0][0]);
axes[0][0].set_title("Kernel density for mistakes in games with length 20 - 40")
axes[0][0].set_xlabel("ply")
axes[0][0].set_ylabel("centipawn")
sns.jointplot(x="ply_number", y="best_move_score_diff", data=df_move_4060, kind="kde", ax=axes[0][1]);
axes[0][1].set_title("Kernel density for mistakes in games with length 40 - 60")
axes[0][1].set_xlabel("ply")
axes[0][1].set_ylabel("centipawn")
sns.jointplot(x="ply_number", y="best_move_score_diff", data=df_move_6080, kind="kde", ax=axes[1][0]);
axes[1][0].set_title("Kernel density for mistakes in games with length 60 - 80")
axes[1][0].set_xlabel("ply")
axes[1][0].set_ylabel("centipawn")
sns.jointplot(x="ply_number", y="best_move_score_diff", data=df_move_80100, kind="kde", ax=axes[1][1]);
axes[1][1].set_title("Kernel density for mistakes in games with length 80 - 100")
axes[1][1].set_xlabel("ply")
axes[1][1].set_ylabel("centipawn")


Text(0, 0.5, 'centipawn')

In [16]:
f2.tight_layout()
f2.savefig("kdemistakelengths.pgf")
f2.savefig("kdemistakelengths.pdf")


In [17]:
sns.scatterplot(x="best_move_score_diff", y="score_change", data=df_move_mistake_clean)

<matplotlib.axes._subplots.AxesSubplot at 0x17f29290>

In [18]:
df_move_80100_normal_dirty = df_move[df_move["game_id"].isin(df_game.query('length >= 80 & length <= 100')["id"])].query('best_move_score_diff < 30')
df_move_80100_normal = remove_outlier(df_move_80100_normal_dirty, "best_move_score_diff")
df_move_80100_inacc_dirty = df_move[df_move["game_id"].isin(df_game.query('length >= 80 & length <= 100')["id"])].query('best_move_score_diff >= 30 & best_move_score_diff < 100')
df_move_80100_inacc = remove_outlier(df_move_80100_inacc_dirty, "best_move_score_diff")
df_move_80100_mistake_dirty = df_move[df_move["game_id"].isin(df_game.query('length >= 80 & length <= 100')["id"])].query('best_move_score_diff >= 100 & best_move_score_diff < 300')
df_move_80100_mistake = remove_outlier(df_move_80100_mistake_dirty, "best_move_score_diff")
df_move_80100_blunder_dirty = df_move[df_move["game_id"].isin(df_game.query('length >= 80 & length <= 100')["id"])].query('best_move_score_diff >= 300')
df_move_80100_blunder = remove_outlier(df_move_80100_blunder_dirty, "best_move_score_diff")

In [19]:
#sns.jointplot(x="ply_number", y="best_move_score_diff", data=df_move_100120, kind="kde");

In [25]:

f, axes = plt.subplots(2, 2, figsize=(8, 8))
sns.jointplot(x="ply_number", y="best_move_score_diff", data=df_move_80100_normal, kind="kde", shade_lowest=False, ax=axes[0][0]);
axes[0][0].set_title("Kernel density for normal moves in games with length 20 - 40")
axes[0][0].set_xlabel("ply")
axes[0][0].set_ylabel("centipawn")
#plot([0, ])
sns.jointplot(x="ply_number", y="best_move_score_diff", data=df_move_80100_inacc, kind="kde", shade_lowest=False, ax=axes[0][1]);
axes[0][1].set_title("Kernel density for inaccuracies in games with length 40 - 60")
axes[0][1].set_xlabel("ply")
axes[0][1].set_ylabel("centipawn")
sns.jointplot(x="ply_number", y="best_move_score_diff", data=df_move_80100_mistake, kind="kde", shade_lowest=False, ax=axes[1][0]);
axes[1][0].set_title("Kernel density for mistakes in games with length 60 - 80")
axes[1][0].set_xlabel("ply")
axes[1][0].set_ylabel("centipawn")
sns.jointplot(x="ply_number", y="best_move_score_diff", data=df_move_80100_blunder, kind="kde", shade_lowest=False, ax=axes[1][1]);
axes[1][1].set_title("Kernel density for blunders in games with length 80 - 100")
axes[1][1].set_xlabel("ply")
axes[1][1].set_ylabel("centipawn")


  f = plt.figure(figsize=(height, height))


Text(0, 0.5, 'centipawn')

In [26]:
f.tight_layout()
f.savefig("kdemistakedist80100.pgf")
f.savefig("kdemistakedist80100.pdf")

In [21]:

sns.distplot(df_move_80100_normal["best_move_score_diff"])
sns.distplot(df_move_80100_inacc["best_move_score_diff"])
sns.distplot(df_move_80100_mistake["best_move_score_diff"])
df_move_80100_blunder_cut = df_move_80100_blunder.query('best_move_score_diff < 500')
dist_mistakes = sns.distplot(df_move_80100_blunder_cut["best_move_score_diff"])
dist_mistakes.set_xlabel("centipawn")

Text(0.5, 49.99999999999995, 'centipawn')