# ECO variance comparison
To see how our variance measure compares with traditional win% risk measures.

In [1]:
import sys
import networkx as nx
import matplotlib as plt
import numpy as np
import re
import chess
import subprocess
import timeit
import pickle
import stockfish
from stockfish import Stockfish
import chess
import chess.engine
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Two independent wrappers are started on the same Stockfish binary:
# `engine` is python-chess's UCI engine handle, `stockfish` is the
# `stockfish` package's wrapper (this rebinds the name of the imported
# `stockfish` module to the instance).
# NOTE(review): nothing in this section shuts either engine down -- confirm
# they are closed at the end of the session.
STOCKFISH_PATH = "../stockfish-10-64"
engine = chess.engine.SimpleEngine.popen_uci(STOCKFISH_PATH)
stockfish = Stockfish(STOCKFISH_PATH)

## Load functions

In [2]:
def varWeighted(scores, counts):
    """Return the count-weighted, bias-corrected variance of ``scores``.

    Each score is weighted by the positionally matching entry of ``counts``.
    With ``V1 = sum(counts)`` and ``V2 = sum(count**2 for count in counts)``
    the result is::

        V1 / (V1**2 - V2) * sum(count * (score - weighted_mean)**2)

    Args:
        scores: Sequence of numeric values.
        counts: Sequence of numeric weights, paired with ``scores`` by index.

    Returns:
        The weighted variance, or ``None`` when it is undefined: fewer than
        two weights, weights that sum to zero, or ``V1**2 == V2`` (for
        example when only one weight is non-zero).
    """
    if len(counts) < 2:
        return None

    V1 = sum(counts)
    V2 = sum(count ** 2 for count in counts)
    denominator = V1 ** 2 - V2
    # Both divisions below would raise ZeroDivisionError for degenerate
    # weights; report "undefined" the same way as the too-few-samples case.
    if V1 == 0 or denominator == 0:
        return None

    weightedMean = sum(score * count for score, count in zip(scores, counts)) / V1
    weightedSquares = sum(
        count * (score - weightedMean) ** 2
        for score, count in zip(scores, counts)
    )
    return (V1 / denominator) * weightedSquares
    
def get_score(san):
    """Return the engine's evaluation of the position reached by ``san``.

    The moves are played from the initial position and the module-level
    ``engine`` is asked for a 0.05 second analysis limited to score info.

    Args:
        san: Iterable of moves in standard algebraic notation, in play order.

    Returns:
        The score's string form converted to ``int``.  When that string
        contains ``'#'`` the sentinel ``39765`` is returned instead, negated
        if the string also contains ``'-'``.
        NOTE(review): presumably ``'#'`` marks a forced-mate score and the
        integer is in centipawns -- confirm against python-chess's score
        formatting, including whose point of view the sign refers to.
    """
    board = chess.Board()
    for move in san:
        board.push_san(move)

    info = engine.analyse(board, chess.engine.Limit(time=.05), info=chess.engine.INFO_SCORE)
    score_text = str(info['score'])

    # Ordinary evaluation: the string is a plain signed integer.
    if '#' not in score_text:
        return int(score_text)

    # '#'-scores carry no usable number here; collapse them to a fixed sentinel.
    return -39765 if '-' in score_text else 39765

def san_to_fen(san):
    """Replay SAN moves from the initial position and return the resulting FEN.

    Args:
        san: Iterable of moves in standard algebraic notation, in play order.

    Returns:
        The FEN string of the position after the last move (the starting
        position when ``san`` is empty), or ``None`` if any move cannot be
        played.
    """
    board = chess.Board()
    for move in san:
        try:
            board.push_san(move)
        except ValueError:
            # python-chess reports unparsable, illegal and ambiguous SAN as
            # ValueError.  Stop at the first such move: pushing the remaining
            # moves onto the wrong position could still succeed and produce
            # a FEN for a line that was never played.
            return None
    return board.fen()

def get_node_sd(node):
    """Return ``(node, sd)``, the weighted score deviation around ``node``.

    For every neighbor of ``node`` in the module-level graph ``g`` this reads
    the neighbor's ``'score'`` attribute and, as its weight, the entry for
    ``node`` inside the neighbor's ``'movelistCount'`` attribute.  ``sd`` is
    the square root of ``varWeighted`` over those (score, weight) pairs.

    Args:
        node: Key of a node in ``g``.

    Returns:
        A ``(node, sd)`` tuple.  ``sd`` is ``None`` when it cannot be
        computed: ``node`` is not in the graph, a neighbor lacks one of the
        attributes (or has no count for ``node``), or the weighted variance
        is undefined.
    """
    scores = []
    counts = []
    try:
        for neighbor in g.neighbors(node):
            # Read this neighbor's attributes directly rather than building a
            # whole-graph attribute dict for every neighbor.
            attrs = g.nodes[neighbor]
            scores.append(attrs['score'])
            counts.append(attrs['movelistCount'][node])
        variance = varWeighted(scores, counts)
    except (LookupError, TypeError, ZeroDivisionError, nx.NetworkXError):
        # Missing node/attribute, unhashable key or degenerate weights: the
        # deviation is simply unavailable.  Other errors (and Ctrl-C) propagate.
        return node, None

    if variance is None:
        return node, None
    return node, np.sqrt(variance)

## Load games graph

In [3]:
# Load the pre-built games graph and show how many nodes it holds.
# NOTE(review): a .gpickle file is presumably a plain pickle; unpickling can
# execute arbitrary code, so only load graph files from a trusted source.
g = nx.read_gpickle('full690k.gpickle')
g.number_of_nodes()

10346

In [4]:
# Get all ECO openings from .tsv files, not using this since the win % file has more info

# import pandas as pd
# import glob

# path = r'../eco_files/' # use your path
# all_files = glob.glob(path + "/*.tsv")

# li = []

# for filename in all_files:
#     df = pd.read_csv(filename, index_col=None, header=0, sep = '\t')
#     li.append(df)

# eco_df = pd.concat(li, axis=0, ignore_index=True)
# print(eco_df.shape)
# eco_df.head(2)

## Read in df

In [5]:
# Load the ECO opening statistics, then derive for every opening its list of
# SAN moves, the FEN reached after those moves, and the number of moves.
eco_df = pd.read_csv('../eco_files/ECO_win_percent_chesstempo.csv')

# Collapse any run of whitespace to a single space before splitting, so each
# list element is exactly one SAN token.
eco_df['san'] = eco_df['san'].astype(str).map(
    lambda text: " ".join(text.split()).split(' ')
)
eco_df['fen'] = eco_df['san'].map(san_to_fen)
eco_df['move_count'] = eco_df['san'].map(len)

# A missing FEN means san_to_fen could not replay that opening's moves.
print("Total NA fen:", eco_df['fen'].isna().sum())
eco_df.head(2)

Total NA fen: 0


Unnamed: 0,Opening,Num Games,ECO,Perf Rating,Avg Player,Player Win %,Draw %,Opponent Win %,san,fen,move_count
0,"Sicilian Defense, Closed Variation",26244,B23,2112,2087,0.37,0.24,0.39,"[e4, c5, Nc3]",rnbqkbnr/pp1ppppp/8/2p5/4P3/2N5/PPPP1PPP/R1BQK...,3
1,"King's Indian Attack, General",23034,A07,2316,2299,0.4,0.34,0.26,"[Nf3, d5, g3]",rnbqkbnr/ppp1pppp/8/3p4/8/5NP1/PPPPPP1P/RNBQKB...,3


In [6]:
# Rebuild the FEN column from the SAN move lists and preview the frame.
eco_df['fen'] = eco_df['san'].map(san_to_fen)
eco_df.head(2)

Unnamed: 0,Opening,Num Games,ECO,Perf Rating,Avg Player,Player Win %,Draw %,Opponent Win %,san,fen,move_count
0,"Sicilian Defense, Closed Variation",26244,B23,2112,2087,0.37,0.24,0.39,"[e4, c5, Nc3]",rnbqkbnr/pp1ppppp/8/2p5/4P3/2N5/PPPP1PPP/R1BQK...,3
1,"King's Indian Attack, General",23034,A07,2316,2299,0.4,0.34,0.26,"[Nf3, d5, g3]",rnbqkbnr/ppp1pppp/8/3p4/8/5NP1/PPPPPP1P/RNBQKB...,3


In [7]:
# For each ECO opening, find the first graph node whose key contains the
# opening's FEN and record the weighted score SD over that node's neighbors.
# The entry is None when no node matches or the SD cannot be computed.
eco_list = list(eco_df['fen'])
node_fen = list(g.nodes())
sd_list = []
for eco in eco_list:
    # A missing FEN (None/NaN) can never match a node, and `eco in fen`
    # would raise TypeError for it.
    if not isinstance(eco, str):
        sd_list.append(None)
        continue

    # Stop scanning at the first matching node instead of collecting them all.
    currFen = next((str(fen) for fen in node_fen if eco in fen), None)
    if currFen is None:
        sd_list.append(None)
        continue

    # get_node_sd returns (node, sd) and maps lookup failures to sd=None.
    sd_list.append(get_node_sd(currFen)[1])

#### Check how many NAs are in the df, i.e. how many ECO FENs do not show up in the graph

In [8]:
# Attach the per-opening SD values and report how many openings have none.
eco_df['sd'] = sd_list
total_openings = len(eco_df['fen'])
missing_sd = eco_df['sd'].isna().sum()
print("Total ECO:", total_openings)
print("Total NA:", missing_sd)

Total ECO: 384
Total NA: 308


In [9]:
# Select every row that has a missing value in at least one column.
is_NaN = eco_df.isna()
row_has_NaN = is_NaN.any(axis='columns')
rows_with_NaN = eco_df.loc[row_has_NaN]

print(rows_with_NaN.shape)
rows_with_NaN.head()

(308, 12)


Unnamed: 0,Opening,Num Games,ECO,Perf Rating,Avg Player,Player Win %,Draw %,Opponent Win %,san,fen,move_count,sd
6,"Sicilian Defense, Najdorf Variation, English A...",16870,B90,2303,2277,0.39,0.28,0.34,"[e4, c5, Nf3, d6, d4, cxd4, Nxd4, Nf6, Nc3, a6...",rnbqkb1r/1p2pppp/p2p1n2/8/3NP3/2N1B3/PPP2PPP/R...,11,
9,"English Opening, Anglo-Indian Defense, King's ...",15322,A15,2347,2328,0.4,0.37,0.24,"[c4, Nf6, Nf3]",rnbqkb1r/pppppppp/5n2/8/2P5/5N2/PP1PPPPP/RNBQK...,3,
10,"King's Indian Defense, Normal Variation, King'...",14459,E60,2281,2259,0.38,0.33,0.29,"[d4, Nf6, c4, g6, Nf3]",rnbqkb1r/pppppp1p/5np1/8/2PP4/5N2/PP2PPPP/RNBQ...,5,
11,"Queen Pawn Game, London System",14288,D02,2074,2049,0.36,0.31,0.33,"[d4, d5, Nf3, Nf6, Bf4]",rnbqkb1r/ppp1pppp/5n2/3p4/3P1B2/5N2/PPP1PPPP/R...,5,
14,"Sicilian Defense, Canal Attack",13659,B51,2315,2286,0.31,0.34,0.36,"[e4, c5, Nf3, d6, Bb5+]",rnbqkbnr/pp2pppp/3p4/1Bp5/4P3/5N2/PPPP1PPP/RNB...,5,


#### Sort by sd to get highest variance openings

In [10]:
# Highest-SD openings first (a new sorted frame is displayed; eco_df is unchanged).
eco_df.sort_values(by='sd', ascending=False)

Unnamed: 0,Opening,Num Games,ECO,Perf Rating,Avg Player,Player Win %,Draw %,Opponent Win %,san,fen,move_count,sd
105,"Queen Pawn Game, Levitsky Attack",4752,D00,2210,2194,0.43,0.29,0.27,"[d4, d5, Bg5]",rnbqkbnr/ppp1pppp/8/3p2B1/3P4/8/PPP1PPPP/RN1QK...,3,456.594203
5,"French Defense, Exchange Variation",18816,C01,2062,2034,0.25,0.39,0.36,"[e4, e6, d4, d5, exd5]",rnbqkbnr/ppp2ppp/4p3/3P4/3P4/8/PPP2PPP/RNBQKBN...,5,79.123390
13,"Italian Game, Classical Variation, Giuoco Pian...",13738,C53,2222,2192,0.39,0.32,0.29,"[e4, e5, Nf3, Nc6, Bc4, Bc5, c3, Nf6, d3]",r1bqk2r/pppp1ppp/2n2n2/2b1p3/2B1P3/2PP1N2/PP3P...,9,73.539105
20,"Slav Defense, Modern Line",12671,D11,2336,2308,0.45,0.33,0.22,"[d4, d5, c4, c6, Nf3]",rnbqkbnr/pp2pppp/2p5/3p4/2PP4/5N2/PP2PPPP/RNBQ...,5,72.831998
148,"French Defense, Exchange Variation, Monte Carl...",3438,C01,2099,2087,0.36,0.29,0.35,"[e4, e6, d4, d5, exd5, exd5, c4]",rnbqkbnr/ppp2ppp/8/3p4/2PP4/8/PP3PPP/RNBQKBNR ...,7,69.296465
...,...,...,...,...,...,...,...,...,...,...,...,...
379,"Nimzo-Indian Defense, Saemisch Variation, Kere...",1037,E25,2437,2404,0.33,0.44,0.24,"[d4, Nf6, c4, e6, Nc3, Bb4, a3, Bxc3+, bxc3, c...",rnbqk2r/pp3ppp/4p3/2Pn4/8/P1P2P2/4P1PP/R1BQKBN...,15,
380,"Alekhine Defense, Two Pawn Attack, Lasker Vari...",1030,B02,2200,2190,0.33,0.30,0.37,"[e4, Nf6, e5, Nd5, c4, Nb6, c5]",rnbqkb1r/pppppppp/1n6/2P1P3/8/8/PP1P1PPP/RNBQK...,7,
381,"Trompowsky Attack, Raptor Variation",1027,A45,2285,2246,0.41,0.29,0.30,"[d4, Nf6, Bg5, Ne4, h4]",rnbqkb1r/pppppppp/8/6B1/3Pn2P/8/PPP1PPP1/RN1QK...,5,
382,"Pirc Defense, Austrian Attack, Kurajica Variation",1017,B09,2304,2264,0.43,0.23,0.34,"[e4, d6, d4, Nf6, Nc3, g6, f4, Bg7, Nf3, O-O, ...",rnbq1rk1/ppp1ppbp/3p1np1/8/3PPP2/2N1BN2/PPP3PP...,11,


In [11]:
# Show the row(s) whose move list is exactly 1.e4 c5 2.c3.
target_moves = ['e4', 'c5', 'c3']
eco_df[eco_df['san'].map(lambda moves: moves == target_moves)]

Unnamed: 0,Opening,Num Games,ECO,Perf Rating,Avg Player,Player Win %,Draw %,Opponent Win %,san,fen,move_count,sd
2,"Sicilian Defense, Alapin Variation, General",21841,B22,2201,2161,0.37,0.29,0.34,"[e4, c5, c3]",rnbqkbnr/pp1ppppp/8/2p5/4P3/2P5/PP1P1PPP/RNBQK...,3,17.91967


#### Check sd compared to draw% for popular games

In [None]:
# Hand-picked reference lines (label -> SAN move list) whose SD is printed
# next to the draw rate from the ECO table.
# NOTE(review): the moves under "Vienna" (1.e4 e5 2.d4 exd4 3.c3) look like
# the Danish Gambit rather than the Vienna Game -- confirm which was intended.
gamesDict = {"root" : ['e4', 'c5'],
"safe" : ['e4', 'c5', 'c3'],
"risky" : ['e4', 'c5', 'd4'],
"root2" : ['e4', 'e6', 'd4', 'd5'],
"popular2.0" : ['e4', 'e6', 'd4', 'd5', 'Nc3'],
"popular2.1" : ['e4', 'e6', 'd4', 'd5', 'Nd2'],
"safe2" : ['e4', 'e6', 'd4', 'd5', 'exd5'],
"risky2" : ['e4', 'e6', 'd4', 'd5', 'e5'],
"Ruy" : ['e4', 'e5','Nf3', 'Nc6','Bb5'],
"Vienna" : ['e4' ,'e5','d4','exd4','c3']}

for key, game in gamesDict.items():
    # The draw rate is only reported when exactly one ECO row has this exact
    # move list; otherwise it is shown as None.
    draw_matches = eco_df.loc[eco_df['san'].apply(lambda x : x == game), 'Draw %']
    try:
        draw = float(draw_matches.iloc[0]) if len(draw_matches) == 1 else None
    except (TypeError, ValueError):
        draw = None

    currFen = san_to_fen(game)
    # get_node_sd returns (node, sd); sd is None when the position is not in
    # the graph or its SD is undefined.  Such a line is printed with SD None
    # rather than aborting the loop, so the remaining openings still appear.
    sd = get_node_sd(currFen)[1]
    print('{} {} ||| SD :'.format(key, game), sd, "DRAW:", draw)

## Plot relationship between sd and draw%

In [None]:
import matplotlib.pyplot as plt

# Histogram of the per-opening SD values.  Openings without an SD are NaN in
# the column; drop them up front so automatic bin selection only sees finite
# values regardless of how the plotting stack treats NaN.
sd_values = eco_df['sd'].dropna()
n, bins, patches = plt.hist(x=sd_values, bins='auto', color='#0504aa',
                            alpha=0.7, rwidth=0.8)
plt.grid(axis='y', alpha=0.75)
plt.xlabel('Value')
plt.ylabel('Frequency')
maxfreq = n.max()
# Upper y-limit: round up to the next multiple of 10, or add 10 when the
# tallest bar is already an exact multiple of 10.
plt.ylim(ymax=np.ceil(maxfreq / 10) * 10 if maxfreq % 10 else maxfreq + 10)
plt.xlim(xmin = 0, xmax = 500)

In [None]:
import seaborn as sns

# Correlation between draw rate and SD, followed by a regression plot of the
# same two columns.
print("CORR:",eco_df['Draw %'].corr(eco_df['sd']))
# x and y are passed by keyword: seaborn releases that make them keyword-only
# reject the positional form, while the keyword form works on older ones too.
sns.lmplot(x='Draw %', y='sd', data=eco_df)
plt.ylim(0,150)
plt.xlim(None,0.45)

In [None]:
# Correlation between opening length (number of moves) and SD, followed by a
# regression plot of the same two columns.
print("CORR:",eco_df['move_count'].corr(eco_df['sd']))
# x and y are passed by keyword: seaborn releases that make them keyword-only
# reject the positional form, while the keyword form works on older ones too.
sns.lmplot(x='move_count', y='sd', data=eco_df)
plt.ylim(0,150)
plt.xlim(0,10)