In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import glob
from sklearn.model_selection import train_test_split

move_bins_for_WCL=np.arange(0,150,5)
eval_bins=np.arange(-20.05,20.15,0.1)
move_bins_for_eval=move_bins_for_WCL


### Get List of Games, Train Test Split

In [None]:
import anal_games, functions_anal

# Get the list of filenames matching the patterns
filenames_15 = glob.glob("../Cleaned_Analyzed_Games/twic*_15_processed.csv")
filenames_16 = glob.glob("../Cleaned_Analyzed_Games/twic*_16_processed.csv")

# For dupes, use the bigger depth
filenames_to_process=filenames_16
for file in filenames_15:
    if '_'.join(file.split('_')[:3])+'_16_processed.csv' in filenames_to_process:
        continue
    else:
        filenames_to_process.append(file)

outfile='../Cleaned_Analyzed_Games/all_games_cleaned.csv'

# make list of games 
anal_games.process_all_files(outfile=outfile,filenames=filenames_to_process,functions=[functions_anal.MovesTotal,functions_anal.Cleanup,functions_anal.MovesBlack,functions_anal.MovesWhite],skip_if_processed=True,game_wise=True)

df=pd.read_csv(outfile)

df_train,df_test=train_test_split(df,test_size=0.2,random_state=100) # stratification with number of moves or elos doesn't work, as it needs at least two games for each unique value/combination of values. Binning doesn't help

# save training set 
df_train.to_csv('../Cleaned_Analyzed_Games/all_games_cleaned_train.csv',index=False)
df_test.to_csv('../Cleaned_Analyzed_Games/all_games_cleaned_test.csv',index=False)

### Get Winning, Losing and Drawing Chances from Training set

In [None]:
import winning_chances_util

# make bins for evaluations
eval_bins=np.arange(-20.05,20.15,0.1)

# make bins for moves
move_bins_for_WCL=np.arange(0,150,5)

# Compute winning chance tables from training data
df=pd.read_csv('../Cleaned_Analyzed_Games/all_games_cleaned_train.csv')
winning_chances_util.compute_winning_chance_table(df, intervals=eval_bins,movebins=move_bins_for_WCL,outdir='../winning_chances_tables')

# Same, but only one move bin
winning_chances_util.compute_winning_chance_table(df, intervals=eval_bins,movebins='all',outdir='../winning_chances_tables')


### Calculate Winning Chance Loss for each Game

In [None]:
import copy
import concurrent.futures
import winning_chances_util
import anal_games

df=pd.read_csv('../Cleaned_Analyzed_Games/all_games_cleaned_train.csv')

process_by_move=True # do we want to have one winning table, or one for each move bracket?

num_workers=15

# for win chances binned by move
move_bins_for_eval=move_bins_for_WCL
wc_tables=winning_chances_util.read_winning_tables(dir='../winning_chances_tables/',movebins=move_bins_for_WCL)
args = []
i_process=0

for i in range(len(move_bins_for_eval)-1):

    
    wc_tables_new=copy.deepcopy(wc_tables)

    # bin moves
    wc_tables_new['mv_min']=move_bins_for_eval[i]
    wc_tables_new['mv_max']=move_bins_for_eval[i+1]

    # only process games with moves in bin
    df_moves=df.where(df['MovesAll']>=move_bins_for_eval[i])
    df_moves=df_moves.where(df['MovesAll']<move_bins_for_eval[i+1])
    df_moves.dropna(how='any',inplace=True)

    if df_moves.shape[0]==0:
        continue

    args.append(('../Cleaned_Analyzed_Games/wcl_train_'+str(move_bins_for_eval[i])+'-'+str(move_bins_for_eval[i+1])+'.csv',df_moves,[(winning_chances_util.WinChanceIncrease,wc_tables_new)],False,True))
    i_process+=1

wc_tables_new=copy.deepcopy(wc_tables)
wc_tables_new['mv_min']=move_bins_for_eval[-1]
wc_tables_new['mv_max']=100000
args.append(('../Cleaned_Analyzed_Games/wcl_train_'+str(move_bins_for_eval[-1])+'-.csv',df_moves,[(winning_chances_util.WinChanceIncrease,wc_tables_new)],False,True))
i_process+=1

for i in range(len(move_bins_for_eval)-1):

    
    wc_tables_new=copy.deepcopy(wc_tables)

    # bin moves
    wc_tables_new['mv_min']=move_bins_for_eval[i]
    wc_tables_new['mv_max']=move_bins_for_eval[i+1]

    # only process games with moves in bin
    df_moves=df.where(df['MovesAll']>=move_bins_for_eval[i])
    df_moves=df_moves.where(df['MovesAll']<move_bins_for_eval[i+1])
    df_moves.dropna(how='any',inplace=True)

    if df_moves.shape[0]==0:
        continue

    args.append(('../Cleaned_Analyzed_Games/wcl_test_'+str(move_bins_for_eval[i])+'-'+str(move_bins_for_eval[i+1])+'.csv',df_moves,[(winning_chances_util.WinChanceIncrease,wc_tables_new)],False,True))
    i_process+=1

wc_tables_new=copy.deepcopy(wc_tables)
wc_tables_new['mv_min']=move_bins_for_eval[-1]
wc_tables_new['mv_max']=100000
args.append(('../Cleaned_Analyzed_Games/wcl_test_'+str(move_bins_for_eval[-1])+'-.csv',df_moves,[(winning_chances_util.WinChanceIncrease,wc_tables_new)],False,True))
i_process+=1

# run it in parallel
with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
    tasks = []
    for argument in args:
        tasks.append(executor.submit(anal_games.process_game_list, argument[0],argument[1],argument[2],argument[3],argument[4]))

# for winchances not binned by move

movebins='all'

wc_tables=winning_chances_util.read_winning_tables(dir='../winning_chances_tables/',movebins=movebins)
args = []
i_process=0

for i in range(len(move_bins_for_eval)-1):

    
    wc_tables_new=copy.deepcopy(wc_tables)

    # bin moves
    wc_tables_new['mv_min']=move_bins_for_eval[i]
    wc_tables_new['mv_max']=move_bins_for_eval[i+1]

    # only process games with moves in bin
    df_moves=df.where(df['MovesAll']>=move_bins_for_eval[i])
    df_moves=df_moves.where(df['MovesAll']<move_bins_for_eval[i+1])
    df_moves.dropna(how='any',inplace=True)

    if df_moves.shape[0]==0:
        continue

    args.append(('../Cleaned_Analyzed_Games/wcl_train_all_'+str(move_bins_for_eval[i])+'-'+str(move_bins_for_eval[i+1])+'.csv',df_moves,[(winning_chances_util.WinChanceIncrease,wc_tables_new)],True,True))
    i_process+=1

wc_tables_new=copy.deepcopy(wc_tables)
wc_tables_new['mv_min']=move_bins_for_eval[-1]
wc_tables_new['mv_max']=100000
args.append(('../Cleaned_Analyzed_Games/wcl_train_all_'+str(move_bins_for_eval[-1]+5)+'-.csv',df_moves,[(winning_chances_util.WinChanceIncrease,wc_tables_new)],False,True))
i_process+=1

for i in range(len(move_bins_for_eval)-1):

    
    wc_tables_new=copy.deepcopy(wc_tables)

    # bin moves
    wc_tables_new['mv_min']=move_bins_for_eval[i]
    wc_tables_new['mv_max']=move_bins_for_eval[i+1]

    # only process games with moves in bin
    df_moves=df.where(df['MovesAll']>=move_bins_for_eval[i])
    df_moves=df_moves.where(df['MovesAll']<move_bins_for_eval[i+1])
    df_moves.dropna(how='any',inplace=True)

    if df_moves.shape[0]==0:
        continue

    args.append(('../Cleaned_Analyzed_Games/wcl_test_all_'+str(move_bins_for_eval[i])+'-'+str(move_bins_for_eval[i+1])+'.csv',df_moves,[(winning_chances_util.WinChanceIncrease,wc_tables_new)],True,True))
    i_process+=1

wc_tables_new=copy.deepcopy(wc_tables)
wc_tables_new['mv_min']=move_bins_for_eval[-1]
wc_tables_new['mv_max']=100000
args.append(('../Cleaned_Analyzed_Games/wcl_test_all_'+str(move_bins_for_eval[-1]+5)+'-.csv',df_moves,[(winning_chances_util.WinChanceIncrease,wc_tables_new)],False,True))
i_process+=1

with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
    tasks = []
    for argument in args:
        tasks.append(executor.submit(anal_games.process_game_list, argument[0],argument[1],argument[2],argument[3],argument[4]))


### Reshape output tables to have tables by player

In [None]:
from make_WCL_table import WCL_by_player

# for winchances calculated with binned moves
for i in move_bins_for_eval[-1]:

    WCL_by_player(move_bins_for_eval[i],move_bins_for_eval[i+1],all=False,train=True)
    
WCL_by_player(move_bins_for_eval[-1],None,all=False,train=True)

for i in move_bins_for_eval[-1]:

    WCL_by_player(move_bins_for_eval[i],move_bins_for_eval[i+1],all=False,train=False)
    
WCL_by_player(move_bins_for_eval[-1],None,all=False,train=False)


# for winchances calculated with all moves
for i in move_bins_for_eval[-1]:

    WCL_by_player(move_bins_for_eval[i],move_bins_for_eval[i+1],all=True,train=True)
    
WCL_by_player(move_bins_for_eval[-1],None,all=True,train=True)

for i in move_bins_for_eval[-1]:

    WCL_by_player(move_bins_for_eval[i],move_bins_for_eval[i+1],all=True,train=False)
    
WCL_by_player(move_bins_for_eval[-1],None,all=True,train=False)

### Bin mistakes

In [None]:
from make_mistakes_table import Mistakes_by_player

# make mistake bins and mistake labels
mistake_bins = [5, 10, 15, 20, 25, 30, 35, 40, 50, 60, 70, 100]
# start move to count mistakes (counting starts at 0)
move_start=3

# for winchances calculated with binned moves
for i in range(len(move_bins_for_eval[:-1])):

    Mistakes_by_player(move_bins_for_eval[i],move_bins_for_eval[i+1],all=False,train=True,mistake_bins=mistake_bins,move_start=move_start)
    
Mistakes_by_player(move_bins_for_eval[-1],None,all=False,train=True,mistake_bins=mistake_bins,move_start=move_start)

for i in range(len(move_bins_for_eval[:-1])):

    Mistakes_by_player(move_bins_for_eval[i],move_bins_for_eval[i+1],all=False,train=False,mistake_bins=mistake_bins,move_start=move_start)
    
Mistakes_by_player(move_bins_for_eval[-1],None,all=False,train=False,mistake_bins=mistake_bins,move_start=move_start)

# for winchances calculated with all moves
for i in range(len(move_bins_for_eval[:-1])):

    Mistakes_by_player(move_bins_for_eval[i],move_bins_for_eval[i+1],all=True,train=True,mistake_bins=mistake_bins,move_start=move_start)
    
Mistakes_by_player(move_bins_for_eval[-1],None,all=True,train=True,mistake_bins=mistake_bins,move_start=move_start)

for i in range(len(move_bins_for_eval[:-1])):

    Mistakes_by_player(move_bins_for_eval[i],move_bins_for_eval[i+1],all=True,train=False,mistake_bins=mistake_bins,move_start=move_start)
    
Mistakes_by_player(move_bins_for_eval[-1],None,all=True,train=False,mistake_bins=mistake_bins,move_start=move_start)

### Data Analysis

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error


all=False

def get_filename(mv_start,mv_end,train,all):
    filename='../Cleaned_Analyzed_Games/wcl_and_mistakes_'
    if train:
        filename+='train_'
    else:
        filename+='test_'
    if all:
        filename+='all_'
    if mv_end==None:
        filename+=str(mv_start)
    else:
        filename+=str(mv_start)+'-'+str(mv_end)
    filename+='_by_player.csv'
    return filename


for i in [12]: #range(len(move_bins_for_eval[:-1])):

    mv_start=move_bins_for_eval[i]
    mv_end=move_bins_for_eval[i+1]

    filename=get_filename(mv_start,mv_end,train=True,all=all)

    df=pd.read_csv(filename)

    features=[]
    for i in range(1,(mv_start//2)):
        features.append('WCL_'+str(i))
    
    df_white=df.where(df['Player']=='White') # check which training games are in that file
    df_white.dropna(how='any',inplace=True)

    lr_white=LinearRegression()
    
    lr_white.fit(df_white[features],df_white['Elo'].astype(float))

    filename_test=get_filename(mv_start,mv_end,train=False,all=all)

    df_test=pd.read_csv(filename_test)

    df_test_white=df.where(df_test['Player']=='White') # check which training games are in that file
    df_test_white.dropna(how='any',inplace=True)

    print('White Coeffs',lr_white.coef_)
    print('White Intercept',lr_white.intercept_)

    elo_pred_white=lr_white.predict(df_test_white[features])
    residuals_white=df_test_white['Elo']-elo_pred_white
    print('RMS White ', root_mean_squared_error(df_test_white['Elo'], elo_pred_white))

    player_games=  df_test_white[(df_test_white['Elo'] >= 1400) & (df_test_white['Elo'] <= 1500)]

    player_games = player_games.head(20)

    out=lr_white.predict(player_games[features])

    print(out)

    plt.figure()
    plt.scatter(elo_pred_white,residuals_white,label='White',s=.1)
    # plt.scatter(elo_pred_black,residuals_black,label='Black',s=.1)
    plt.legend()
    plt.xlim([1750,2500])
    plt.savefig('Residuals_vs_predicted_'+str(mv_start)+'_'+str(mv_end)+'.png')

    size_move_bins=5

    




In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBRegressor
import pandas as pd

from sklearn.metrics import mean_squared_error

models = {
    'lr': LinearRegression(),
    'svr': SVR(),
    'knr': KNeighborsRegressor(n_neighbors=10),
    'dt' : DecisionTreeClassifier(),
    'rf': RandomForestRegressor(),
    'et': ExtraTreesRegressor(),
    'ab': AdaBoostRegressor(),
    'gb': GradientBoostingRegressor(),
    'xbg': XGBRegressor()
}

all=False

def get_filename(mv_start,mv_end,train,all):
    filename='../Cleaned_Analyzed_Games/wcl_and_mistakes_'
    if train:
        filename+='train_'
    else:
        filename+='test_'
    if all:
        filename+='all_'
    if mv_end==None:
        filename+=str(mv_start)
    else:
        filename+=str(mv_start)+'-'+str(mv_end)
    filename+='_by_player.csv'
    return filename


for i in [12]: #range(len(move_bins_for_eval[:-1])):

    mv_start=move_bins_for_eval[i]
    mv_end=move_bins_for_eval[i+1]

    filename=get_filename(mv_start,mv_end,train=True,all=all)
    print(filename)

    df=pd.read_csv(filename)

    features=[]
    for i in range(1,(mv_start//2)):
        features.append('WCL_'+str(i))
    target='Elo'
    
    df_white=df.where(df['Player']=='White') # check which training games are in that file
    df_white.dropna(how='any',inplace=True)

    ## make a validation set
    df_tt, df_val = train_test_split(df_white, 
                                        shuffle=True,
                                        random_state=216,
                                        test_size=.1)

    mses = {}
    for name, model in models.items():
        print('testing & validation')
        model.fit(df_tt[features],df_tt[target])
        mses[name] = mean_squared_error(df_val[target], model.predict(df_val[features]))

        player_games=  df_val[(df_val[target] >= 1400) & (df_val[target] <= 1500)]

        player_games = player_games.head(20)

        out=model.predict(player_games[features])

        print(name)

        print(out)
    
    filename_test=get_filename(mv_start,mv_end,train=False,all=all)
    print(filename_test)

    df_test=pd.read_csv(filename_test)

    df_test_white=df.where(df_test['Player']=='White') # check which training games are in that file
    df_test_white.dropna(how='any',inplace=True)
    
    mses2 = {}
    for name, model in models.items():
        print('train & test')
        model.fit(df_white[features],df_white[target])
        mses2[name] = mean_squared_error(df_test_white[target], model.predict(df_test_white[features]))

        player_games=  df_test_white[(df_test_white[target] >= 1400) & (df_test_white[target] <= 1500)]

        player_games = player_games.head(20)

        out=model.predict(player_games[features])

        print(name)

        print(out)

print(mses)
print(mses2)

../Cleaned_Analyzed_Games/wcl_and_mistakes_train_60-65_by_player.csv
testing & validation
lr
[2235.4569767  2193.09271722 2035.82316074 2206.13876893 2322.6298828
 2181.80022164 1962.15710764 2026.16945717 1970.23607065 2254.04256211
 2251.37855225 2267.95849632 2263.10992155 2244.49055866 2162.37347372
 2172.3710952  2333.81280639 2154.38427608 2297.97477593 2232.27121789]
testing & validation
svr
[2247.91710647 2238.42904669 2243.43151294 2274.51528649 2253.36857272
 2242.48394931 2181.4243959  2245.97394803 2246.57575188 2303.72642606
 2291.58879218 2273.64498861 2280.36449939 2255.46299858 2225.08531982
 2276.18258557 2325.85838808 2266.35500543 2274.09349635 2269.46494594]
testing & validation
knr
[2128.2 2190.9 2333.4 2076.2 2068.1 2273.8 2211.3 2083.4 2225.8 2200.3
 2302.  2244.8 2206.5 2140.5 2066.8 2096.3 2345.3 2017.5 2297.8 2304.8]
testing & validation
dt
[2154. 2289. 1673. 1724. 1971. 1976. 2144. 2279. 2320. 2303. 2251. 2557.
 1704. 2134. 1625. 2257. 2006. 2416. 2030. 2309.