In [None]:
import numpy as np
import random
from utils import MOVE, Move, SUITS, Card, Deck, CustomDict
from game_manager import GameManager
from RL_utils import State, Action
from players import PLAYERS, GreedyRandomPlayer, HarelPlayer, SemiRandomPlayer
from enum import Enum
import matplotlib.pyplot as plt


In [None]:


def train_reward_per_game(players, n):
    """
    Train the learner with a single reward at the end of every game.

    players - the players in the game, in turn order
    IMPORTANT NOTE: players[0] should be the player we're training, aka Harel
    n - number of games to play

    After each finished game the learner's update() is called once with a
    terminal State and a reward of 1 if the learner is the game leader,
    -1 otherwise.
    """
    learner = players[0]
    for _ in range(n):
        gm = GameManager(players)

        # Keep cycling through the players in order until some step ends the game.
        while gm.running == True:
            for current in players:
                gm.step(current)
                if gm.running == False:
                    break

        # -1 as the second State argument means the game is over.
        terminal_state = State(learner.deck, -1)
        reward = 1 if gm.get_game_leader() == learner else -1
        learner.update(terminal_state, reward)

def train_reward_per_round(players, n):
    """
    Train the learner with a reward after every round and at the end of every game.

    players - the players in the game, in turn order
    IMPORTANT NOTE: players[0] should be the player we're training, aka Harel
    n - number of games to play

    Each time the last player in `players` has moved (and the game is still
    running) the learner's update() is called with the state it will face on
    its next turn. When the game ends, update() is called once more with a
    terminal State. The reward is always 1 if the learner is currently the
    game leader and -1 otherwise.
    """
    learner = players[0]

    def leader_reward(gm):
        # +1 while the learner is the game leader, -1 otherwise.
        return 1 if gm.get_game_leader() == learner else -1

    for _ in range(n):
        gm = GameManager(players)
        while gm.running == True:
            for current in players:
                gm.step(current)
                if gm.running == False:
                    break

                # The player right before the trained player determines the next
                # state: after that move we know which card is on top of the pile
                # for the learner's turn.
                if current == players[-1]:
                    upcoming_state = State(learner.deck, gm.pile.cards[-1].number)
                    round_reward = leader_reward(gm)
                    learner.update(upcoming_state, round_reward)

        # -1 as the second State argument means the game is over.
        terminal_state = State(learner.deck, -1)
        final_reward = leader_reward(gm)
        learner.update(terminal_state, final_reward)



In [None]:
def test(players, n=1000):
    """
    n: number of games to test on
    tests both harel's win-loss ration and harel's average score at the end of each game
    make sure harel isn't exploring and is in index 0 in the list players
    """
    harel = players[0]
    harel.p_exploration = 0.
    
    average_score = 0
    number_of_wins = 0
    for i in range(n):
        gm = GameManager(players)
        while gm.running==True:
            for player in players:
                gm.step(player)
                if gm.running == False:
                    #print('breaking')
                    break
        harel_score = harel.get_score()
        average_score = (harel_score + n*average_score)/(n+1)
        if gm.get_game_leader() == harel:
            number_of_wins += 1
    print('in test with players:')
    for player in players:
        print(player.player)
    print('TEST:average_score: ', average_score)
    print('TEST:number of wins: ', number_of_wins, ' / ', n, '\n')    
    return average_score, number_of_wins/n

In [None]:
########################################################################
############## PRINTING DATA FOR GRAPHS with reward-per-game############
########################################################################

#Hyperparameter pairs, matched by index (lrs[k] goes with discounts[k]):
#(lr=0.5,discount=0.9), (lr=0.5,discount=0.5), (lr=0.5,discount=0.1)
#(lr=0.1,discount=0.9), (lr=0.1,discount=0.5), (lr=0.1,discount=0.1)
lrs = [0.5, 0.5, 0.5, 0.1, 0.1, 0.1]
discounts = [0.9, 0.5, 0.1, 0.9, 0.5, 0.1]

#build the opponents (shared by every hyperparameter pair)
srplayer = SemiRandomPlayer()
grplayer = GreedyRandomPlayer(0.5)


for lr, discount in zip(lrs, discounts):
    #a fresh learner for each hyperparameter pair
    #NOTE(review): the second argument is presumably the initial p_exploration - confirm against HarelPlayer
    harel = HarelPlayer(lr, 0.5, discount)

    #one entry is appended to each list after every training interval
    average_scores_vs_grplayer = []
    ratio_of_wins_vs_grplayer = []
    average_scores_vs_srplayer = []
    ratio_of_wins_vs_srplayer = []

    #two phases of 10 intervals (10 * 20,000 = 200,000 training games) each:
    #first with p_exploration=0.5, then with p_exploration=0.25
    for p_exploration in (0.5, 0.25):
        for interval in range(10):
            #within a phase: 5 intervals against srplayer, then 5 against grplayer
            if interval < 5:
                players = [harel, srplayer]
            else:
                players = [harel, grplayer]

            #restore exploration before every training interval: it is zeroed for
            #the evaluation below. (This line used to assign the misspelled
            #attribute `p_exporation`, so the learner's real setting never changed.)
            harel.p_exploration = p_exploration
            train_reward_per_game(players, 20000)

            #evaluate greedily against both opponents
            harel.p_exploration = 0.

            score, ratio = test([harel, srplayer], 1000)
            average_scores_vs_srplayer.append(score)
            ratio_of_wins_vs_srplayer.append(ratio)

            score, ratio = test([harel, grplayer], 1000)
            average_scores_vs_grplayer.append(score)
            ratio_of_wins_vs_grplayer.append(ratio)


    print('WITH learning_rate =', lr, ', discount =', discount)
    print("average_scores_vs_grplayer:\n", average_scores_vs_grplayer)
    print("ratio_of_wins_vs_grplayer:\n", ratio_of_wins_vs_grplayer)
    print("average_scores_vs_srplayer\n", average_scores_vs_srplayer)
    print("ratio_of_wins_vs_srplayer:\n", ratio_of_wins_vs_srplayer)
    print('\n')

In [None]:
########################################################################
############## PRINTING DATA FOR GRAPHS with reward-per-round###########
########################################################################



#Hyperparameter pairs, matched by index (lrs[k] goes with discounts[k]):
#(lr=0.5,discount=0.9), (lr=0.5,discount=0.5), (lr=0.5,discount=0.1)
#(lr=0.1,discount=0.9), (lr=0.1,discount=0.5), (lr=0.1,discount=0.1)
lrs = [0.5, 0.5, 0.5, 0.1, 0.1, 0.1]
discounts = [0.9, 0.5, 0.1, 0.9, 0.5, 0.1]

#build the opponents (shared by every hyperparameter pair)
srplayer = SemiRandomPlayer()
grplayer = GreedyRandomPlayer(0.5)


for lr, discount in zip(lrs, discounts):
    #a fresh learner for each hyperparameter pair
    #NOTE(review): the second argument is presumably the initial p_exploration - confirm against HarelPlayer
    harel = HarelPlayer(lr, 0.5, discount)

    #one entry is appended to each list after every training interval
    average_scores_vs_grplayer = []
    ratio_of_wins_vs_grplayer = []
    average_scores_vs_srplayer = []
    ratio_of_wins_vs_srplayer = []

    #two phases of 10 intervals (10 * 20,000 = 200,000 training games) each:
    #first with p_exploration=0.5, then with p_exploration=0.25
    for p_exploration in (0.5, 0.25):
        for interval in range(10):
            #within a phase: 5 intervals against srplayer, then 5 against grplayer
            if interval < 5:
                players = [harel, srplayer]
            else:
                players = [harel, grplayer]

            #restore exploration before every training interval: it is zeroed for
            #the evaluation below. (This line used to assign the misspelled
            #attribute `p_exporation`, so the learner's real setting never changed.)
            harel.p_exploration = p_exploration
            train_reward_per_round(players, 20000)

            #evaluate greedily against both opponents
            harel.p_exploration = 0.

            score, ratio = test([harel, srplayer], 1000)
            average_scores_vs_srplayer.append(score)
            ratio_of_wins_vs_srplayer.append(ratio)

            score, ratio = test([harel, grplayer], 1000)
            average_scores_vs_grplayer.append(score)
            ratio_of_wins_vs_grplayer.append(ratio)


    print('WITH learning_rate =', lr, ', discount =', discount)
    print("average_scores_vs_grplayer:\n", average_scores_vs_grplayer)
    print("ratio_of_wins_vs_grplayer:\n", ratio_of_wins_vs_grplayer)
    print("average_scores_vs_srplayer\n", average_scores_vs_srplayer)
    print("ratio_of_wins_vs_srplayer:\n", ratio_of_wins_vs_srplayer)
    print('\n')

In [None]:


#
#Graphs showing results of Harel vs grplayer and srplayer with different learning rates and discounts.
#Harel was trained for 20,000 games in each interval and tested after every interval.
#Intended exploration schedule: p_exploration of 0.5 for the first 10 intervals (200,000 games)
#and then p_exploration of 0.25 for the next 10 intervals (200,000 games).
#Harel was trained against srplayer for intervals: (1->5 and 11->15)
#Harel was trained against grplayer for intervals: (6->10 and 16->20)
#Harel was tested on 1,000 games in each test
#


#x coordinate of every test point = total number of training games played so far
GAMES_PER_INTERVAL = 20000
NUMBER_OF_INTERVALS = 20
x_axis = np.arange(GAMES_PER_INTERVAL, GAMES_PER_INTERVAL*NUMBER_OF_INTERVALS+1, GAMES_PER_INTERVAL)

########################################################################
#######################DATA for reward per game#########################
########################################################################

# Hard-coded result lists that are plotted against x_axis further down.
# The numeric suffix 1..6 selects the (lr, discount) pair named in the comment
# above each group of four lists.
# NOTE(review): presumably pasted from the output printed by the
# reward-per-game training cell - confirm before relying on the numbers.

#########data reward per game (lr=0.5,discount=0.9):
average_scores_vs_grplayer1 = [12.968517063775833, 12.1106166144166, 12.983346455467261, 12.764596724467447, 12.61710541719472, 12.663371986945013, 12.73503145004961, 12.287004088506682, 12.702458423463783, 13.098865418324877, 12.920714433595544, 12.397608596494425, 12.84594396849364, 12.593412784011196, 12.678561947905504, 12.474978510685004, 12.65447422382101, 12.315385288410672, 12.590320243280193, 12.609094163806388]
ratio_of_wins_vs_grplayer1 = [0.174, 0.202, 0.169, 0.192, 0.19, 0.187, 0.218, 0.23, 0.223, 0.203, 0.208, 0.232, 0.218, 0.238, 0.235, 0.237, 0.245, 0.261, 0.249, 0.234]
average_scores_vs_srplayer1 = [9.466779880714387, 9.187304351674223, 9.185984401237715, 9.480797479841133, 8.932957014539397, 8.95375143800043, 9.319723934836045, 8.919605049052647, 9.062014469026584, 9.046365670978224, 9.423216936133016, 8.973390304241754, 8.849264526282347, 8.968681662396413, 8.612743067308251, 8.324580895701615, 8.60264274126229, 8.809996308918329, 8.863358871856542, 8.495506306955694]
ratio_of_wins_vs_srplayer1 = [0.412, 0.431, 0.438, 0.42, 0.446, 0.439, 0.439, 0.454, 0.464, 0.455, 0.452, 0.473, 0.498, 0.475, 0.5, 0.518, 0.509, 0.506, 0.488, 0.51]
###################################################
#########data reward per game (lr=0.5,discount=0.5):
average_scores_vs_grplayer2 = [12.631306095014361, 12.76777356569001, 12.749486064977658, 12.801137097718714, 12.895363065989818, 12.46126547055024, 12.519087331732825, 12.655876746448705, 12.995563297210605, 12.705484635024265, 12.752924707753687, 12.698284577288943, 12.621560292925476, 12.50729691166707, 13.05268698709075, 12.449799149294554, 12.678910126479332, 13.251770565526062, 12.712295860335061, 12.371050091284436]
ratio_of_wins_vs_grplayer2 = [0.21, 0.178, 0.18, 0.196, 0.191, 0.209, 0.224, 0.209, 0.188, 0.237, 0.217, 0.227, 0.24, 0.235, 0.219, 0.229, 0.223, 0.215, 0.231, 0.245]
average_scores_vs_srplayer2 = [9.603927134631137, 9.197214377942798, 8.834924298222122, 9.004632656025754, 9.102299823542984, 9.390049022561705, 8.852538366587634, 8.843210661184926, 8.959322816539988, 8.711434655578568, 8.868386212365875, 8.880200979287903, 8.968642132404032, 8.895267533988127, 8.945670379548899, 8.753795783243373, 8.693743106117799, 8.871590958888397, 9.00095203756735, 8.89882366556599]
ratio_of_wins_vs_srplayer2 = [0.404, 0.439, 0.456, 0.459, 0.464, 0.427, 0.465, 0.462, 0.454, 0.484, 0.446, 0.51, 0.456, 0.476, 0.486, 0.475, 0.5, 0.481, 0.493, 0.494]
###################################################
#########data reward per game (lr=0.5,discount=0.1):
average_scores_vs_grplayer3 = [13.447561872862034, 13.173249636699214, 13.200475925513238, 13.150822492600863, 12.902444528326681, 12.594979535506516, 12.959221979505568, 12.722353974431362, 12.755766541857701, 12.588051553627324, 12.68783368034741, 12.576795895290864, 12.452020138495243, 12.790683177666718, 12.948132346603632, 12.44946519356033, 12.6753562007188, 11.941812421710866, 12.67078288689318, 12.685846506092483]
ratio_of_wins_vs_grplayer3 = [0.153, 0.171, 0.186, 0.174, 0.198, 0.209, 0.195, 0.23, 0.226, 0.228, 0.217, 0.21, 0.231, 0.244, 0.209, 0.235, 0.232, 0.267, 0.255, 0.246]
average_scores_vs_srplayer3 = [9.104697317865401, 9.451537273834667, 8.959740745233065, 9.070807875574664, 8.956003382224234, 9.580486158343069, 8.67995652586283, 8.832259958492768, 9.03223076656978, 8.518517559437974, 8.485731835286886, 8.745346652126813, 8.504015085781155, 8.856335026132077, 8.695291606434127, 8.754099179689538, 8.804388556321852, 9.238920578663919, 8.916826811048821, 8.70314814772971]
ratio_of_wins_vs_srplayer3 = [0.428, 0.417, 0.442, 0.436, 0.464, 0.418, 0.47, 0.479, 0.465, 0.497, 0.491, 0.474, 0.49, 0.472, 0.484, 0.484, 0.502, 0.468, 0.484, 0.489]
###################################################
#########data reward per game (lr=0.1,discount=0.9):
average_scores_vs_grplayer4 = [12.68039981500432, 13.228560919291958, 12.664240507328513, 12.206778158753925, 13.3676322366876, 12.465119602146512, 13.052238010448546, 12.436417155869774, 12.564420986685988, 12.696972260278676, 13.173759174078407, 12.810063762805637, 12.713856638044769, 12.224244073178522, 12.636158647986132, 12.617211729637575, 12.442005335211004, 12.555918029976995, 12.404725754183533, 12.327839904651691]
ratio_of_wins_vs_grplayer4 = [0.192, 0.181, 0.222, 0.226, 0.184, 0.227, 0.222, 0.238, 0.232, 0.229, 0.194, 0.23, 0.233, 0.248, 0.228, 0.24, 0.244, 0.239, 0.247, 0.239]
average_scores_vs_srplayer4 = [9.245908810160824, 9.042439277230454, 8.866927043383361, 9.179661303659898, 8.770464254025596, 8.730722775168953, 8.954285126960894, 8.510880873465585, 8.968721260960697, 8.845603032949219, 8.54226214847979, 9.025757977585998, 8.756019924651774, 8.739341777468443, 8.955078421329567, 8.505985724090577, 9.02323400262824, 8.573965271600523, 8.74072410669537, 8.551427976028737]
ratio_of_wins_vs_srplayer4 = [0.431, 0.462, 0.479, 0.456, 0.497, 0.499, 0.495, 0.508, 0.485, 0.487, 0.506, 0.491, 0.509, 0.502, 0.485, 0.522, 0.496, 0.525, 0.513, 0.518]
###################################################
#########data reward per game (lr=0.1,discount=0.5):
average_scores_vs_grplayer5 = [12.441952943977022, 12.924233839151382, 12.453359047998214, 13.199213207853745, 12.88074821767717, 12.826494523084483, 12.497140262572326, 12.53370458348689, 12.551225114060301, 12.388278442346483, 13.082617414464092, 12.994749985924427, 12.213475595769507, 12.807254409194108, 12.598764955607434, 12.686620065153527, 12.323346033998073, 12.557861482749303, 12.663186109370812, 12.76571901053233]
ratio_of_wins_vs_grplayer5 = [0.223, 0.207, 0.225, 0.188, 0.195, 0.22, 0.224, 0.232, 0.214, 0.239, 0.202, 0.203, 0.229, 0.231, 0.236, 0.219, 0.249, 0.233, 0.237, 0.215]
average_scores_vs_srplayer5 = [9.448692662659598, 9.170897368903955, 8.87956145568714, 9.121858558433132, 9.040419904819698, 8.774045408495189, 8.622859395149037, 8.981433304750833, 9.205580429480532, 8.610844409194433, 9.107483272322703, 8.792417039058597, 8.726881484072717, 9.078255426714458, 8.821982728792113, 8.817652991836688, 8.594310605164447, 8.720767162747523, 8.8313143523792, 8.765015299910864]
ratio_of_wins_vs_srplayer5 = [0.419, 0.444, 0.463, 0.459, 0.461, 0.495, 0.501, 0.474, 0.462, 0.478, 0.465, 0.485, 0.509, 0.483, 0.516, 0.471, 0.522, 0.512, 0.507, 0.52]
###################################################
#########data reward per game (lr=0.1,discount=0.1):
average_scores_vs_grplayer6 = [12.5602871288328, 12.546181740110534, 12.7252327363453, 13.147791159048284, 12.597425084339623, 12.461390190528721, 12.788973317376517, 12.894765165601617, 12.63863489852777, 13.01977543395083, 12.332333340691688, 12.802089763779255, 12.741044244498294, 12.334975825568533, 12.728195046328105, 12.2070380510883, 12.332867493417961, 12.243056197881447, 12.72297660334894, 12.625822256662081]
ratio_of_wins_vs_grplayer6 = [0.23, 0.223, 0.199, 0.204, 0.22, 0.228, 0.226, 0.198, 0.235, 0.222, 0.236, 0.223, 0.226, 0.249, 0.222, 0.235, 0.246, 0.254, 0.226, 0.235]
average_scores_vs_srplayer6 = [9.213550237432663, 8.723221400236561, 8.783454877499663, 8.860039674699381, 9.301037590884803, 8.84399897001089, 9.106898621294327, 8.879665607713932, 9.233904057265185, 8.881717013874248, 8.725411698791184, 8.739875040082978, 9.244362267551528, 8.681274048666586, 8.709894964726848, 8.80944705570692, 8.626102453395244, 9.116946800550608, 8.763986308469915, 8.52379510634258]
ratio_of_wins_vs_srplayer6 = [0.453, 0.479, 0.493, 0.468, 0.458, 0.492, 0.462, 0.505, 0.462, 0.479, 0.495, 0.507, 0.468, 0.506, 0.518, 0.49, 0.496, 0.496, 0.5, 0.522]

##############################################################################
######################GRAPHS for reward per game##############################
##############################################################################

# (line color, legend label) for the result lists with suffix 1..6, in that order.
_series_styles = [
    ('r', "lr=0.5,discount=0.9"),
    ('g', "lr=0.5,discount=0.5"),
    ('b', "lr=0.5,discount=0.1"),
    ('c', "lr=0.1,discount=0.9"),
    ('m', "lr=0.1,discount=0.5"),
    ('y', "lr=0.1,discount=0.1"),
]

# One entry per figure (GRAPH 1..4): (six result lists, y-axis label, title).
_figures = [
    #########GRAPH 1##########
    (
        [average_scores_vs_grplayer1, average_scores_vs_grplayer2, average_scores_vs_grplayer3,
         average_scores_vs_grplayer4, average_scores_vs_grplayer5, average_scores_vs_grplayer6],
        "average scores",
        "Average scores of Harel when playing against GreedyRandomPlayer\nover 1,000 testing games\ntrained with reward each game",
    ),
    ##########GRAPH 2##########
    (
        [average_scores_vs_srplayer1, average_scores_vs_srplayer2, average_scores_vs_srplayer3,
         average_scores_vs_srplayer4, average_scores_vs_srplayer5, average_scores_vs_srplayer6],
        "average scores",
        "Average scores of Harel when playing against SemiRandomPlayer\n over 1,000 testing games\ntrained with reward each game",
    ),
    #########GRAPH 3##########
    (
        [ratio_of_wins_vs_grplayer1, ratio_of_wins_vs_grplayer2, ratio_of_wins_vs_grplayer3,
         ratio_of_wins_vs_grplayer4, ratio_of_wins_vs_grplayer5, ratio_of_wins_vs_grplayer6],
        "wins ratio",
        "Harel's wins ratio vs GreedyRandomPlayer\nover 1,000 testing games\ntrained with reward each game",
    ),
    #########GRAPH 4##########
    (
        [ratio_of_wins_vs_srplayer1, ratio_of_wins_vs_srplayer2, ratio_of_wins_vs_srplayer3,
         ratio_of_wins_vs_srplayer4, ratio_of_wins_vs_srplayer5, ratio_of_wins_vs_srplayer6],
        "wins ratio",
        "Harel's wins ratio vs SemiRandomPlayer\nover 1,000 testing games\ntrained with reward each game",
    ),
]

for _series_group, _ylabel, _title in _figures:
    # One line per hyperparameter pair.
    for _series, (_color, _label) in zip(_series_group, _series_styles):
        plt.plot(x_axis, _series, color=_color, label=_label)

    plt.xlabel("number of training games")
    plt.ylabel(_ylabel)
    plt.title(_title)

    # Legend anchored just to the right of the axes.
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()




########################################################################
#######################DATA for reward per round############################
########################################################################

# Hard-coded result lists that are plotted against x_axis further down.
# The numeric suffix 1..6 selects the (lr, discount) pair named in the comment
# above each group of four lists.
# These assignments reuse, and therefore overwrite, the names of the
# reward-per-game lists defined earlier in this cell.
# NOTE(review): presumably pasted from the output printed by the
# reward-per-round training cell - confirm before relying on the numbers.

#########data reward per round (lr=0.5,discount=0.9):
average_scores_vs_grplayer1 = [10.719464560872458, 9.92621911848619, 9.060349596779828, 8.848716596475457, 8.718164542106958, 8.217113048761124, 8.082962976529227, 7.784837804730347, 7.76276931447382, 7.5105558005407325, 6.872227198116518, 7.03173911390947, 6.95352057931031, 6.8425256464961, 6.943741597581396, 6.732472188510815, 6.477935719576882, 6.634278749688136, 6.682394566980454, 6.797821122066663]
ratio_of_wins_vs_grplayer1 = [0.111, 0.154, 0.186, 0.185, 0.198, 0.292, 0.344, 0.383, 0.411, 0.407, 0.439, 0.392, 0.403, 0.378, 0.362, 0.41, 0.456, 0.434, 0.48, 0.469]
average_scores_vs_srplayer1 = [7.943534612509399, 6.478816805988253, 5.846134146342414, 6.146718084531284, 5.916937264969335, 5.526191932641101, 5.195174344286506, 5.053320221086229, 5.147338231698171, 4.767197634601893, 4.7039564234509665, 4.697501706074367, 4.639730549185607, 4.535937696003051, 4.704265251469844, 4.660829412971241, 4.329257990355903, 4.327654974027414, 4.347296943903231, 4.311588508207008]
ratio_of_wins_vs_srplayer1 = [0.153, 0.428, 0.454, 0.428, 0.453, 0.549, 0.566, 0.649, 0.648, 0.697, 0.653, 0.652, 0.64, 0.651, 0.608, 0.66, 0.706, 0.712, 0.731, 0.715]
###################################################
#########data reward per round (lr=0.5,discount=0.5):
average_scores_vs_grplayer2 = [9.90608980165217, 8.828896315814562, 8.587270412966884, 8.101370838516285, 7.676491434011462, 8.296736470602845, 8.210850778539056, 8.028581282546691, 8.079638041587605, 7.74677458527984, 7.233561170137545, 6.530593417142605, 6.375955624768829, 6.263918529507383, 6.392195649104605, 6.690012701155617, 6.675648450705423, 6.812637697833127, 6.70807216314554, 6.826866162551627]
ratio_of_wins_vs_grplayer2 = [0.211, 0.323, 0.339, 0.343, 0.388, 0.423, 0.446, 0.45, 0.452, 0.463, 0.517, 0.524, 0.521, 0.524, 0.498, 0.505, 0.553, 0.541, 0.527, 0.555]
average_scores_vs_srplayer2 = [6.900380161060719, 5.496477661738045, 5.00955575875386, 4.662742576043199, 4.558981136076386, 4.938177874579399, 5.165845272993003, 5.091656220424164, 5.019411101021592, 4.783447406641399, 4.286432087146781, 4.396526188429745, 4.266733616644159, 4.1715679096290765, 4.18022605204703, 4.4669344725442635, 4.150228935388965, 4.42576117408029, 4.380540863866966, 4.142249680298747]
ratio_of_wins_vs_srplayer2 = [0.462, 0.643, 0.672, 0.699, 0.729, 0.735, 0.724, 0.744, 0.765, 0.778, 0.814, 0.803, 0.803, 0.8, 0.783, 0.787, 0.819, 0.798, 0.813, 0.824]
###################################################
#########data reward per round (lr=0.5,discount=0.1):
average_scores_vs_grplayer3 = [10.208683827093534, 9.395143448891693, 8.844730082062348, 8.407474561310508, 8.2163785825109, 8.880176956635143, 8.811205661259578, 8.022426981478862, 8.309246498007427, 7.673994395302962, 7.166073466815567, 6.9640184428877046, 6.743923917440161, 6.516666997161304, 6.364306723212996, 7.10128147499074, 6.896722747472189, 6.6088940256882465, 7.133847234314612, 6.821167764158958]
ratio_of_wins_vs_grplayer3 = [0.264, 0.352, 0.386, 0.414, 0.396, 0.402, 0.399, 0.487, 0.467, 0.488, 0.504, 0.521, 0.535, 0.525, 0.536, 0.495, 0.52, 0.563, 0.507, 0.518]
average_scores_vs_srplayer3 = [6.508213501921749, 5.835736831426335, 5.306450991692594, 5.098443337907943, 4.917318438040879, 5.262105315458366, 5.314296938553145, 5.236001204851969, 4.845669736601249, 4.844921476113029, 4.635332217285719, 4.41296897218212, 4.383972820984888, 4.39695726520606, 4.173474861691101, 4.5345619303714715, 4.595204501493575, 4.638955295288238, 4.659183502930703, 4.459040151213708]
ratio_of_wins_vs_srplayer3 = [0.556, 0.667, 0.723, 0.765, 0.765, 0.74, 0.747, 0.757, 0.775, 0.793, 0.816, 0.825, 0.827, 0.807, 0.827, 0.826, 0.82, 0.813, 0.8, 0.813]
###################################################
#########data reward per round (lr=0.1,discount=0.9):
average_scores_vs_grplayer4 = [9.795912691048485, 8.53152591165619, 8.489456430264797, 8.01283375347576, 7.880406680115418, 7.80395209374816, 7.575116912277119, 7.077403371387278, 7.183993838016567, 7.00947449341631, 6.540230486921406, 6.783245676408848, 6.461773963351658, 6.410707396304077, 6.760062821486606, 6.611872268277974, 6.5441362836896815, 6.363816621917179, 6.5845925089284405, 6.327923737888397]
ratio_of_wins_vs_grplayer4 = [0.246, 0.258, 0.249, 0.247, 0.238, 0.248, 0.282, 0.3, 0.318, 0.323, 0.345, 0.303, 0.347, 0.335, 0.314, 0.319, 0.336, 0.361, 0.32, 0.335]
average_scores_vs_srplayer4 = [6.4643043785792775, 5.923127240321961, 5.416437754754069, 5.282054774011097, 5.531895339869458, 5.424337641363297, 5.174170242193821, 5.2837644947552755, 5.082506946870456, 4.847367300213648, 4.730425240379968, 4.725895616255531, 4.934907282587359, 4.866883510292143, 4.845628562981921, 4.6840706915573715, 4.9361529375692275, 4.780389559541858, 4.841295531597102, 4.791546388619176]
ratio_of_wins_vs_srplayer4 = [0.464, 0.465, 0.512, 0.556, 0.482, 0.529, 0.536, 0.508, 0.571, 0.575, 0.569, 0.581, 0.548, 0.531, 0.539, 0.568, 0.536, 0.563, 0.572, 0.559]
###################################################
#########data reward per round (lr=0.1,discount=0.5):
average_scores_vs_grplayer5 = [9.6840584345487, 8.655884739314553, 7.990099571586996, 7.633964179260028, 7.404778823954312, 7.370171061376352, 7.301798996901118, 7.066939771414729, 7.379880812645305, 7.080672671222596, 6.393705845225597, 6.158133684421392, 6.245753291361013, 6.161731407862417, 6.39613383503626, 6.01673155193486, 5.935332499581317, 6.255452206233609, 6.087631044843915, 5.875500505658903]
ratio_of_wins_vs_grplayer5 = [0.309, 0.334, 0.327, 0.364, 0.367, 0.378, 0.412, 0.439, 0.427, 0.442, 0.479, 0.523, 0.493, 0.487, 0.472, 0.513, 0.517, 0.499, 0.527, 0.536]
average_scores_vs_srplayer5 = [6.495552290502072, 5.5338745315772915, 5.299325015790118, 5.136921180828108, 4.902245783088169, 4.729065655060216, 4.995646715290889, 4.756762642945146, 4.75694356507296, 4.581660178769401, 4.443325928233589, 4.5927721020941945, 4.399930825428688, 4.376891842728841, 4.269326545638093, 4.535725532388372, 4.367135754148875, 4.263178846704297, 4.2766283339619084, 4.451286046425911]
ratio_of_wins_vs_srplayer5 = [0.54, 0.589, 0.63, 0.629, 0.633, 0.679, 0.664, 0.73, 0.737, 0.758, 0.74, 0.733, 0.769, 0.745, 0.755, 0.747, 0.76, 0.778, 0.78, 0.783]
###################################################
#########data reward per round (lr=0.1,discount=0.1):
average_scores_vs_grplayer6 = [10.098361930971887, 8.548873884445342, 8.060842257115272, 7.346740191289044, 7.024746176779562, 7.553319320050578, 7.473839179554282, 7.360783654990958, 7.38362525334535, 7.234693969992595, 7.073575753319956, 6.5822579766606175, 6.2475180889181905, 6.053131106865234, 6.2262442349024845, 6.531689222696185, 6.346297512227524, 6.657039144266303, 6.641693004541015, 6.555436049696723]
ratio_of_wins_vs_grplayer6 = [0.284, 0.381, 0.39, 0.433, 0.454, 0.45, 0.479, 0.498, 0.493, 0.492, 0.493, 0.515, 0.541, 0.545, 0.544, 0.525, 0.547, 0.54, 0.54, 0.545]
average_scores_vs_srplayer6 = [6.551436346090291, 5.869310703595288, 5.257187941409386, 5.044379167897824, 4.913894137329528, 4.887022603726511, 4.740613864932575, 5.006583630546054, 4.637382252132456, 4.467515032721417, 4.857730621257493, 4.55828254615769, 4.318714136947481, 4.2499638823818815, 4.176174380091805, 4.179382294861991, 4.547572426965147, 4.561035806459104, 4.50640381974496, 4.437631472785672]
ratio_of_wins_vs_srplayer6 = [0.584, 0.62, 0.684, 0.71, 0.701, 0.744, 0.775, 0.743, 0.761, 0.784, 0.762, 0.779, 0.805, 0.802, 0.825, 0.827, 0.788, 0.788, 0.802, 0.802]
##############################################################################
######################GRAPHS for reward per round#############################
##############################################################################

# (line color, legend label) for the result lists with suffix 1..6, in that order.
_series_styles = [
    ('r', "lr=0.5,discount=0.9"),
    ('g', "lr=0.5,discount=0.5"),
    ('b', "lr=0.5,discount=0.1"),
    ('c', "lr=0.1,discount=0.9"),
    ('m', "lr=0.1,discount=0.5"),
    ('y', "lr=0.1,discount=0.1"),
]

# One entry per figure (GRAPH 1..4): (six result lists, y-axis label, title).
_figures = [
    #########GRAPH 1##########
    (
        [average_scores_vs_grplayer1, average_scores_vs_grplayer2, average_scores_vs_grplayer3,
         average_scores_vs_grplayer4, average_scores_vs_grplayer5, average_scores_vs_grplayer6],
        "average scores",
        "Average scores of Harel when playing against GreedyRandomPlayer\nover 1,000 testing games\ntrained with reward each round",
    ),
    ##########GRAPH 2##########
    (
        [average_scores_vs_srplayer1, average_scores_vs_srplayer2, average_scores_vs_srplayer3,
         average_scores_vs_srplayer4, average_scores_vs_srplayer5, average_scores_vs_srplayer6],
        "average scores",
        "Average scores of Harel when playing against SemiRandomPlayer\n over 1,000 testing games\ntrained with reward each round",
    ),
    #########GRAPH 3##########
    (
        [ratio_of_wins_vs_grplayer1, ratio_of_wins_vs_grplayer2, ratio_of_wins_vs_grplayer3,
         ratio_of_wins_vs_grplayer4, ratio_of_wins_vs_grplayer5, ratio_of_wins_vs_grplayer6],
        "wins ratio",
        "Harel's wins ratio vs GreedyRandomPlayer\nover 1,000 testing games\ntrained with reward each round",
    ),
    #########GRAPH 4##########
    (
        [ratio_of_wins_vs_srplayer1, ratio_of_wins_vs_srplayer2, ratio_of_wins_vs_srplayer3,
         ratio_of_wins_vs_srplayer4, ratio_of_wins_vs_srplayer5, ratio_of_wins_vs_srplayer6],
        "wins ratio",
        "Harel's wins ratio vs SemiRandomPlayer\nover 1,000 testing games\ntrained with reward each round",
    ),
]

for _series_group, _ylabel, _title in _figures:
    # One line per hyperparameter pair.
    for _series, (_color, _label) in zip(_series_group, _series_styles):
        plt.plot(x_axis, _series, color=_color, label=_label)

    plt.xlabel("number of training games")
    plt.ylabel(_ylabel)
    plt.title(_title)

    # Legend anchored just to the right of the axes.
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()

