# Default training Parameters:

In [48]:
from RLTrain import Opt
import os
import time
print(Opt())
import pandas as pd
from RadarGraph import *

import sys
sys.path.append('../')

policy                        			MlpPolicy
algorithm                     			PPO
environment                   			WurtzReact-v1
steps                         			51200
dir                           			<DEFAULT>
n_steps                       			256
n_envs                        			1
seed                          			None



# Training an agent with default parameters

In [None]:
t=time.time()
#os.system("python RLTrain.py steps=200000")
print(time.time()-t)

# Running inference with the trained models and saving [S,A,R,S] info

In [None]:
#os.system("python RLTest.py PPO_WurtzReact-v1 steps=500")
#os.system("python RLTest.py WRH algorithm=WRH steps=500")

In [None]:

calc_return = default_obj = lambda x: x.Reward.sum()/x.Done.sum()
worst_obj = lambda x: -x.Reward.sum()/x.Done.sum()


def load_rollouts(env: str,obj = default_obj,last: bool = True,verbose:bool=False,TOL:float = 1e-4):
    """Retrieve RL rollouts of each algorithm from the file system
    
    Parameters:
        obj (function) - Method to measure how good the rollout is
        env (str) - The environment you want rollouts from
        last (bool) - Use the rollout of the last timestep if true and the best performing timestep if false
    
    """
    
    folder = f"MODELS\\{env}" if len(env.split("\\"))==1 else env
    algoidx = [i for i,string in enumerate(folder.split("\\")) if "-v" in string][0]
    data = dict()
    rollName=["best_rollout","rollout"][last]
    for a,b,c in os.walk(folder):
        if rollName in c:
            algo = a.split("\\")[algoidx+1]
            df1 = pd.read_pickle(a+"\\"+rollName)
            if verbose:print(a,"|",0 if obj is None else obj(df1))
            if not algo in data:
                data[algo]=df1
            else:
                df0=data[algo]
                if obj is None:
                    data[algo]=pd.concat([df0,df1],ignore_index=True)  
                #prefer rollouts with more episodes when objective ranking is similar
                elif abs(obj(df1)-obj(df0))<TOL:
                    if verbose:print(df1.Done.sum(),df0.Done.sum(),df0.Done.shape)
                    if df1.Done.sum()>df0.Done.sum():
                        data[algo]=df1
                #prefer higher ranking ones
                elif obj(df1)>obj(df0):
                    data[algo]=df1
    return data


In [None]:
def target_subset(frame,N,i):
    """
    Filters the rollout for episodes which have a specific target
    Inputs:
        Frame (dataframe) - Pandas Dataframe containing gym information
        N (int)           - Number of targets in your environment
        i (int)           - The index of your target as it appears in the observation space
        
    Outputs:
        cframe (dataframe) - Subset of your Pandas dataframe with only episodes of target i
    
    """
    obs = np.stack(frame.InState)
    cframe=frame[obs[:,-N+i]>0.9]
    return cframe



def actions_by_time(frame):
    """Gives the mean action at each timestep of your rollout dataframe"""
    min_t,max_t = frame.Step.min(),frame.Step.max()
    mean_act=[]
    for t in range(min_t,max_t+1):
        mean_act+=[frame.Action[frame.Step==t].mean()]
    return np.array(mean_act)

In [None]:
def plotabt(all_act,actions,colors,names,title_ext="",title=None,ylim=(-0.15,1.15)):
    """Plotting Function for actions vs time
    Inputs:
        all_act (list<np.array>) -- shape is [number of models, number of timesteps, number of actions]
        actions (list<str>) -- list of the name of each action
        colors (list<str>) -- list of colors (should be at least as long as the number of models)
        names (list<str>) -- the names of each model
        title_ext (str) -- addition you want to add to the end of the graph title
    
    """
    fig, axs = plt.subplots(figsize=(9, 5), nrows=len(actions), ncols=len(all_act))
    fig.subplots_adjust(wspace=0.0, hspace=0.0, top=0.85, bottom=0.05)
    #i indexes the each action 
    for i,ax0 in enumerate(axs):
        #j indexes each model
        for j,act in enumerate(all_act):
            #Model name
            axs[0][j].set_title(names[j])
            ax=ax0[j]
            ax.plot(act[:,i],".-",color=colors[j],ms=3,alpha=0.8)
            ax.set_ylim(*ylim)
            if j!=0:
                ax.set_yticks([])
        #Action name name
        ax.text(act.shape[0],ylim[1]*0.5,actions[i],horizontalalignment="right",verticalalignment="top"
                ,bbox=dict(boxstyle="square",facecolor="w",edgecolor="k",alpha=0.8))
    if title is None:
        axs[0][(int(len(names)-0.5)//2)-1].text(2,2,"Average Value of Each Action vs Step %s"%title_ext)
    else:
        axs[0][(int(len(names)-0.5)//2)-1].text(2,2,title)
    #axs[0,-1].legend(names,loc=(0.8, .0))
    axs[-1,0].set_xlabel("Step")

# Gathering some Heuristics

In [None]:
import pandas as pd
ppo = pd.read_pickle("Legacy/Legacy5/MODELS/WurtzReact-V1/PPO/06-03-2023--18-21-33/best_rollout")
heuristic = pd.read_pickle("Legacy/Legacy5/MODELS\\WurtzReact-v1\\Heuristic/rollout")

print(ppo.keys())

info = [
    ['dT', 'dV', '1-chlorohexane', '2-chlorohexane', '3-chlorohexane', 'Na' ],
    ('PPO', [
        [a for a in ppo.Action.mean()],
        [ppo[ppo.Done==True].Reward.mean()]*6,
        [1,0,0,0,0,0]]),
    ('Heuristic', [
        [a for a in heuristic.Action.mean()],
        [heuristic[heuristic.Done==True].Reward.mean()]*6,
        [1,0,0,0,0,0]]),

]

In [None]:
print(heuristic[heuristic.Done==True].Reward.mean())

print(ppo[ppo.Done==True].Reward.mean())

# Plot as a Radar Graph

In [None]:
from RadarGraph import *


theta = radar_factory(len(info[0]), frame='polygon')


fig, axs = plt.subplots(figsize=(9, 9), nrows=1, ncols=2,subplot_kw=dict(projection='radar'))
fig.subplots_adjust(wspace=0.5, hspace=0.25, top=0.85, bottom=0.05)

make_radar(theta,axs,info,colors = "brk")

labels = ('Action Taken', 'Return', '1', 'Factor 4', 'Factor 5')
legend = axs[0].legend(labels, loc=(0.9, .95),labelspacing=0.1, fontsize='small')


fig.text(0.5, 0.7, "Average value of each action and episodic return for a 500 step rollout (PPO trained for 200K steps)",
             horizontalalignment='center', color='black', weight='bold',
             size='large')

plt.show()

In [None]:

all_act = [actions_by_time(ppo),actions_by_time(heuristic)]
actions = ['dT', 'dV', '1-chlorohexane', '2-chlorohexane', '3-chlorohexane', 'Na' ]
colors = ["r","g","b","c","y","m"]
names=["PPO","Heuristic"]

plotabt(all_act,actions,colors,names,title_ext="(WurtzReact PPO on 0.5M steps)")

# Functions for Conditional Returns and Actions

In [None]:
from chemistrylab.reactions.available_reactions.chloro_wurtz import PRODUCTS as CWtargs

def get_conditional_rewards(frame,targets=CWtargs):
    """
    Gives returns conditioned on the different targets
    Inputs:
        Frame (dataframe) - Pandas Dataframe containing gym information
        targets (list) - List of N targets (reaction products)
        
    Outputs:
        targets
        rew (List<float>) - List of size N containing the average return given each target
    
    """
    # turn observation column into a numpy array
    obs = np.stack(frame.InState)
    N=len(targets)
    rew=[]
    for i in range(N):
        #gather all data where the target is targets[N]
        cframe=frame[obs[:,-N+i]>0.9]
        #Obtain the mean reward of these episodes
        rew+=[calc_return(cframe)]
    return [targets,np.array(rew)]

def get_conditional_actions(frame,targets=CWtargs):
    """
    Gives actions conditioned on the different targets, meant for continuous action spaces
    Inputs:
        Frame (dataframe) - Pandas Dataframe containing gym information
        targets (list) - List of N targets (reaction products)
        
    Outputs:
        targets
        act (List<array>) - List of size N containing the mean action given each target
    
    """
    # turn observation column into a numpy array
    obs = np.stack(frame.InState)
    N=len(targets)
    act=[]
    for i in range(N):
        #gather all data where the target is targets[N]
        cframe=frame[obs[:,-N+i]>0.9]
        #Obtain the mean action of these episodes
        act+=[cframe.Action.mean()]
    return [targets,act]


# General Wurtz React:

In [None]:
import pandas as pd


folders = load_rollouts("GenWurtzReact-v1",obj=default_obj,last=False)
models=[a for a in folders]

gppo = [folders[model] for model in models]
gheuristic = pd.read_pickle("Legacy\\Legacy5\\MODELS\\GenWurtzReact-v1\\Heuristic/rollout")

In [None]:
relative=True

if relative:
    ghr=get_conditional_rewards(gheuristic)[1]
    info = ([get_conditional_rewards(gheuristic)[0]]+
[(models[i], [get_conditional_rewards(gppo[i])[1]/ghr]) for i in range(len(models))]+
[('Heuristic', [get_conditional_rewards(gheuristic)[1]])])

else:
    info = ([get_conditional_rewards(gheuristic)[0]]+
[(models[i], [get_conditional_rewards(gppo[i])[1]]) for i in range(len(models))]+
[('Heuristic', [get_conditional_rewards(gheuristic)[1]])])

In [None]:
from RadarGraph import *


theta = radar_factory(len(info[0]), frame='polygon')

fig, axs = plt.subplots(figsize=(22, 7), nrows=1, ncols=len(models)+1,subplot_kw=dict(projection='radar'))
fig.subplots_adjust(wspace=0.5, hspace=0.25, top=0.85, bottom=0.05)

make_radar(theta,axs,info,colors = "r",gridlines=[0.0,0.4,0.8,1.2,1.6,2.0])

labels = ('Return', '-', '1', 'Factor 4', 'Factor 5')
legend = axs[0].legend(labels, loc=(0.9, .95),labelspacing=0.1, fontsize='small')


fig.text(0.5, 0.8, "Average Return VS Target Material (Best Run trained with 0.5M Steps)",
             horizontalalignment='center', color='black', weight='bold',
             size='large')

if relative:
    #scale all but the heurstic the same
    for ax in axs.flat[:-1]:
        ax.set_rmin(min([a[0].min() for (c,a) in info[1:-1]]+[0]))
        ax.set_rmax(max([a[0].max() for (c,a) in info[1:-1]]+[1]))
else:
    #scale them all the same
    for ax in axs.flat:
        ax.set_rmin(min([a[0].min() for (c,a) in info[1:]]+[0]))
        ax.set_rmax(max([a[0].max() for (c,a) in info[1:]]))

plt.show()

In [None]:
info0 = ([
    ['dT', 'dV', '1-chlorohexane', '2-chlorohexane', '3-chlorohexane', 'Na' ]]+
    [(models[i], [[b for b in act] for act in get_conditional_actions(gppo[i])[1]]) for i in range(len(models))]+
    [('Heuristic', [[b for b in act] for act in get_conditional_actions(gheuristic)[1]])]

)

# Mean Action Given a Target Material

In [None]:
#for i in range(len(CWtargs)):
i = 6

info=[info0[0]]+[(md[0],md[1][i:i+1]) for md in info0[1:]]


theta = radar_factory(len(info[0]), frame='polygon')
fig, axs = plt.subplots(figsize=(22, 4), nrows=1, ncols=len(models)+1,subplot_kw=dict(projection='radar'))
fig.subplots_adjust(wspace=0.5, hspace=0.25, top=0.85, bottom=0.05)
c=['b', 'r', 'g', 'm', 'y',"orange","k"][i:i+1]
make_radar(theta,axs,info,colors=c)
labels = ["Target: "+CWtargs[i]]
legend = axs[0].legend(labels, loc=(0.9, .95),labelspacing=0.1, fontsize='small')
fig.text(0.5, 1.0, "Mean Value of Actions VS Target Material (Trained with 0.5M Steps)",
             horizontalalignment='center', color='black', weight='bold',
             size='large')
plt.show()

# Mean action at each timestep (Same Target)

In [None]:
all_act = ([
actions_by_time(target_subset(gp,len(CWtargs),i)) for gp in gppo]
+[actions_by_time(target_subset(gheuristic,len(CWtargs),i))])
actions = ['dT', 'dV', '1-chlorohexane', '2-chlorohexane', '3-chlorohexane', 'Na' ]
colors = ["r","g","b","c","y","m"]
names=models+["Heuristic"]

plotabt(all_act,actions,colors,names,title_ext="(GenWurtzReact Targeting %s)"%CWtargs[i])

plt.savefig(f"Legacy/Figures/WurtzReact/Target-{CWtargs[i]}.pdf",bbox_inches='tight')

# Fict React:

Reactions at play:

A+B $\rightarrow$ E</br>
A+D $\rightarrow$ F</br>
B+D $\rightarrow$ G</br>
F+G $\rightarrow$ I</br>

In [None]:
from chemistrylab.reactions.available_reactions.fict_react2 import PRODUCTS as FRtargs
from chemistrylab.reactions.available_reactions.fict_react2 import REACTANTS as FRchoices

obj = lambda x: get_conditional_rewards(x,FRtargs)[1].mean()

#obj = lambda x: -get_conditional_rewards(x,FRtargs)[1].mean()

folders = load_rollouts("FictReact-v2",obj=obj,last=False,verbose=True)#\\PPO\\09-03-2023--15-14-59",last=False)
#folders = dict(PPO="MODELS/FictReact-v2/PPO\\09-03-2023--15-14-59\\rollout",
              #SAC="MODELS/FictReact-v2/SAC\\02-03-2023--15-55-48\\best_rollout",
              #A2C="MODELS/FictReact-v2/A2C\\02-03-2023--16-56-52\\best_rollout",
              #TD3="MODELS/FictReact-v2/TD3\\02-03-2023--18-04-00\\best_rollout")

models=[a for a in folders if a!="Heuristic"]

fppo = [folders[model] for model in models]

fheuristic = pd.read_pickle("MODELS/FictReact-v2/Heuristic/rollout")

In [None]:
relative=True

if relative:
    fhr=get_conditional_rewards(fheuristic,FRtargs)[1]
    info = ([get_conditional_rewards(fheuristic,FRtargs)[0]]+
[(models[i], [get_conditional_rewards(fppo[i],FRtargs)[1]/fhr]) for i in range(len(models))]+
[('Heuristic', [get_conditional_rewards(fheuristic,FRtargs)[1]])])

else:
    info = ([get_conditional_rewards(fheuristic,FRtargs)[0]]+
[(models[i], [get_conditional_rewards(fppo[i],FRtargs)[1]]) for i in range(len(models))]+
[('Heuristic', [get_conditional_rewards(fheuristic,FRtargs)[1]])])

In [None]:
theta = radar_factory(len(info[0]), frame='polygon')

fig, axs = plt.subplots(figsize=(22, 7), nrows=1, ncols=len(models)+1,subplot_kw=dict(projection='radar'))

#axs = np.array([axs])

fig.subplots_adjust(wspace=0.5, hspace=0.25, top=0.85, bottom=0.05)

make_radar(theta,axs,info,colors = "r",gridlines=[0.0,0.2,0.4,0.6,0.8])

labels = ('Return', '-', '1', 'Factor 4', 'Factor 5')
legend = axs[0].legend(labels, loc=(0.9, .95),labelspacing=0.1, fontsize='small')


fig.text(0.5, 0.8, "Average Return VS Target Material (Best model trained with 0.5M Steps)",
             horizontalalignment='center', color='black', weight='bold',
             size='large')

if relative:
    #scale all but the heurstic the same
    for ax in axs.flat[:-1]:
        ax.set_rmin(max(0,min([a[0].min() for (c,a) in info[1:-1]]+[0])))
        ax.set_rmax(max([a[0].max() for (c,a) in info[1:-1]]+[1]))
else:
    #scale them all the same
    for ax in axs.flat:
        ax.set_rmin(min([a[0].min() for (c,a) in info[1:]]+[0]))
        ax.set_rmax(max([a[0].max() for (c,a) in info[1:]]))
        
#plt.savefig("Legacy/Figures/FictReactBest.pdf",bbox_inches='tight')
plt.show()

In [None]:
tmp=folders["TD3"]

obs = np.stack(tmp.InState)
tmp[obs[:,-1]>0.9].shape[0]/20
print(tmp[tmp.Done==True].Reward.mean(),calc_return(tmp))


# Mean Action VS Target Material

In [None]:
info0 = ([
    ["dT","dV"]+FRchoices]+
    [(models[i], [[b for b in act] for act in get_conditional_actions(fppo[i],FRtargs)[1]]) for i in range(len(models))]+
    [('Heuristic', [[b for b in act] for act in get_conditional_actions(fheuristic,FRtargs)[1]])]

)

In [None]:
#for i in range(len(FRtargs)):
i=0
info=[info0[0]]+[(md[0],md[1][i:i+1]) for md in info0[1:]]


theta = radar_factory(len(info[0]), frame='polygon')
fig, axs = plt.subplots(figsize=(22, 4), nrows=1, ncols=len(models)+1,subplot_kw=dict(projection='radar'))
fig.subplots_adjust(wspace=0.5, hspace=0.25, top=0.85, bottom=0.05)
c=['b', 'r', 'g', 'm', 'y',"orange","r"][i:i+1]
make_radar(theta,axs,info,colors=c)
labels = ["Target: "+FRtargs[i]]
legend = axs[0].legend(labels, loc=(0.9, .95),labelspacing=0.1, fontsize='small')
fig.text(0.5, 1.0, "Mean Value of Actions VS Target Material (Trained with 0.5M Steps)",
             horizontalalignment='center', color='black', weight='bold',
             size='large')
plt.show()

# ... At each Timestep

In [None]:
i=3

all_act = ([
actions_by_time(target_subset(fp,len(FRtargs),i)) for fp in fppo]
+[actions_by_time(target_subset(fheuristic,len(FRtargs),i))])
colors = ["r","g","b","c","y","m"]
names=models+["Heuristic"]

plotabt(all_act,["dT","dV"]+FRchoices,colors,names,title_ext="(FictReact Targeting %s)"%FRtargs[i])

plt.savefig(f"Legacy/Figures/FictReact/Target-{FRtargs[i]}.pdf",bbox_inches='tight')


# Handling Box-Discrete actions

In [None]:
def get_discrete_actions(frame,N=None,N2=None):
    """
    Gives distribution of actions (index 0) taken as well as the average value of the actions at index 1
    Inputs:
        Frame (dataframe) - Pandas Dataframe containing gym information        
    Outputs:
        act0 (list<float>) - Action (index 0) distribution
        act1 (list(float)) - Average action at index 1
    
    """
    # turn observation column into a numpy array
    act = np.stack(frame.Action)    
    if len(act.shape)<2:
        act0=act
        act=np.zeros(act0.shape+(2,),dtype=np.int32)
        act[:,0]=act0//N2
        act[:,1] = act0%N2
        
    if N is None:
        N = np.max(act[:,0])
    N0= np.max(act[:,1])
    act0=[]
    act1=[]
    #print(N)
    for i in range(N+1):
        #gather all data where the target is targets[N]
        cframe=act[act[:,0]==i]
        
        #print(cframe)
        #Obtain the mean action of these episodes
        act0+=[len(cframe)/act.shape[0]]
        if len(cframe)==0:
            act1+=[0]
        else:
            act1+=[cframe[:,1].mean()/N0]
    return [act0,act1]

In [49]:
def discrete_actions_by_time(frame,N=None,N2=None):
    """Gives the mean action at each timestep of your rollout dataframe when actions are discrete
    
    Inputs:
        frame (dataframe) - Pandas Dataframe containing rollouts
        N (int)           - Number of actions in MultiDiscrete dim 0
        N2 (int)          - Number of actions in MultiDiscrete dim 1
    
    """
    min_t,max_t = frame.Step.min(),frame.Step.max()
    mean_act=[]
    
    act = np.stack(frame.Action)  

    
    if N is None:
        N = np.max(act[:,0])
    
    for t in range(min_t,max_t+1):
        act = np.stack(frame.Action[frame.Step==t])
        if len(act.shape)<2:
            act0=act
            act=np.zeros(act0.shape+(2,),dtype=np.int32)
            act[:,0]=act0//N2
            act[:,1] = act0%N2
        mean_act_t =np.zeros(N+1)
        for i in range(N+1):
            mean_act_t[i] = (act[:,0]==i).mean()
        mean_act+=[mean_act_t]
    return np.array(mean_act)


def hashed_trajectories(frame,N=None,N2=None):
    """Turns rollouts into a dictionary of trajectories and counts by hashing episodes based on the actions taken.
    
    Inputs:
        frame (dataframe) - Pandas Dataframe containing rollouts
        N (int)           - Number of actions in MultiDiscrete dim 0
        N2 (int)          - Number of actions in MultiDiscrete dim 1
    
    Outputs:
        trajectories (dict) - String representations of actions are keys and the number appearances are values
    
    Example: 
    
    >>> print(frame) 
    >>>         InState  Action  Reward  OutState   Done Info  Step
            0  0.631918  [0, 9]     0.0  0.632225  False   {}     0
            1  0.632225  [0, 9]     0.0    0.6319  False   {}     1
            2    0.6319  [4, 4]     0.8    0.6319   True   {}     2
            
    >>> hashed_trajectories(frame)
    >>> {'090944': 1}
    

    
    """
    min_t,max_t = frame.Step.min(),frame.Step.max()
    mean_act=[]
    
    act = np.stack(frame.Action)  

    trajectories=dict()
    
    if N is None:
        N = np.max(act[:,0])
    act_string=""
    for t,act in enumerate(frame.Action):
        if len(act.shape)<1:
            act0=int(act)
            act=np.zeros((2,),dtype=np.int32)
            act[0]=act0//N2
            act[1] = act0%N2
        act_string+=(str(act[0])+str(act[1]))
        
        if frame.Done[t]:
            trajectories[act_string] = trajectories.get(act_string,0)+1
            act_string=""
            
    return trajectories




In [64]:
example = pd.read_pickle("Legacy/Legacy5/MODELS\\WurtzDistill-v1\\Heuristic\\rollout")[:3]
for i,s in enumerate(example.InState):
    example.InState[i]=s.mean()
for i,s in enumerate(example.OutState):
    example.OutState[i]=s.mean()
    
print(example)
hashed_trajectories(example)

    InState  Action  Reward  OutState   Done Info  Step
0  0.631918  [0, 9]     0.0  0.632225  False   {}     0
1  0.632225  [0, 9]     0.0    0.6319  False   {}     1
2    0.6319  [4, 4]     0.8    0.6319   True   {}     2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  example.InState[i]=s.mean()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  example.OutState[i]=s.mean()


{'090944': 1}

# Distillation Bench Results

In [None]:
folders = load_rollouts("WurtzDistill-v1",obj=None,last=False,verbose=False)

#folders.update(load_rollouts("DiscreteWurtzDistill-v1",obj=default_obj,last=True))

models = [a for a in folders]
dppo = [folders[model] for model in models]
dheuristic = pd.read_pickle("Legacy/Legacy5/MODELS\\WurtzDistill-v1\\Heuristic\\rollout")
#dheuristic = pd.read_pickle("FR2H/rollout")

In [None]:
info = ([['dT', 'Pour 0->1', 'Pour 1->2', 'Wait', 'End Experiment' ]]+
[(models[i], get_discrete_actions(dppo[i],N=4,N2=10)[::-1]+
 [[default_obj(dppo[i])]*5]
 ) for i in range(len(models))]+
[('Heuristic', get_discrete_actions(dheuristic,N=4)[::-1]
+[[dheuristic[dheuristic.Done==True].Reward.mean()]*5])])

In [None]:
theta = radar_factory(len(info[0]), frame='polygon')
fig, axs = plt.subplots(figsize=(22, 4), nrows=1, ncols=len(models)+1,subplot_kw=dict(projection='radar'))
fig.subplots_adjust(wspace=0.5, hspace=0.25, top=0.85, bottom=0.05)
c=['b', 'r', 'g', (0,0.1,0), 'y',"orange","r"]#[i:i+1]
make_radar(theta,axs,info,colors=c)
labels = ( 'Action average sub-value','Percent Action Taken',"Average Return")
legend = axs[0].legend(labels, loc=(0.9, .95),labelspacing=0.1, fontsize='small')
fig.text(0.5, 1.0, "Comparison of the frequency of actions taken (Trained with 50K Steps)",
             horizontalalignment='center', color='black', weight='bold',
             size='large')



#scale them all the same
for ax in axs:
    ax.set_rmin(0)
    ax.set_rmax(1)

plt.show()

In [None]:
if False:
    all_act = ([
    discrete_actions_by_time(dp,N=4,N2=10) for dp in dppo]
    +[discrete_actions_by_time(dheuristic,N=4,N2=10)])
    colors = ["r","g","b","c","y","m"]
    names=models+["Heuristic"]
    actions=['dT', 'Pour 0->1', 'Pour 1->2', 'Wait', 'End Experiment' ]
    plotabt(all_act,actions,colors,names,title="Frequency of each action type VS step (WurtzDistill)")

In [None]:
def distill_relabel(trajectory):
    
    B1_full=False
    
    pouring_actions = {"1","2"}
    
    wait_string = "31"
    
    new_trajectory=""
    
    for action,param in zip(trajectory[::2],trajectory[1::2]):
        
        #standardize the end experiment actions
        if action == "4":param="5"
        
        
        if param=="0":
            #Pouring zero amount is the same as waiting
            if action in pouring_actions:
                new_trajectory+=wait_string
            else:
                new_trajectory+=action+param
        else:
            if action == "1": #Pouring 0->1
                B1_full=True
                new_trajectory+=action+param
            if action == "0": #Potentially Boiling from 0->1
                B1_full=True
                new_trajectory+=action+param
            elif action=="2" and not B1_full: #Pouring from 1->2 won't do anything
                #draining stuff from empty B2
                new_trajectory+=wait_string
            else:
                new_trajectory+=action+param
    return new_trajectory

def distill_redo_dict(trajectories):
    new_traj=dict()
    for key in trajectories:
        key2 = distill_relabel(key)
        new_traj[key2]=new_traj.get(key2,0)+trajectories[key]
    return new_traj

In [None]:
folders = load_rollouts("WurtzDistill-v1",obj=None,last=False,TOL=1e-2)
models = [a for a in folders]
dppo = [folders[model] for model in models]

In [None]:



actions = ['dT', 'Pour 0->1', 'Pour 1->2', 'Wait', 'End Experiment' ]
toact = lambda x: "|".join([actions[int(i)]+":"+x[2*j+1] for j,i in enumerate(x[::2])])
tograph = lambda x: [int(i)*10+int(x[2*j+1]) for j,i in enumerate(x[::2])]

i=0
print(models[i])
hashes = hashed_trajectories(dppo[i],N=4,N2=10)

hashes=distill_redo_dict(hashes)

#hashes={"090945":1}

sorted_hashes = sorted([a for a in hashes],key=lambda x:hashes[x],reverse=True)
sorted_amounts = [hashes[x] for x in sorted_hashes]

L=max(12,len(sorted_hashes[0])//2)

print(sorted_hashes[0],sorted_amounts[0],sum(sorted_amounts))

toact(sorted_hashes[0])

tograph = lambda x: [int(i)*10+float(x[2*j+1])*0.78+(9-9*.78)/2 for j,i in enumerate(x[::2])]

fig = plt.figure(1,figsize=(7,3), dpi=240, facecolor='w', edgecolor='k')

for j,act in enumerate(actions[::-1]):
    j=len(actions)-j-1
    plt.fill_between([-0.5,L],[j*10-0.5,j*10-0.5],[j*10+9.5,j*10+9.5],label=act,alpha=0.5)
    plt.text(L*0.9875,j*10+5,act,horizontalalignment="right",bbox=dict(boxstyle="square",facecolor="w",edgecolor="k",alpha=0.2))
    
for a,string in enumerate(sorted_hashes):
    if len(string)<26 or a<10:
        plt.plot(tograph(string),"k.-",ms=5,alpha=(sorted_amounts[a]/sorted_amounts[0])**1)

        

plt.xlabel("Step")
plt.ylabel("Action")
plt.title(models[i]+ " Most Common Trajectories (Distillation Bench: All 10 runs, Best Checkpoint)")

plt.xlim(-0.5,L)
plt.ylim(-0.5,49.5)
plt.yticks([1,4.5,8,11,18,21,28,31,38],[-1,0,1,0,1,0,1,0,1])


#plt.savefig(f"Legacy/figures/Distillation/{models[i]}-{sorted_hashes[0][:20]}-{sorted_amounts[0]}.pdf",bbox_inches='tight')


plt.show()
#plt.legend()

# Extraction Bench Results

In [None]:
#folders = load_rollouts("Legacy\\Legacy5\\MODELS\\DiscreteWurtzExtract-v1",obj=default_obj,last=False,TOL=0)

folders = load_rollouts("MODELS\\DiscreteWurtzExtract-v1",obj=default_obj,last=False,TOL=0,verbose=False)

folders.update(load_rollouts("Legacy\\Legacy5\\MODELS\\DiscreteWurtzExtract-v1\\PPO-XL",obj=default_obj,last=False,TOL=0))

models = [a for a in folders]
eppo = [folders[model] for model in models]

eheuristic = pd.read_pickle("Legacy/Legacy5/MODELS\\WurtzExtract-v1\\Heuristic\\rollout")

#action_set = ['Draining from ExV to Beaker1', 'Mix ExV', "Mix B1", "Mix B2", "Pour from B1 to ExV", "Pour from B1 to B2",
#              'Pour from ExV to B2', 'Add oil, pour from Oil Vessel to ExV', 'wait', 'Done']

action_set=["Drain EV to B1", "Mix EV","Pour B1 into EV","Pour B2 into EV", 
            "Pour EV into B2", "Pour S1 into EV", "Pour S2 into EV","End Experiment"]


N = len(action_set)-1
info = ([action_set]+
[(models[i], get_discrete_actions(eppo[i],N,5)[::-1]+
 [[eppo[i][eppo[i].Done==True].Reward.mean()]*(N+1)]
 ) for i in range(len(models))]+
[('Heuristic', get_discrete_actions(eheuristic,N)[::-1]
+[[eheuristic[eheuristic.Done==True].Reward.mean()]*(N+1)])])

In [None]:
theta = radar_factory(len(info[0]), frame='polygon')
fig, axs = plt.subplots(figsize=(22, 4), nrows=1, ncols=len(models)+1,subplot_kw=dict(projection='radar'))
fig.subplots_adjust(wspace=0.5, hspace=0.25, top=0.85, bottom=0.05)
c=['b', 'r', 'g', 'm', 'y',"orange","r"]#[i:i+1]
make_radar(theta,axs,info,colors=c)
labels = ( 'Action average sub-value','Percent Action Taken',"Average Return")
legend = axs[0].legend(labels, loc=(0.9, .95),labelspacing=0.1, fontsize='small')
fig.text(0.5, 1.0, "Comparison of the frequency of actions taken (Trained with 2M Steps)",
             horizontalalignment='center', color='black', weight='bold',
             size='large')

for ax in axs.flat:
    ax.set_rmin(min([np.array(a).min() for (c,a) in info[1:]]+[0]))
    ax.set_rmax(max([np.array(a).max() for (c,a) in info[1:]]+[1]))
plt.show()

In [None]:
actions=["Drain EV to B1", "Mix EV","Pour B1 into EV","Pour B2 into EV", 
            "Pour EV into B2", "Pour S1 into EV", "Pour S2 into EV","End Experiment"]
N = len(actions)-1

all_act = ([
discrete_actions_by_time(ep,N,5) for ep in eppo]
+[discrete_actions_by_time(eheuristic,N,5)])
colors = ["r","g","b","c","y","m"]+["k"]*10
names=models+["Heuristic"]

plotabt(all_act,actions,colors,names,title="(Normalized) Number of times Each Action was taken at each Step (WurtzExtract)")

In [None]:
def extract_relabel(trajectory):
    B2_full=False
    
    B1_full=False
    
    pouring_actions = {"0","2","3","4","5","6"}
    
    wait_string = "82"
    
    new_trajectory=""
    
    for action,param in zip(trajectory[::2],trajectory[1::2]):
        
        #standardize the end experiment actions
        if action=="7":param="2"
        
        if param=="0":
            #Pouring zero amount is the same as waiting
            if action in pouring_actions:
                new_trajectory+=wait_string
            else:
                new_trajectory+=action+param
        else:
            if action == "4": #Pouring stuff into B2
                B2_full=True
                new_trajectory+=action+param
            elif action=="3" and not B2_full:
                #draining stuff from empty B2
                new_trajectory+=wait_string
            elif action == "0": #Pouring stuff into B1
                B1_full=True
                new_trajectory+=action+param
            elif action=="2" and not B1_full:
                #draining stuff from empty B1
                new_trajectory+=wait_string
            else:
                new_trajectory+=action+param
    return new_trajectory

def extract_redo_dict(trajectories):
    new_traj=dict()
    for key in trajectories:
        key2 = extract_relabel(key)
        new_traj[key2]=new_traj.get(key2,0)+trajectories[key]
    return new_traj

In [None]:
actions=["Drain EV to B1", "Mix EV","Pour B1 into EV","Pour B2 into EV", 
            "Pour EV into B2", "Pour S1 into EV", "Pour S2 into EV","End Experiment","Wait"]

toact = lambda x: "|".join([actions[int(i)]+":"+x[2*j+1] for j,i in enumerate(x[::2])])

i=3
print(models[i])
hashes = hashed_trajectories(eppo[i],N=8,N2=5)

hashes = {"518282820104040404"+"5161828282010404"+"41618282820304043472":1}

hashes=extract_redo_dict(hashes)

sorted_hashes = sorted([a for a in hashes],key=lambda x:hashes[x],reverse=True)
sorted_amounts = [hashes[x] for x in sorted_hashes]


print(sorted_hashes[0],sorted_amounts[0],sum(sorted_amounts))

fig = plt.figure(1,figsize=(7,4), dpi=240, facecolor='w', edgecolor='k')

tograph = lambda x: [int(i)*10+float(x[2*j+1])*1.58+(9-9*.78)/2 for j,i in enumerate(x[::2])]

L=max(8,len(sorted_hashes[0])//2+4)

for j,act in enumerate(actions[::-1]):
    j=len(actions)-j-1
    plt.fill_between([-0.5,L],[j*10-0.5,j*10-0.5],[j*10+9.5,j*10+9.5],label=act,alpha=0.5)
    plt.text(L*0.9875,j*10+5,act,horizontalalignment="right",bbox=dict(boxstyle="square",facecolor="w",edgecolor="k",alpha=0.2))
    
for a,string in enumerate(sorted_hashes[:1]):
    #string = extract_relabel(string)
    if len(string)<40 or a<10:
        plt.plot(tograph(string),"k.-",ms=5,alpha=(sorted_amounts[a]/sorted_amounts[0])**1.0)

plt.xlabel("Step")
plt.ylabel("Action")
#plt.title(models[i]+ " Most Common Trajectories (Extraction Bench: Best Checkpoint of 10 runs)")

plt.title("Extract Policy (0.59 reward)")

#plt.title("91% purity steps")
plt.xlim(-0.5,L)
plt.ylim(-0.5,89.5)
plt.yticks([1,8,11,18,21,28,31,38,41,48,51,58,61,68],[0,1,0,1,0,1,0,1,0,1,0,1,0,1])
#plt.yticks([])
#plt.savefig(f"Legacy/figures/DiscreteExtraction/{models[i]}-{sorted_hashes[0][:20]}-{sorted_amounts[0]}.pdf",bbox_inches='tight')

plt.savefig(f"Legacy/figures/DiscreteExtraction/Heuristic.pdf",bbox_inches='tight')

plt.show()
#plt.legend()

In [None]:
frame = eppo[0]

frame.Action[frame.Step==0][0]

In [None]:
1/0

# React Bench Training

In [None]:
import os
for algo in ["SAC","A2C","PPO","TD3"]:
    for seed in [101,201,301]:
        os.system("python RLTrain.py algorithm=%s seed=%d environment=GenWurtzReact-v1 steps=50000 n_envs=10 best_ratio=0.0 best_episodes=1"%(algo,seed))

In [None]:
import os
for algo in ["PPO","SAC","A2C","TD3"]:
        for seed in [101,201,301]:
            os.system("python RLTrain.py algorithm=%s seed=%d environment=FictReact-v2 steps=50000 n_envs=10 best_ratio=0.0 best_episodes=1"%(algo,seed))

In [None]:
            os.system("python RLTrain.py algorithm=%s seed=%d environment=FictReact-v2 steps=50000 n_envs=10 best_ratio=0.0 best_episodes=1"%(algo,seed))

# Extraction and Distillation Bench Training

In [None]:
import os 
for seed in [101,201,301]:
    os.system("python RLTrain.py algorithm=DQN seed=%d environment=DiscreteWurtzExtract-v1 steps=50000 n_envs=10 best_ratio=0.5 best_episodes=200"%(seed))
    
for seed in [401,501,601]:
    os.system("python RLTrain.py algorithm=DQN seed=%d environment=DiscreteWurtzExtract-v1 steps=200000 n_envs=10 best_ratio=0.5 best_episodes=200"%(seed))
    


In [None]:
import os
for seed in [101,201,301]:
    os.system("python RLTrain.py algorithm=DQN seed=%d environment=DiscreteWurtzDistill-v1 steps=50000 n_envs=10 best_ratio=1e-5 best_episodes=1"%(seed))

In [None]:
import os
#from RLTrain import ALGO
#print (ALGO)
for algo in ['A2C']:#,'PPO'
    for seed in [301]:
        os.system("python RLTrain.py algorithm=%s seed=%d environment=WurtzExtract-v1 steps=50000 n_envs=10 best_ratio=0.0 best_episodes=1"%(algo,seed))
        os.system("python RLTrain.py algorithm=%s seed=%d environment=WurtzDistill-v1 steps=50000 n_envs=10 best_ratio=0.0 best_episodes=1"%(algo,seed))

In [None]:
for algo in ['PPO', 'A2C']:
    os.system("python RLTrain.py algorithm=%s environment=WurtzExtract-v1 steps=50000"%algo)

In [None]:
algo = "A2C"
os.system("python RLTrain.py algorithm=%s environment=WurtzDistill-v1 steps=50000"%algo)

# Evaluation

In [None]:

import os

for a,b,c in os.walk("./MODELS"):
    #print(a)
    if "-v" in a and "2023" in a:
        print(a[2:])
        os.system("python RLTest.py %s steps=5000"%a)

In [None]:
os.system("python RLTest.py WDH environment=WurtzDistill-v1 algorithm=WDH steps=5000")

In [None]:
os.system("python RLTest.py MODELS\\GenWurtzReact-v1\\Heuristic environment=GenWurtzReact-v1 algorithm=WRH steps=5000")

In [None]:
for a in ["PPO_WurtzExtract-v1","A2C_WurtzExtract-v1"]:
    os.system("python RLTest.py %s steps=5000"%a)

In [None]:
import os
os.system("python RLTest.py MODELS\\WurtzReact-v1\\Heuristic environment=WurtzReact-v1 algorithm=WRH steps=5000")

# TODO:

Check this out https://github.com/yuanmingqi/rl-exploration-baselines

In [None]:
import os
os.system("python RLTest.py PPO_RE3_WurtzExtract-v1 steps=5000")

In [None]:
import os

for a,b,c in os.walk("./"):
    #print(a)
    if "2023" in a and not "Legacy" in a:
         os.system("python RLTest.py %s steps=5000 --best"%a)

In [None]:
import os

for seed in range(1,4):
    os.system("python RLTrain.py environment=DiscreteWurtzExtract-v1 algorithm=DQN n_envs=10 steps=200000 best_episodes=200 best_ratio=0.5 seed=%d01"%seed)