<a href="https://colab.research.google.com/github/derekchased/uu.seed/blob/master/Kiwi_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
# load feathers
import json
import urllib.request
import glob

# analysis
import pandas as pd
import seaborn as sns
import numpy as np


# Functions

## Helper Functions

In [2]:
def dfByName(name):
    return feathers[name]["df"]

def dfByIndex(index):
    if index < len(feathers):
        return feathers[list(feathers)[index]]["df"]

def dfNames():
    return list(feathers)

def nameBydf(df):
    for ind, f in enumerate(feathers):
        if feathers[f]["df"] is df:
            return f

def get_feathers(github=False):
    if github:
        feathers = get_feathers_github()
    else:
        feathers = get_feathers_local()
        pass
    
    # Read each feather file into Pandas format and store in the dict
    for key in feathers:
        feathers[key]["df"] = pd.read_feather(feathers[key]["path"])
    
    return feathers
        
def get_feathers_github():
    # Load json feed from github trees api containing path to feather files
    with urllib.request.urlopen("https://api.github.com/repos/derekchased/uu-thesis-feathers/git/trees/master?recursive=1") as url:
        data = json.load(url)

    # Parse the json response and store in a dict of dicts
    base = "https://github.com/derekchased/uu-thesis-feathers/blob/master/"
    querystring = "?raw=true"
    feathers = {d["path"].replace(".feather","").replace("feather/","") : {"path":base+d["path"]+querystring} for d in data["tree"] if ".feather" in d["path"]}
    
    return feathers

def get_feathers_local(path="../kiwi_data/feather"):
    g = glob.glob(path + "/*.feather")
    feathers = {d.split("/")[-1].replace(".feather","") : {"path":d} for d in g}
    return feathers

## Analysis Functions

In [3]:
def aggregate_df_successrate(df):
    aggregation = df.groupby(["experiment id",'name'], 
        as_index=False).agg(
        avg_success_rate=('success rate', np.mean),
        std_success_rate=('success rate', np.std)).sort_values(
        by="avg_success_rate", inplace=False, ascending=False).reset_index(
        drop=True)
    return aggregation

# Load Data
The data is stored as Pandas Dataframes in a [feather file format](https://arrow.apache.org/docs/python/feather.html). They are loaded to [github lfs](https://docs.github.com/en/repositories/working-with-files/managing-large-files/about-git-large-file-storage).

In [4]:
feathers = get_feathers(github=True)

# Experiments

## List of experiments

In [5]:
for ind,name in enumerate(dfNames()):
    print(f"{ind}\t{name}")

0	kiwi_1214_exp1_fullset_m0TOm4_ganset_m0TOm4_train
1	kiwi_1214_exp1_ganset_m0TOm9_gen10
2	kiwi_1214_exp1_ganset_m0TOm9_gen3
3	kiwi_1214_exp1_ganset_m0TOm9_gen4
4	kiwi_1214_exp1_ganset_m0TOm9_gen6
5	kiwi_1214_exp1_ganset_m0TOm9_gen9
6	kiwi_1214_exp1_ganset_m0TOm9_train
7	kiwi_1214_exp1_prunedset_m0TOm9_gen3_relaxed
8	kiwi_1214_exp1_prunedset_m0TOm9_gen6_relaxed
9	kiwi_1214_exp1_prunedset_m0TOm9_train
10	kiwi_1214_pruned_all_exp1_gen10
11	kiwi_1214_pruned_all_exp1_gen3
12	kiwi_1214_pruned_all_exp1_gen4
13	kiwi_1214_pruned_all_exp1_gen6
14	kiwi_1214_pruned_all_exp1_gen9


## Summarize an Experiment

In [6]:
# Set a df to analyze
# df = dfByIndex(6)
df = dfByName("kiwi_1214_exp1_fullset_m0TOm4_ganset_m0TOm4_train")
df.loc[:,"success rate"].describe()

count    1130.000000
mean        0.074407
std         0.098442
min         0.000000
25%         0.000000
50%         0.040000
75%         0.100000
max         0.670000
Name: success rate, dtype: float64

In [7]:
# Preview half the data frame
df.iloc[:,0:8].head()

Unnamed: 0,name,run,experiment id,success rate,avg steps success,std steps success,num episodes,file
0,gauss sm r90 g3 e4,m4,e1train,0.0,,,100,230106_031614_gauss_sm_r90_g3_e4_normal_m4_230...
1,gauss drc r50 g3 e3,m4,e1train,0.02,328.0,34.0,100,230106_124924_gauss_drc_r50_g3_e3_normal_m4_23...
2,uni drc r90,m3,e1train,0.01,321.0,0.0,100,230105_103124_uni_drc_r90_normal_m3_230203_015...
3,gauss drc drs r50 g3 e3,m2,e1train,0.04,287.25,47.614992,100,230105_031344_gauss_drc_drs_r50_g3_e3_normal_m...
4,basenrml r90,m2,e1train,0.07,308.857143,51.709155,100,230104_164602_basenrml_r90_normal_m2_230204_05...


In [8]:
# Preview the components of the data frame
df.iloc[:,8:-1].head()

Unnamed: 0,gauss,uni,sca,drc,drs,sm,ast,r50,r60,r70,r80,r90,r100,g3,e2,e3,e4
0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1
1,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0
2,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
3,1,0,0,1,1,0,0,1,0,0,0,0,0,1,0,1,0
4,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [9]:
# View the columns (since it is a wide dataframe and some are hidden in head())
df.columns

Index(['name', 'run', 'experiment id', 'success rate', 'avg steps success',
       'std steps success', 'num episodes', 'file', 'gauss', 'uni', 'sca',
       'drc', 'drs', 'sm', 'ast', 'r50', 'r60', 'r70', 'r80', 'r90', 'r100',
       'g3', 'e2', 'e3', 'e4', 'e5'],
      dtype='object')

In [10]:
# Summary of Success Rate Data
df.iloc[:,:df.columns.get_loc("gauss")].describe()

Unnamed: 0,success rate,avg steps success,std steps success,num episodes
count,1130.0,824.0,824.0,1130.0
mean,0.074407,341.783967,83.947589,100.0
std,0.098442,92.572548,56.722956,0.0
min,0.0,171.5,0.0,100.0
25%,0.0,280.993182,36.25,100.0
50%,0.04,326.491935,85.359361,100.0
75%,0.1,387.9625,126.863617,100.0
max,0.67,735.0,258.5,100.0


In [11]:
# Summary of components
df.iloc[:,df.columns.get_loc("gauss"):-1].describe()

Unnamed: 0,gauss,uni,sca,drc,drs,sm,ast,r50,r60,r70,r80,r90,r100,g3,e2,e3,e4
count,1130.0,1130.0,1130.0,1130.0,1130.0,1130.0,1130.0,1130.0,1130.0,1130.0,1130.0,1130.0,1130.0,1130.0,1130.0,1130.0,1130.0
mean,0.539823,0.176991,0.340708,0.371681,0.39823,0.393805,0.017699,0.252212,0.0,0.0,0.261062,0.243363,0.243363,0.539823,0.181416,0.185841,0.172566
std,0.498632,0.38183,0.474157,0.483468,0.48975,0.488809,0.131914,0.434475,0.0,0.0,0.439408,0.429302,0.429302,0.498632,0.385533,0.38915,0.378039
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
# Summary of components as boolean
df.iloc[:,df.columns.get_loc("gauss"):-1].astype(bool).describe()

Unnamed: 0,gauss,uni,sca,drc,drs,sm,ast,r50,r60,r70,r80,r90,r100,g3,e2,e3,e4
count,1130,1130,1130,1130,1130,1130,1130,1130,1130,1130,1130,1130,1130,1130,1130,1130,1130
unique,2,2,2,2,2,2,2,2,1,1,2,2,2,2,2,2,2
top,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
freq,610,930,745,710,680,685,1110,845,1130,1130,835,855,855,610,925,920,935


## Aggregation of an Experiment

### Mean Success Rate and Std Dev Success Rate 

In [13]:
agg_df = aggregate_df_successrate(df)

In [14]:
agg_df[agg_df["avg_success_rate"] > 0]

Unnamed: 0,experiment id,name,avg_success_rate,std_success_rate
0,e1train,gauss sca drc r90 g3 e2,0.340,0.264102
1,e1train,sca drc r80,0.256,0.145190
2,e1train,uni sca sm r90,0.232,0.191755
3,e1train,gauss sca sm r80 g3 e3,0.226,0.098641
4,e1train,uni sca drc r90,0.224,0.100399
...,...,...,...,...
192,e1train,gauss drc sm r100 g3 e3,0.004,0.005477
193,e1train,gauss sm r80 g3 e4,0.002,0.004472
194,e1train,sm r100,0.002,0.004472
195,e1train,drc sm r100,0.002,0.004472


In [15]:
agg_df[agg_df['name'].str.contains("base")]

Unnamed: 0,experiment id,name,avg_success_rate,std_success_rate
40,e1train,base r100,0.13,0.084853
49,e1train,base r50,0.118,0.147547
54,e1train,basenrml r100,0.11,0.102225
102,e1train,base r90,0.07,0.051478
115,e1train,basenrml r90,0.06,0.027386
126,e1train,basenrml r80,0.054,0.032094
173,e1train,basenrml r50,0.018,0.0249


In [16]:
agg_df[agg_df['name'].str.contains("gauss sca drc")]

Unnamed: 0,experiment id,name,avg_success_rate,std_success_rate
0,e1train,gauss sca drc r90 g3 e2,0.34,0.264102
14,e1train,gauss sca drc r80 g3 e2,0.182,0.060166
27,e1train,gauss sca drc r50 g3 e4,0.156,0.125419
38,e1train,gauss sca drc r50 g3 e2,0.134,0.076681
42,e1train,gauss sca drc r80 g3 e4,0.126,0.079875
45,e1train,gauss sca drc r100 g3 e3,0.124,0.101143
53,e1train,gauss sca drc r80 g3 e3,0.11,0.051478
66,e1train,gauss sca drc r50 g3 e3,0.098,0.070498
73,e1train,gauss sca drc r100 g3 e2,0.094,0.099398


In [17]:
agg_df[agg_df['name'].str.contains("gauss sca drc(.*)g3 e2")]

  agg_df[agg_df['name'].str.contains("gauss sca drc(.*)g3 e2")]


Unnamed: 0,experiment id,name,avg_success_rate,std_success_rate
0,e1train,gauss sca drc r90 g3 e2,0.34,0.264102
14,e1train,gauss sca drc r80 g3 e2,0.182,0.060166
38,e1train,gauss sca drc r50 g3 e2,0.134,0.076681
73,e1train,gauss sca drc r100 g3 e2,0.094,0.099398
