## Resumable Experiments with Compsyn

In [1]:
from __future__ import annotations

import time
from datetime import datetime, timezone

from compsyn.config import CompsynConfig
from compsyn.trial import Trial
from compsyn.vectors import WordToColorVector

### Configuration

In [2]:
COMPSYN_ROOT_DIR = "/Volumes/LACIE/compsyn"  # change to a path on your local system where you store compsyn files

# Every compsyn file location below is derived from COMPSYN_ROOT_DIR; the
# browser driver settings are specific to the local machine.
config = CompsynConfig(
    work_dir=f"{COMPSYN_ROOT_DIR}/notebook_work_dir",
    jzazbz_array=f"{COMPSYN_ROOT_DIR}/jzazbz_array.npy",
    google_application_credentials=f"{COMPSYN_ROOT_DIR}/compsyn3-8cf6580619a9.json",
    driver_path="/usr/local/bin/geckodriver",
    driver_browser="Firefox",
)

# The trial is handed to every vector created later in the notebook.
trial = Trial(
    experiment_name="concreteness",
    trial_id="example-snippet",
    hostname="topside",
    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC "now"
    # formats to the same YYYY-MM-DD string.
    trial_timestamp=datetime.now(timezone.utc).strftime("%Y-%m-%d"),
)

print("\n", config)
print("\n", trial)

[1616466313] (compsyn.Trial)  INFO: work_dir: /Volumes/LACIE/compsyn/notebook_work_dir
[1616466313] (compsyn.Trial)  INFO: experiment: concreteness
[1616466313] (compsyn.Trial)  INFO: trial_id: example-snippet
[1616466313] (compsyn.Trial)  INFO: hostname: topside

 CompsynConfig
	jzazbz_array                   = /Volumes/LACIE/compsyn/jzazbz_array.npy
	google_application_credentials = /Volumes/LACIE/compsyn/compsyn3-8cf6580619a9.json
	driver_browser                 = Firefox
	driver_path                    = /usr/local/bin/geckodriver
	s3_bucket                      = None
	s3_region_name                 = None
	s3_endpoint_url                = None
	s3_access_key_id               = None
	s3_secret_access_key           = None
	log_level                      = 20
	log_file                       = None
	work_dir                       = /Volumes/LACIE/compsyn/notebook_work_dir

 Trial
	experiment_name = concreteness
	trial_id        = example-snippet
	hostname        = topside
	trial_time

We define some examples in code here, but larger-scale experiments could use CSV or JSON files to record which vectors should be created.

In [3]:
# Two-label excerpt of an experiment; a real experiment would list many more labels.
# Each row below is paired with the column names to form one metadata dict,
# and every value is kept as a string.
vectors_metadata = [
    dict(
        zip(
            ("label", "Bigram", "Conc.M", "Conc.SD", "Unknown", "Total", "Percent_known", "SUBTLEX", "Dom_Pos"),
            values,
        )
    )
    for values in (
        ("solid", "0", "4.42", "0.81", "0", "26", "1", "998", "Adjective"),
        ("woolly", "0", "3.96", "1.14", "1", "26", "0.96", "24", "Adjective"),
    )
]

We will create a `WordToColorVector` for each of the metadata entries.

In [4]:
def run_experiment(trial: Trial, vectors_metadata: list[dict[str, str]]) -> list[WordToColorVector]:
    """Create and run one ``WordToColorVector`` per metadata entry.

    Args:
        trial: The trial passed to every vector that is created.
        vectors_metadata: One dict per vector. Each dict must contain a
            ``"label"`` key; the whole dict is passed on as the vector's metadata.

    Returns:
        The vectors in the same order as ``vectors_metadata``, each after its
        ``run()`` method has been called. An empty input yields an empty list.

    Raises:
        KeyError: If an entry has no ``"label"`` key.
    """
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # system clock adjustments during a long run.
    start = time.perf_counter()
    vectors = []
    for vector_metadata in vectors_metadata:
        w2cv = WordToColorVector(
            label=vector_metadata["label"],
            metadata=vector_metadata,
            trial=trial,
        )
        w2cv.run()
        vectors.append(w2cv)
    print(f"run_experiment completed in {int(time.perf_counter() - start)} seconds")
    return vectors

# Builds and runs a vector for every label in vectors_metadata; takes ~3 minutes.
vectors = run_experiment(trial, vectors_metadata)

[1616466323] (compsyn.fetch_image_urls)  INFO: 'solid': 100 search results. Extracting links from 0:100
[1616466405] (compsyn.search_and_download)  INFO: 98/100 images successfully downloaded for 'solid'
[1616466405] (compsyn.WordToColorVector.solid)  INFO: saved 10.3KiB pickle to /Volumes/LACIE/compsyn/notebook_work_dir/concreteness/vectors/unnamed/solid/w2cv.pickle
[1616466408] (compsyn.ImageAnalysis)  INFO: solid is being compressed.
[1616466414] (compsyn.fetch_image_urls)  INFO: 'woolly': 100 search results. Extracting links from 0:100
[1616466526] (compsyn.search_and_download)  INFO: 99/100 images successfully downloaded for 'woolly'
[1616466526] (compsyn.WordToColorVector.woolly)  INFO: saved 11.4KiB pickle to /Volumes/LACIE/compsyn/notebook_work_dir/concreteness/vectors/unnamed/woolly/w2cv.pickle
[1616466530] (compsyn.ImageAnalysis)  INFO: woolly is being compressed.
run_experiment completed in 213 seconds


Since `WordToColorVector.run` knows when to `save` and `load` results for the given trial, this process is resumable. We can see this when we add an entry and re-run the experiment: only the newest entry needs to download images, while the already-saved objects are simply reloaded.

In [5]:
# A third label, added to show that only new entries trigger new work.
hoover_metadata = {
    "label": "hoover",
    "Bigram": "0",
    "Conc.M": "3.76",
    "Conc.SD": "1.23",
    "Unknown": "4",
    "Total": "29",
    "Percent_known": "0.86",
    "SUBTLEX": "162",
    "Dom_Pos": "0",
}
# Appending unconditionally would add a duplicate entry each time this cell is
# re-run, so only append when the label is not already present.
if all(entry["label"] != hoover_metadata["label"] for entry in vectors_metadata):
    vectors_metadata.append(hoover_metadata)
vectors = run_experiment(trial, vectors_metadata)

[1616466533] (compsyn.WordToColorVector.solid)  INFO: 98 raw images already downloaded
[1616466533] (compsyn.load_pickle)  INFO: loaded pickle from /Volumes/LACIE/compsyn/notebook_work_dir/concreteness/vectors/unnamed/solid/w2cv.pickle
[1616466536] (compsyn.ImageAnalysis)  INFO: solid is being compressed.
[1616466536] (compsyn.WordToColorVector.woolly)  INFO: 99 raw images already downloaded
[1616466536] (compsyn.load_pickle)  INFO: loaded pickle from /Volumes/LACIE/compsyn/notebook_work_dir/concreteness/vectors/unnamed/woolly/w2cv.pickle
[1616466540] (compsyn.ImageAnalysis)  INFO: woolly is being compressed.
[1616466546] (compsyn.fetch_image_urls)  INFO: 'hoover': 100 search results. Extracting links from 0:100
[1616466627] (compsyn.search_and_download)  INFO: 92/100 images successfully downloaded for 'hoover'
[1616466627] (compsyn.WordToColorVector.hoover)  INFO: saved 10.8KiB pickle to /Volumes/LACIE/compsyn/notebook_work_dir/concreteness/vectors/unnamed/hoover/w2cv.pickle
[16164666

`Vector` subclasses try to look good when printed, for easy inspection of their state.

In [7]:
# Print each vector's string form, preceded by a blank line and a single space.
for vector in vectors:
    print(f"\n {vector!s}")


 WordToColorVector(solid)
	Trial
		experiment_name = concreteness
		trial_id        = example-snippet
		hostname        = topside
		trial_timestamp = 2021-03-23
	metadata:
		label                                    = solid
		Bigram                                   = 0
		Conc.M                                   = 4.42
		Conc.SD                                  = 0.81
		Unknown                                  = 0
		Total                                    = 26
		Percent_known                            = 1
		SUBTLEX                                  = 998
		Dom_Pos                                  = Adjective
	generated data:
		(raw images available)
		raw_image_urls   = 100
		rgb_dist         = ['9.65e-08', '2.34e-08', '8.16e-09', '2.75e-08', '2.55e-08', '1.92e-09', '1.44e-08', '2.85e-07']
		jzazbz_dist      = [77.313, 156.422, 7.83, 47.582, 75.257, 381.679, 15.67, 80.0]
		jzazbz_dist_std  = [166.635, 214.535, 45.044, 98.286, 120.541, 256.147, 63.794, 118.99]

 WordToColorVector(wooll