# Visualizing the Hyperparameter Study

In [1]:
import os
# Configure the S3 bucket used for artefact storage.
# Outcome is exposed through two names that later cells read:
#   SAVE_TO_S3 -- True when the bucket handle was created, False otherwise
#   bucket     -- the boto3 Bucket resource, or None when setup failed
# NOTE: boto3 builds resources lazily, so reaching the success message only
# shows that boto3 is importable and the resource was configured; credentials
# are not validated until the first real request.
bucket = None
try:
    import boto3

    s3 = boto3.resource(
        "s3",
        region_name="us-east-1",
        aws_secret_access_key=os.getenv("AWS_SECRET_AK"),
        aws_access_key_id=os.getenv("AWS_AK"),
    )
    bucket = s3.Bucket("ids703-nlp-finalproject")
    SAVE_TO_S3 = True
    print("[INFO] S3 connection successful.")
except Exception as exc:
    # Catch Exception (not a bare except) so Ctrl-C / SystemExit still propagate,
    # and report the cause instead of hiding it.
    print("[ERROR] Could not connect to S3! Only saving locally.")
    print(f"[ERROR] Reason: {exc!r}")
    bucket = None
    SAVE_TO_S3 = False


[INFO] S3 connection successful.


In [5]:
import optuna
from optuna.visualization import plot_parallel_coordinate

# Remote object key and local SQLite path of the hyperparameter study database.
STUDY_S3_KEY = "artefacts/tf_hyperparameter_study_real.db.experiment"
STUDY_DB_PATH = "../artefacts/tf_hyperparameter_study_real.db"

# Only touch `bucket` when the S3 setup cell succeeded; otherwise fall back to
# a copy that is already on disk instead of failing on an unusable handle.
if SAVE_TO_S3:
    os.makedirs(os.path.dirname(STUDY_DB_PATH), exist_ok=True)
    bucket.download_file(STUDY_S3_KEY, STUDY_DB_PATH)
elif not os.path.exists(STUDY_DB_PATH):
    # Fail early with a clear message rather than letting the SQLite driver
    # open a fresh, empty database at this path.
    raise FileNotFoundError(
        f"S3 is unavailable and no local study database exists at {STUDY_DB_PATH}"
    )
else:
    print("[WARN] S3 unavailable; using the local copy of the study database.")

study = optuna.load_study(study_name="tf_study001", storage=f"sqlite:///{STUDY_DB_PATH}")
plot_parallel_coordinate(study)

In [4]:
# Exploration aid: list every attribute of the loaded study object to see which
# accessors (e.g. `trials_dataframe`, `best_params`) are available.
dir(study)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_ask',
 '_is_multi_objective',
 '_log_completed_trial',
 '_optimize_lock',
 '_pop_waiting_trial_id',
 '_stop_flag',
 '_storage',
 '_study_id',
 '_tell',
 'add_trial',
 'add_trials',
 'ask',
 'best_params',
 'best_trial',
 'best_trials',
 'best_value',
 'direction',
 'directions',
 'enqueue_trial',
 'get_trials',
 'optimize',
 'pruner',
 'sampler',
 'set_system_attr',
 'set_user_attr',
 'stop',
 'study_name',
 'system_attrs',
 'tell',
 'trials',
 'trials_dataframe',
 'user_attrs']

In [4]:
# Tabulate every trial of the study, then display the 15 rows with the
# largest `value`, best first.
df = study.trials_dataframe()
df.sort_values("value", ascending=False).iloc[:15]

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_dropout_rate,params_embedding_dim,params_hidden_dense_dim,params_hidden_size,params_l2_reg,state
25,25,0.959143,2021-12-09 08:17:18.315207,2021-12-09 08:18:03.842860,0 days 00:00:45.527653,0.05029,6,8,8,3.607221e-07,COMPLETE
36,36,0.959143,2021-12-09 08:28:49.708562,2021-12-09 08:29:35.472703,0 days 00:00:45.764141,0.02176,6,8,8,2.447286e-06,COMPLETE
47,47,0.958786,2021-12-09 08:36:15.351088,2021-12-09 08:36:45.527458,0 days 00:00:30.176370,0.052554,6,8,5,4.20333e-08,COMPLETE
41,41,0.9585,2021-12-09 08:32:46.788529,2021-12-09 08:33:23.728641,0 days 00:00:36.940112,0.086796,6,7,5,2.615354e-06,COMPLETE
7,7,0.958429,2021-12-09 08:00:44.755692,2021-12-09 08:01:54.080176,0 days 00:01:09.324484,0.116723,6,4,8,2.3377e-06,COMPLETE
37,37,0.958429,2021-12-09 08:29:35.492797,2021-12-09 08:30:12.027124,0 days 00:00:36.534327,0.025751,6,8,5,6.405944e-05,COMPLETE
22,22,0.958143,2021-12-09 08:14:23.957767,2021-12-09 08:15:09.479959,0 days 00:00:45.522192,0.149637,6,7,7,7.993687e-06,COMPLETE
11,11,0.958,2021-12-09 08:04:16.037983,2021-12-09 08:05:14.939059,0 days 00:00:58.901076,0.113525,6,6,8,1.201905e-07,COMPLETE
42,42,0.957929,2021-12-09 08:33:23.748972,2021-12-09 08:34:00.363994,0 days 00:00:36.615022,0.068669,6,8,5,2.489374e-06,COMPLETE
17,17,0.957857,2021-12-09 08:10:15.212714,2021-12-09 08:11:08.657866,0 days 00:00:53.445152,0.212673,6,6,7,4.240627e-06,COMPLETE


# LR Range Test

In [7]:
#%%
from numpy.core.numeric import False_
import optuna
import numpy as np
import tensorflow as tf
from tensorflow.python.keras.layers.wrappers import Bidirectional
import torch.nn as nn
import joblib
from preprocessing_helpers import *
from data_collecting import hashtags
from tensorflow import keras
import os

# Configure the S3 bucket used for artefact storage.
# Outcome is exposed through two names that later code reads:
#   SAVE_TO_S3 -- True when the bucket handle was created, False otherwise
#   bucket     -- the boto3 Bucket resource, or None when setup failed
# NOTE: boto3 builds resources lazily, so reaching the success message only
# shows that boto3 is importable and the resource was configured; credentials
# are not validated until the first real request.
bucket = None
try:
    import boto3

    s3 = boto3.resource(
        "s3",
        region_name="us-east-1",
        aws_secret_access_key=os.getenv("AWS_SECRET_AK"),
        aws_access_key_id=os.getenv("AWS_AK"),
    )
    bucket = s3.Bucket("ids703-nlp-finalproject")
    SAVE_TO_S3 = True
    print("[INFO] S3 connection successful.")
except Exception as exc:
    # Catch Exception (not a bare except) so Ctrl-C / SystemExit still propagate,
    # and report the cause instead of hiding it.
    print("[ERROR] Could not connect to S3! Only saving locally.")
    print(f"[ERROR] Reason: {exc!r}")
    bucket = None
    SAVE_TO_S3 = False


#%%
# Restore the fitted encoder from disk.
# NOTE(review): joblib.load unpickles arbitrary objects -- only point it at
# artefacts produced by this project.
encoder = joblib.load("../artefacts/encoder.pickle")

# TODO: train on synth_data first
# Read the three synthetic splits (train / validation / test), in that order.
synth_train, synth_val, synth_test = map(
    pd.read_parquet,
    (
        "../data/synth_train.parquet",
        "../data/synth_val.parquet",
        "../data/synth_test.parquet",
    ),
)

# Encode every split into (inputs, labels) with the shared encoder.
(xtrain, ytrain), (xval, yval), (xtest, ytest) = (
    encode_dataframe(encoder, data=split_frame, mode="pytorch")
    for split_frame in (synth_train, synth_val, synth_test)
)

# Zero-pad the input sequences of each split (batch dimension first). Each
# split is padded on its own call.
xtrain, xval, xtest = (
    nn.utils.rnn.pad_sequence(sequences=split_inputs, batch_first=True, padding_value=0.0)
    for split_inputs in (xtrain, xval, xtest)
)

#%%
# Training configuration shared by the cells below.
BATCH_SIZE, NUM_EPOCHS = 64, 1
LEARNING_RATE = 10 ** -2.5  # roughly 3.16e-3


#%%
from tf_hyperparameter_tuning import get_compiled_model

# Keyword arguments forwarded to the model factory.
FINAL_PARAMS = dict(
    embedding_dim=32,  # 2 ** 5
    hidden_size=64,  # 2 ** 6
    hidden_dense_dim=64,  # 2 ** 6
    dropout_rate=0.1,
    l2_reg=0,
)

# Build the compiled model from the parameters above and the configured
# learning rate.
model = get_compiled_model(learning_rate=LEARNING_RATE, **FINAL_PARAMS)

#%%
# ----------------------------------------- Synthetic Data -----------------------------------------
# Batched tf.data pipelines over the padded synthetic splits; the labels are
# the integer category codes of the encoded targets.
synth_train_dataset = tf.data.Dataset.from_tensor_slices(
    (xtrain, ytrain.cat.codes.values)
).batch(BATCH_SIZE)
synth_val_dataset = tf.data.Dataset.from_tensor_slices(
    (xval, yval.cat.codes.values)
).batch(BATCH_SIZE)

from learningrate_rangetest import LRFinder

# Batches needed for one full pass over the training inputs. np.ceil returns a
# float, so convert explicitly: a step count has to be a whole number.
steps_per_epoch = int(np.ceil(len(xtrain) / BATCH_SIZE))

# Sweep the learning rate from start_lr to end_lr while training, then plot
# the result to pick a sensible learning rate.
lr_finder = LRFinder(model)
lr_finder.find(
    synth_train_dataset,
    start_lr=0.0001,
    end_lr=1,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,  # reuse the configured constant instead of a second literal
    steps_per_epoch=steps_per_epoch,
)

lr_finder.plot()

NameError: name '_C' is not defined