10 changes: 6 additions & 4 deletions adaptive_low_rank/data.py
@@ -172,22 +172,23 @@ def get_test_and_validation_dataset(
     target_key,
     validation_set_size = 256,
     test_set_size = 1024,
+    download_data = True,
 ):
   """Retrieves specified test and validation datasets."""
   try:
     test_ds = tfds.load(
         dataset_name,
         split="test",
         shuffle_files=False,
-        download=False,
+        download=download_data,
         data_dir=_DATA_DIR,
     )
   except ValueError:
     train_ds = tfds.load(
         dataset_name,
         split="train",
         shuffle_files=False,
-        download=False,
+        download=download_data,
         data_dir=_DATA_DIR,
     )
     test_ds = _carve_test_dataset(
@@ -233,21 +234,22 @@ def get_dataset(
     text_key,
     target_key,
     test_set_size = 1024,
+    download_data = True,
 ):
   """Retrieves specified train and test datasets."""
   train_ds = tfds.load(
       dataset_name,
       split="train",
       shuffle_files=True,
-      download=False,
+      download=download_data,
       data_dir=_DATA_DIR,
   )
   try:
     test_ds = tfds.load(
         dataset_name,
         split="test",
         shuffle_files=False,
-        download=False,
+        download=download_data,
         data_dir=_DATA_DIR,
     )
   except ValueError:
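The control flow being patched above — try the `test` split, fall back to carving one from `train` — can be sketched with a hypothetical stand-in for `tfds.load` (the loader, dataset size, and carving rule here are illustrative, not from the repo):

```python
def load_split(dataset, split, download):
    """Hypothetical stand-in for tfds.load: only a "train" split exists."""
    if split != "train":
        raise ValueError(f"split {split!r} not available")
    return list(range(2000))  # pretend dataset of 2000 examples

def get_dataset(dataset_name, test_set_size=1024, download_data=True):
    """Mirrors the patched control flow: fall back to carving a test set."""
    train_ds = load_split(dataset_name, "train", download=download_data)
    try:
        test_ds = load_split(dataset_name, "test", download=download_data)
    except ValueError:
        # No test split: carve the last `test_set_size` examples off train.
        test_ds = train_ds[-test_set_size:]
        train_ds = train_ds[:-test_set_size]
    return train_ds, test_ds

train, test = get_dataset("demo")
print(len(train), len(test))  # 976 1024
```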
15 changes: 13 additions & 2 deletions adaptive_low_rank/model.py
@@ -43,15 +43,26 @@
 _TOKENIZER_BASE_PATH = ""


+def get_pretrained_tokenizer(
+    tokenizer_path,
+    local_files_only = False,
+):
+  """Obtains pretrained tokenizer."""
+  return transformers.AutoTokenizer.from_pretrained(
+      tokenizer_path,
+      local_files_only=local_files_only,
+  )
+
+
 def get_model_tokenizer_path_from_name(
     model_name, get_tokenizer = False
 ):
   """Gets model or tokenizer path from model name."""
   base_path = _TOKENIZER_BASE_PATH if get_tokenizer else _MODEL_BASE_PATH
   if model_name == "bert":
-    return base_path + "bert_base_cased"
+    return base_path + "bert-base-cased"
   elif model_name == "roberta":
-    return base_path + "roberta_base"
+    return base_path + "roberta-base"
   else:
     raise ValueError(f"Unsupported model: {model_name}")

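The renamed return values match the hyphenated Hugging Face hub ids. As a quick sanity check, the corrected mapping can be exercised standalone (base paths are taken as empty strings for illustration, mirroring the blank `_TOKENIZER_BASE_PATH` above):

```python
def model_tokenizer_path_from_name(model_name, get_tokenizer=False,
                                   model_base="", tokenizer_base=""):
    """Standalone sketch of the corrected name-to-path mapping."""
    base_path = tokenizer_base if get_tokenizer else model_base
    if model_name == "bert":
        # Hub id is hyphenated: "bert-base-cased", not "bert_base_cased".
        return base_path + "bert-base-cased"
    elif model_name == "roberta":
        return base_path + "roberta-base"
    raise ValueError(f"Unsupported model: {model_name}")

print(model_tokenizer_path_from_name("bert"))  # bert-base-cased
```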
Expand Down
10 changes: 6 additions & 4 deletions adaptive_low_rank/requirements.txt
@@ -1,5 +1,7 @@
-tensorflow_text>=2.16.1
-tensorflow>=2.16.1
-transformers>=4.34.1
-numpy>=1.23.5
+tensorflow_text>=2.15,<2.16
+tensorflow>=2.15,<2.16
+transformers>=4.34.1,<=4.37
+numpy>=1.23.5,<=1.26
 absl-py>=1.2.0
+clu>=0.0.12,<0.1
+tensorflow-datasets>=4.8,<4.10
3 changes: 1 addition & 2 deletions adaptive_low_rank/train.py
@@ -221,9 +221,8 @@ def train(strategy):

   trainable_weights = prediction_model.trainable_variables
   logging.info(
-      "***Model trainable weights***: %d, %s",
+      "***Number of Model trainable weights***: %d",
       len(trainable_weights),
-      trainable_weights,
   )

   @tf.function
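The patched call now passes exactly one argument for its single `%d` placeholder, instead of also dumping every variable via `%s`. A minimal standalone version (plain strings stand in for `tf.Variable` objects):

```python
import logging

logging.basicConfig(level=logging.INFO, format="%(message)s")

trainable_weights = ["w1", "b1", "w2"]  # stand-ins for tf.Variable objects
# One %d placeholder, one argument: log the count, not the full variable dump.
logging.info(
    "***Number of Model trainable weights***: %d",
    len(trainable_weights),
)
```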
70 changes: 70 additions & 0 deletions automated_feature_engineering/README.md
@@ -0,0 +1,70 @@
# Automated Feature Engineering (AFE)

How can we simplify and automate the complex, time-consuming process of feature engineering for machine learning models, allowing users to focus on model development and analysis while improving model performance and uncovering hidden insights in their data? Our goal is to develop an automated feature engineering system that identifies which engineered features are optimal for a downstream ML task, producing an output such as:

![](featureImage.png)

AFE uses a set of feature importance masks to learn which features benefit most from which transformations. It uses local and global masking to determine the best transform functions for each feature, as well as which transformed features lead to optimal performance on the downstream task. Further details can be found in our papers on [automated feature engineering](https://arxiv.org/pdf/2406.04153) and [feature selection](https://arxiv.org/pdf/2304.03202).
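As a rough illustration of the masking idea (not the paper's implementation — the transforms, shapes, and thresholding rule below are invented for the sketch, and the mask weights are random placeholders where AFE would learn them):

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 4))              # raw features
transforms = [np.square, np.abs, np.tanh]  # candidate transform functions

# One block of engineered candidates per transform.
candidates = np.concatenate([t(X) for t in transforms], axis=1)  # (100, 12)

# A "global" mask assigns each engineered feature an importance in [0, 1];
# in AFE these weights are learned, here they are random placeholders.
global_mask = rng.uniform(size=candidates.shape[1])

# Keep only the candidates whose importance clears a threshold.
selected = candidates[:, global_mask > 0.5]
print(candidates.shape)
```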


[A notebook demonstrating AFE](GithubAutomatedFeatureEngineering_Demo.ipynb) on Google Cloud is included. This repository has built-in integration with Google Cloud for data retrieval, processing, and artifact storage.

It is easy to get started with AFE on the command line. For example, after
running `pip3 install -r requirements.txt`, we can run:
```
python3 trainer.py --project_id="gcp_project_id" --dataset_name=housingPrice --train_table_name=airbnb2023_float --target=price --task_type=regression --num_steps=10 --model_type=discovery --upload_features_to_bq=True
```

Here is the full list of possible command line flags:
```
--batch_buffer_size: Number of batches held in shuffling buffer.
(default: '32')
(an integer)
--batch_size: Batch size
(default: '2048')
(an integer)
--config: Configuration string for running pipeline from container.
--data_buffer_size: Dataset buffer size.
(default: '4096')
(an integer)
--data_name: Dataset name
(default: 'isolet')
--dataset_name: BigQuery dataset name for train and test.
--decay_rate: Decay rate
(default: '0.5')
(a number)
--decay_steps: Decay steps
(default: '500')
(an integer)
--feature_dim: Feature dimension
(an integer)
--gcs_output_path: GCS output path.
--learning_rate: Learning rate.
(default: '0.01')
(a number)
--logging_filename: Name of the file used for logging discovered or selected
features.
(default: 'features.json')
--model_type: Model type can be feature selection or discovery.
(default: 'discovery')
--num_mlp_layers: Number of MLP layers in MLP model
(default: '2')
(an integer)
--num_selected_features: Number of features for feature selection
(an integer)
--num_steps: Number of training steps
(default: '50')
(an integer)
--project_id: The BigQuery project ID.
--seed: Random seed
(default: '21')
(an integer)
--target: Name for the training target feature.
--task_type: Task type can be classification or regression.
(default: 'classification')
--test_table_name: Table name of the test dataset.
--train_table_name: Table name of the training dataset.
--upload_features_to_bq: Whether to upload features to BQ table.
(default: 'true')
```
Disclaimer: This is not an officially supported Google product.