Copyright 2020 Google LLC.

Licensed under the Apache License, Version 2.0 (the "License");

In [None]:
#@title License
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Get Generative AI API Key

Use the links below to get your API key!

* https://developers.generativeai.google/tutorials/setup

* https://makersuite.google.com/app/apikey


Then go to "Tools" > "Command Palette", search for "secrets", choose
"Open user secrets tab", and save your API key under a name such as `genAI-api-key` (the name this notebook reads below).





## Installs

NOTE: You will need to "Restart kernel" once the installations are complete before you proceed.

In [None]:
# Install the notebook's dependencies. pydantic is pinned to 1.10.2; the -I flag
# tells pip to ignore any already-installed version and -v makes the output verbose.
# The remaining packages are installed unpinned.
!pip install -Iv pydantic==1.10.2
!pip install matminer
!pip install google-ai-generativelanguage
!pip install google-generativeai



In [None]:
#@title read API key saved in notebook secrets tab.
from google.colab import userdata

# Store the raw key string read from the Colab secret named 'genAI-api-key'.
# genai.configure(api_key=YOUR_API_KEY) is called in the next cell, after
# `genai` has been imported; configure() itself returns None, so its result
# must not be used as the key.
YOUR_API_KEY = userdata.get('genAI-api-key')


In [None]:
#@title import and check the available models using your API key.
import os
import re
import google.generativeai as genai
import numpy as np
import pandas as pd
import ast
from matminer.datasets.convenience_loaders import load_expt_gap
from matminer.featurizers.composition import ElementProperty
from pymatgen.core import Composition # we will use this to parse formulas in the dataset


# Authenticate the client with the stored key, then print the name of every
# model that genai.list_models() yields.
genai.configure(api_key=YOUR_API_KEY)
available_models = genai.list_models()
for model_info in available_models:
  print(model_info.name)

models/chat-bison-001
models/text-bison-001
models/embedding-gecko-001


In [None]:
#@title download the matbench dataset

# Builds the feature table used by the rest of the notebook: load the
# experimental band-gap data, parse formulas, drop unusable rows, attach
# "magpie" element-property features, and save the result to CSV.
bandgap_dataset = load_expt_gap() # loads the experimental bandgap dataset
# Convert the formulas to pymatgen composition objects
bandgap_dataset['composition'] = [Composition(formula) for formula in bandgap_dataset['formula']]

# Check if compositions are valid
invalid_compositions = [composition for composition in bandgap_dataset['composition'] if not composition.valid]

# Drop rows with invalid compositions
# (rows are matched via isin() against the invalid Composition objects, then
# removed by index label).
bandgap_dataset = bandgap_dataset.drop(bandgap_dataset[bandgap_dataset['composition'].isin(invalid_compositions)].index)

# Print number of dropped rows
# print(f'Dropped {len(invalid_compositions)} rows with invalid compositions.')
# Featurize the 'composition' column with matminer's "magpie" ElementProperty
# preset; the returned frame replaces bandgap_dataset.
element_featurizer = ElementProperty.from_preset(preset_name="magpie")
bandgap_dataset = element_featurizer.featurize_dataframe(bandgap_dataset, col_id = 'composition', )
bandgap_dataset = bandgap_dataset[~ bandgap_dataset.isnull().any(axis=1)] # drop rows where we got a nan, in case any.

# Written with pandas' default index=True, so the row index becomes the first
# CSV column. NOTE(review): the prompts further down that skip "the first 4
# columns" presumably rely on this layout (index, formula, gap expt,
# composition) -- confirm before changing how the file is written.
bandgap_dataset.to_csv('expt_bandgap_w_magpie.csv')

Fetching expt_gap.json.gz from https://ndownloader.figshare.com/files/13464434 to /usr/local/lib/python3.10/dist-packages/matminer/datasets/expt_gap.json.gz


Fetching https://ndownloader.figshare.com/files/13464434 in MB: 0.051199999999999996MB [00:00, 15.02MB/s]     


ElementProperty:   0%|          | 0/6353 [00:00<?, ?it/s]

In [None]:
#@title Helpers to query the model and parse responses

def query_model(query_prompt: str, model_name: str) -> str:
  """Send a prompt to a text-generation model and return its result.

  Args:
    query_prompt: Prompt text forwarded to genai.generate_text.
    model_name: Name of the model to query.

  Returns:
    The `result` attribute of the object returned by genai.generate_text.
  """
  # One candidate at temperature 0.0 is requested on every call.
  generation_args = dict(
      model=model_name,
      prompt=query_prompt,
      candidate_count=1,
      temperature=0.0,
  )
  completion = genai.generate_text(**generation_args)
  return completion.result

def write_code_for_task(task_description: str,
                        parsed: bool = False,
                        parse_multiple: bool = False,
                        verbose: bool = True,
                        model_name: str = 'models/text-bison-001') -> str:
  """Ask the text model to write code for a task and optionally extract it.

  Code is located by splitting the response on the marker "python" followed by
  a newline (the tail of an opening markdown fence) and cutting each piece at
  the next triple backtick.

  Args:
    task_description: Natural-language prompt sent to the model.
    parsed: If True, return only the first extracted code block. Takes
      precedence over parse_multiple.
    parse_multiple: If True, return all extracted code blocks joined by
      newlines.
    verbose: If True, print the raw response, and also the extracted block
      when parsed is True.
    model_name: Model name passed to query_model.

  Returns:
    The extracted code, or the raw response when neither parsing flag is set.
    When a parsing flag is set but the response holds no code marker, an empty
    string is returned for both flags.
  """
  response = query_model(task_description, model_name)
  if verbose:
    print(response)
  if not (parsed or parse_multiple):
    return response
  # `response or ''` guards against a missing result (presumably None when the
  # service returns no candidate -- confirm against the client library).
  marker_chunks = (response or '').split('python\n')[1:]
  code_blocks = [chunk.split('```')[0] for chunk in marker_chunks]
  if parsed:
    code_block = code_blocks[0] if code_blocks else ''
    if verbose:
      print(code_block)
    return code_block
  return "\n".join(code_blocks)

# Code strings already approved for execution; exec_and_register_if_ok runs
# members of this set without prompting. Seeded with one CSV-loading snippet.
ok_code_registry = set()
ok_code_registry.add(
    'def load_csv(filename: str) -> pd.DataFrame:\n'
    '  """Load a .csv file into pandas."""\n'
    '  \n'
    '  df = pd.read_csv(filename)\n'
    '  return df\n'
)
# Code strings the user rejected; exec_and_register_if_ok refuses these.
bad_code_registry = set()

def local_exec(code):
  """Execute a string of Python source with access to this module's globals.

  The code runs in a scratch namespace initialised from a shallow copy of
  globals(). A single mapping is used for both globals and locals so that
  functions defined by `code` can resolve imports and names that `code` itself
  bound at its top level; with separate mappings those lookups fail with
  NameError. Names bound by `code` live only in the scratch namespace and are
  discarded when the call returns.

  NOTE: this executes arbitrary code; it performs no validation of its own.

  Args:
    code: Python source to execute.

  Returns:
    None (the return value of exec).
  """
  namespace = dict(globals())
  return exec(code, namespace)

def exec_and_register_if_ok(body, env_exec, code_conv_ctx=[]):
  """Run `body` via `env_exec` if it is approved, asking the user when unknown.

  A body found in ok_code_registry runs immediately; one found in
  bad_code_registry is refused. Anything else is printed and the user is asked
  to answer y/n, and the answer is recorded in the matching module-level
  registry. Any other answer is reported and the body is not run.

  Args:
    body: Code string to review and possibly execute.
    env_exec: Callable that receives the full code string to execute.
    code_conv_ctx: Earlier code strings; they are joined with blank lines and
      placed in front of `body` before execution.

  Returns:
    A tuple (executed, value): `executed` says whether env_exec was called and
    `value` is what it returned, or None when nothing ran.
  """
  approved = False
  if body in ok_code_registry:
    approved = True
  elif body in bad_code_registry:
    print('Nope!')
  else:
    # Show the candidate code between markers and ask for a verdict.
    for shown_line in ('###BEGIN###', body, '###END###'):
      print(shown_line)
    answer = input('Is the preceding code ok? (y/n)').lower()
    if answer == 'y':
      ok_code_registry.add(body)
      approved = True
    elif answer == 'n':
      bad_code_registry.add(body)
    else:
      print(f'Unexpected response: {answer} is ignored.')

  if not approved:
    return False, None
  # Prior context first, then the new body, separated by blank lines.
  full_source = "\n\n".join(code_conv_ctx) + "\n\n" + body
  return True, env_exec(full_source)

def simple_code_interpreter(env_exec=local_exec):
  """Interactive loop: describe a task, review the generated code, run it.

  Each round reads a task description from the user, asks the model for code
  (write_code_for_task with parse_multiple=True), and passes it to
  exec_and_register_if_ok together with the code executed in earlier rounds
  as context. Entering 'stop-stop' ends the loop.

  Args:
    env_exec: Callable used to execute the approved code string.

  Returns:
    None.
  """
  executed_codes = []
  while True:
    f_description = input('Describe what you want to do? Say stop-stop to end')
    if f_description == 'stop-stop':
      return
    predicted_f_code = write_code_for_task(f_description, parse_multiple=True)
    was_executed, _ = exec_and_register_if_ok(predicted_f_code, env_exec,
                                              code_conv_ctx=executed_codes)
    # Only code that was approved and actually run becomes context. Appending
    # unconditionally would re-execute rejected code as part of the context of
    # the next approved step.
    if was_executed:
      executed_codes.append(predicted_f_code)

## Prompt LLM to load the data and train a model.

In [None]:
# Single prompt covering the whole workflow: load the CSV, pick the feature
# columns, and cross-validate three regressors. parse_multiple=True joins every
# code block found in the response into one string.
out6 = write_code_for_task("""
 Load expt_bandgap_w_magpie.csv into a dataframe called bandgap_dataset.
 Except for the first 4 columns save the rest of the column
 names of the bandgap_dataset dataframe in the variable "feature_labels".

 Write code to train and evaluate XGBoost, Linear Regression and Random Forest models
 to predict the band gap (in column "gap expt") from the set of features (take only the columns in "feature_labels")?
 Do 5-fold cross validation and use negative mean absolute error for scoring.
 Import necessary libraries.""",
                           parse_multiple=True,
                           verbose=True)

# Run the generated code; unless it is already registered, the user is asked
# to approve it first.
_ = exec_and_register_if_ok(out6, local_exec)


```python
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
```

Load the data and extract the feature labels.
```python
bandgap_dataset = pd.read_csv("expt_bandgap_w_magpie.csv")
feature_labels = bandgap_dataset.columns[4:]
```

Split the data into training and test sets.
```python
X_train, X_test, y_train, y_test = train_test_split(bandgap_dataset[feature_labels], bandgap_dataset["gap expt"], test_size=0.2)
```

Train and evaluate XGBoost, Linear Regression and Random Forest models.
```python
# XGBoost
xgboost_model = XGBRegressor()
xgboost_scores = cross_val_score(xgboost_model, X_train, y_train, cv=5, scoring="neg_mean_absolute_error")
print("XGBoost mean absolute error:", np.mean(xgboost_scores))

# Linear Regression
linear_regression_model = LinearRegression()


## Try out different prompts

In [None]:
# @title Check code generation for different prompts for loading data and model evaluation.
# out1-out5 use parsed=True (first code block of the response only); out6 uses
# parse_multiple=True (all code blocks joined). Nothing is executed in this cell.

# Generic CSV-loading prompt.
out1 = write_code_for_task(
    'Write python code to load a csv file into a dataframe',
    parsed=True, verbose=True)

# Load the specific feature file and show its head.
out2 = write_code_for_task("""Write python code to Load file
 expt_bandgap_w_magpie.csv into a pandas dataframe called bandgap_dataset.
 print head of the dataframe""",
                           parsed=True,
                           verbose=True)

# Derive the feature column names.
out3 = write_code_for_task("""Except for the first 4 columns save the rest of the column
names of the bandgap_dataset dataframe in the variable "feature_labels"
print first 5 items of feature_labels. ```""",
                           parsed=True,
                           verbose=True)

# Train/evaluate models, assuming the dataframe and feature_labels already exist.
out4 = write_code_for_task("""You have  a dataframe called
 bandgap_dataset. It has columns "formula" and "gap expt", and several columns
 with features the feature columns are saved in the variable "feature_labels".
 Write code to train and evaluate XGBoost, Linear Regression and Random Forest models
 to predict the band gap ("gap expt") from the features (only take only the columns in "feature_labels")?
 Use negative mean absolute error for scoring. Import necessary libraries.""",
                           parsed=True,
                           verbose=True)

# Whole workflow in one prompt, without asking for cross validation.
out5 = write_code_for_task("""
 Load expt_bandgap_w_magpie.csv into a dataframe called bandgap_dataset.
 Except for the first 4 columns save the rest of the column
 names of the bandgap_dataset dataframe in the variable "feature_labels"
 Write code to train and evaluate XGBoost, Linear Regression and Random Forest models
 to predict the band gap (in column "gap expt") from the set of features (take only the columns in "feature_labels")?
 Use negative mean absolute error for scoring. Import necessary libraries.""",
                           parsed=True,
                           verbose=True)

# Whole workflow in one prompt, with 5-fold cross validation requested.
out6 = write_code_for_task("""
 Load expt_bandgap_w_magpie.csv into a dataframe called bandgap_dataset.
 Except for the first 4 columns save the rest of the column
 names of the bandgap_dataset dataframe in the variable "feature_labels".

 Write code to train and evaluate XGBoost, Linear Regression and Random Forest models
 to predict the band gap (in column "gap expt") from the set of features (take only the columns in "feature_labels")?
 Do 5-fold cross validation and use negative mean absolute error for scoring.
 Import necessary libraries.""",
                           parse_multiple=True,
                           verbose=True)

# Collected for side-by-side inspection of the generated code.
output_responses = [out1, out2, out3, out4, out5, out6]


In [None]:
#@title Execute single prompt to train and compare models with 5-fold cross validation
# Uses local_exec, the executor defined in the helpers cell; `my_exec` is not
# defined anywhere in this notebook and raised NameError.
_ = exec_and_register_if_ok(out6, local_exec)

In [None]:
# Run the out6 code again; if it was approved earlier it is already in
# ok_code_registry and executes without prompting.
_ = exec_and_register_if_ok(out6, local_exec)

XGBoost MAE: -0.4289969593568488
Linear Regression MAE: -42.91331668878381
Random Forest MAE: -0.438315558836152


In [None]:
#@title Execute training XGBoost, LinearRegression and RandomForest models.
# out5 was generated with parsed=True, so only the first code block of the
# model's response is reviewed and executed here.
_ = exec_and_register_if_ok(out5, local_exec)


###BEGIN###
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Load the data
bandgap_dataset = pd.read_csv("expt_bandgap_w_magpie.csv")

# Save the feature labels
feature_labels = bandgap_dataset.columns[4:]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    bandgap_dataset[feature_labels], bandgap_dataset["gap expt"], test_size=0.2
)

# Train the XGBoost model
xgboost_model = XGBRegressor()
xgboost_model.fit(X_train, y_train)

# Train the Linear Regression model
linear_regression_model = LinearRegression()
linear_regression_model.fit(X_train, y_train)

# Train the Random Forest model
random_forest_model = RandomForestRegressor()
random_forest_model.fit(X_train, y_train)

# Evaluate the models
xgboost_mae = mean_absolute_error(y_te

## Execute prompts in a code-interpreter style, step by step

In [None]:
#@title Do it step by step using multiple prompts.
# The three string literals below are bare expression statements: they are not
# passed to anything and have no effect when the cell runs. They appear to be
# example prompts to type into the interactive session started at the bottom
# of this cell.
"""Write python code to Load file
 expt_bandgap_w_magpie.csv into a pandas dataframe called bandgap_dataset.
 print head of the dataframe"""

"""Except for the first 4 columns save the rest of the column
names of the bandgap_dataset dataframe in the variable "feature_labels"
print first 5 items of feature_labels. ```"""

"""You have  a dataframe called bandgap_dataset it was loaded from file expt_bandgap_w_magpie.csv.
 It has columns "formula" and "gap expt", and several columns
 with features. Except for the first 4 columns save the rest of the column
  names of the bandgap_dataset dataframe in the variable "feature_labels".
 Write code to train and evaluate XGBoost, Linear Regression and Random Forest models
 to predict the band gap ("gap expt") from the features (only take only the columns in "feature_labels")?
 Use negative mean absolute error for scoring. Import necessary libraries."""

# Start the interactive loop; it reads task descriptions via input() until the
# user enters 'stop-stop'.
simple_code_interpreter(env_exec=local_exec)