<a href="https://colab.research.google.com/github/darrengao628/gptchem/blob/main/Fine_tune_ESM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers evaluate datasets requests pandas sklearn huggingface_hub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
Collecting sklearn
  Downloading sklearn-0.0.post1.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting huggingface_hub
  Downloading huggingface_hub-0.13.2-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [3

In [2]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub interactively; the training
# configuration further down sets push_to_hub=True, which needs a saved token.
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
def data_prep(seq):
    """Build sequence and label lists for cytosolic vs. membrane proteins.

    Parameters
    ----------
    seq : str or path-like
        Path to a CSV file with at least the columns ``"Sequence"`` and
        ``"Subcellular location [CC]"``.

    Returns
    -------
    tuple[list, list]
        ``(sequences, labels)`` of equal length. Proteins whose location
        text mentions "Cytosol" or "Cytoplasm" (and no membrane term) come
        first with label 0, followed by proteins whose location text mentions
        "Membrane" or "Cell membrane" (and no cytosolic term) with label 1.
        Proteins matching both groups, or neither, are left out.
    """
    import pandas as pd

    location_col = "Subcellular location [CC]"
    sequence_col = "Sequence"

    df = pd.read_csv(seq)
    # dropna() returns a new frame, so the result must be kept. Only the two
    # columns read below are checked, so a row is not thrown away because an
    # unrelated field happens to be empty.
    df = df.dropna(subset=[location_col, sequence_col])

    # astype(str) keeps the .str accessor usable even if the column was parsed
    # as a non-string dtype (e.g. an entirely empty column).
    location = df[location_col].astype(str)
    # na=False guarantees plain boolean masks that can be negated and used
    # for indexing.
    cyto = location.str.contains("Cytosol|Cytoplasm", na=False)
    mem = location.str.contains("Membrane|Cell membrane", na=False)

    # Keep only unambiguous proteins: exactly one of the two groups matches.
    cyto_seq = df.loc[cyto & ~mem, sequence_col].tolist()
    mem_seq = df.loc[mem & ~cyto, sequence_col].tolist()

    sequences = cyto_seq + mem_seq
    labels = [0] * len(cyto_seq) + [1] * len(mem_seq)
    return sequences, labels

In [4]:
# Build the sequence and label lists from the CSV; the path points into the
# Colab runtime's /content directory, so the file must be present there first.
seq,labels=data_prep("/content/500_sequences.csv")
# Sanity check: both lists are expected to have the same length.
len(labels),len(seq)

(279, 279)

In [5]:
def data_loader_prep(seq, labels, model_checkpoint, test_size=0.25, random_state=None):
    """Split sequences into train/test sets and tokenize each split.

    Parameters
    ----------
    seq : list
        Protein sequences.
    labels : list
        Class label for each entry of ``seq``.
    model_checkpoint : str
        Checkpoint name passed to ``AutoTokenizer.from_pretrained``.
    test_size : float or int, optional
        Forwarded to ``train_test_split``; defaults to 0.25 as before.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. The default ``None`` keeps
        the previous unseeded shuffling; pass an integer to make the split
        reproducible across runs.

    Returns
    -------
    tuple
        ``(train_ds, train_labels, test_ds, test_labels)`` where the ``*_ds``
        items are ``datasets.Dataset`` objects holding the tokenizer output
        plus a ``"labels"`` column, and the ``*_labels`` items are the label
        lists of the corresponding split.
    """
    from sklearn.model_selection import train_test_split
    from transformers import AutoTokenizer
    from datasets import Dataset

    train_seq, test_seq, train_labels, test_labels = train_test_split(
        seq,
        labels,
        test_size=test_size,
        shuffle=True,
        random_state=random_state,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    def _to_dataset(split_seq, split_labels):
        # Tokenize one split and attach its labels as the "labels" column.
        split_ds = Dataset.from_dict(tokenizer(split_seq))
        return split_ds.add_column("labels", split_labels)

    train_ds = _to_dataset(train_seq, train_labels)
    test_ds = _to_dataset(test_seq, test_labels)
    return train_ds, train_labels, test_ds, test_labels
    

In [6]:
# Checkpoint name; reused later for the model, the tokenizer and the run name.
model_checkpoint="facebook/esm2_t12_35M_UR50D"
# Split and tokenize; the plain label lists are kept because a later cell
# derives the number of classes from them.
train_ds, train_labels, test_ds,test_labels = data_loader_prep (seq, labels, model_checkpoint)
# Display both datasets to check their features and row counts.
train_ds,test_ds 

Downloading (…)okenizer_config.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/93.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

(Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 209
 }), Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 70
 }))

In [14]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer
# Number of classes, derived from the largest label id found in either split.
num_labels = max(train_labels + test_labels) + 1  # Add 1 since 0 can be a label
# Load the checkpoint as a sequence-classification model with num_labels outputs.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
# Tokenizer for the same checkpoint; it is handed to the Trainer further down.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at facebook/esm2_t12_35M_UR50D were not used when initializing EsmForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing EsmForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing EsmForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t12_35M_UR50D and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight'

In [10]:
# Keep only the part after the last "/" (drops the "facebook/" prefix) for use in the run name.
model_name=model_checkpoint.split('/')[-1]

In [11]:
# Per-device batch size, used for both training and evaluation below.
batch_size = 8

# Training configuration consumed by the Trainer.
args = TrainingArguments(
    # Run name built from the checkpoint name plus a fixed suffix
    # (presumably also the output directory / Hub repo name - confirm against the transformers docs).
    f"{model_name}_Finetuned_localization_March_14_colab",
    # Evaluate and save once per epoch; the two strategies are set to the same value.
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    # Reload the best checkpoint at the end, ranked by the "accuracy" value
    # reported by compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Requires the Hub login performed earlier in the notebook.
    push_to_hub=True,
)

from evaluate import load
import numpy as np

# Accuracy metric object from the `evaluate` library; compute_metrics calls its compute().
metric = load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    ``eval_pred`` unpacks into a pair: the model's raw scores and the
    reference labels. The index of the highest score along axis 1 is taken
    as the predicted class, and the result of ``metric.compute`` is returned.
    """
    scores, references = eval_pred
    return metric.compute(
        predictions=np.argmax(scores, axis=1),
        references=references,
    )



Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [15]:
# Wire the model, training configuration, data splits, tokenizer and metric
# function together. The test split doubles as the evaluation set here, so
# the per-epoch accuracy is measured on the same data used for the final test.
trainer = Trainer(
    model,
    args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/Darren628/esm2_t12_35M_UR50D_Finetuned_localization_March_14_colab into local empty directory.


In [None]:
# Run fine-tuning with the settings in args (num_train_epochs=3).
trainer.train()

***** Running training *****
  Num examples = 209
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 81
  Number of trainable parameters = 33993843


Epoch,Training Loss,Validation Loss
