In [1]:
import torch
if torch.cuda.is_available():
    print("GPU is available!")
else:
    print("GPU not detected. Check your CUDA installation.")

GPU is available!


In [2]:
import os
import pandas as pd
import numpy as np
import bz2
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler, TrainingArguments, Trainer
from huggingface_hub.inference_api import InferenceApi
from datasets import load_dataset, Dataset

from torch.utils.data import DataLoader

import sqlite3 
from sklearn.model_selection import train_test_split



In [None]:
#hugging face access token: 
# API_TOKEN = ""
# os.environ["HF_TOKEN"] = ""

#huggingface-cli login ****Edit -> paste**** (not CTRL+V)


In [4]:
inference = InferenceApi(repo_id="bert-base-uncased", token=API_TOKEN)
# response = inference(inputs="The goal of life is [MASK].", raw_response =True)
# print(response.json())
# Load the Gemini tokenizer
tokenizer = AutoTokenizer.from_pretrained("describeai/gemini")



In [5]:
# import os
# print(os.path.getsize('IMDB_Movies_2021.db'))

In [6]:
# connection = sqlite3.connect('IMDB_Movies_2021.db') 
# tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", connection)
# print(tables)

# query = "SELECT * FROM REVIEWS"
# df = pd.read_sql_query(query, connection)

# connection.close()

In [7]:
#data loading block

df = pd.read_csv("movies.csv")

df = df.dropna()


df.head()
# df.to_csv('movies.csv', index = False)

# df.shape #5450, 5

df['labels'] =df['RATING'].astype(int) -1
# 

In [8]:
# type(df['RATING'][1])
# df['RATING'].isnull().sum()


In [9]:
# train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
# train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [10]:
def sample_data(df, num_samples, classes_to_keep):
    # Sample rows, selecting num_samples of each Label.                                      
    df = (
        df.groupby("Label")[df.columns]
        .apply(lambda x: x.sample(num_samples))
        .reset_index(drop=True)
    )
    return df

In [11]:
dataset = Dataset.from_pandas(df)

In [12]:
# Tokenization function
def preprocess_function(examples):
    return tokenizer(examples['REVIEW'], truncation=True, padding='max_length', max_length=512) #512 helps keep memory cost down for GPU

# Apply tokenization
tokenized_dataset = dataset.map(preprocess_function, batched=True)
# tokenized_dataset['label'][:1]


Map:   0%|          | 0/5332 [00:00<?, ? examples/s]

In [13]:
split_dataset = tokenized_dataset.train_test_split(test_size=0.2)

test_valid_split = split_dataset['test'].train_test_split(test_size=0.5)

tokenized_dataset_dict = {
    'train': split_dataset['train'],
    'validation': test_valid_split['train'],
    'test': test_valid_split['test']
}

# print(tokenized_dataset)

In [14]:
# # import torchvision
# # print("Version:", torchvision.__version__)
# # print("Location:", torchvision.__file__)

# import numpy as np

# labels_train = np.array([example["labels"] for example in tokenized_dataset_dict["train"]])
# print("Min label:", labels_train.min())
# print("Max label:", labels_train.max())


In [15]:
# Load the pre-trained Gemini model
model = AutoModelForSequenceClassification.from_pretrained("describeai/gemini", num_labels=10) #2 labels since only doing binary classification here, match to num labels for multi-class classification

model.config.problem_type = "single_label_classification"

# Customize the configuration
model.config.hidden_dropout_prob = 0.1  # Reduce overfitting
model.config.attention_probs_dropout_prob = 0.1

print(model.config)

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at describeai/gemini and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


T5Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "describeai/gemini",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": 0.0,
  "d_ff": 4096,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "hidden_dropout_prob": 0.1,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9"
  },
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9
  },
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions

In [16]:
print(model.classification_head)

T5ClassificationHead(
  (dense): Linear(in_features=1024, out_features=1024, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
  (out_proj): Linear(in_features=1024, out_features=10, bias=True)
)


In [17]:
for name, param in model.named_parameters():
    if "classification_head" in name:
        print(name, param.shape)


classification_head.dense.weight torch.Size([1024, 1024])
classification_head.dense.bias torch.Size([1024])
classification_head.out_proj.weight torch.Size([10, 1024])
classification_head.out_proj.bias torch.Size([10])


In [18]:
# Freeze all parameters first
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the classification head (if needed)
for name, param in model.classification_head.named_parameters():
    param.requires_grad = True

# Unfreeze the last 2 encoder blocks (example: if the encoder blocks are named "encoder.block.X")
# Adjust the key name pattern based on your model's naming convention
for name, param in model.named_parameters():
    if "encoder.block" in name:
        # Extract block number if available (this depends on the naming convention)
        block_number = int(name.split("encoder.block.")[1].split(".")[0])
        if block_number >= (model.config.num_layers - 2):  # unfreeze the last 2 blocks
            param.requires_grad = True

# Check which parameters will be updated
trainable_params = [name for name, param in model.named_parameters() if param.requires_grad]
print("Trainable parameters:", trainable_params)


Trainable parameters: ['transformer.encoder.block.22.layer.0.SelfAttention.q.weight', 'transformer.encoder.block.22.layer.0.SelfAttention.k.weight', 'transformer.encoder.block.22.layer.0.SelfAttention.v.weight', 'transformer.encoder.block.22.layer.0.SelfAttention.o.weight', 'transformer.encoder.block.22.layer.0.layer_norm.weight', 'transformer.encoder.block.22.layer.1.DenseReluDense.wi.weight', 'transformer.encoder.block.22.layer.1.DenseReluDense.wo.weight', 'transformer.encoder.block.22.layer.1.layer_norm.weight', 'transformer.encoder.block.23.layer.0.SelfAttention.q.weight', 'transformer.encoder.block.23.layer.0.SelfAttention.k.weight', 'transformer.encoder.block.23.layer.0.SelfAttention.v.weight', 'transformer.encoder.block.23.layer.0.SelfAttention.o.weight', 'transformer.encoder.block.23.layer.0.layer_norm.weight', 'transformer.encoder.block.23.layer.1.DenseReluDense.wi.weight', 'transformer.encoder.block.23.layer.1.DenseReluDense.wo.weight', 'transformer.encoder.block.23.layer.1.l

In [19]:


# Optimizer
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)

train_dataloader = DataLoader(split_dataset['train'], shuffle=True, batch_size=16)
num_epochs = 3

# Learning rate scheduler
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=500, num_training_steps=num_training_steps
)

print(f"Total training steps: {num_training_steps}")

Total training steps: 801




In [20]:
# # split_dataset = tokenized_dataset.train_test_split(test_size=0.2)

# # test_valid_split = split_dataset['test'].train_test_split(test_size=0.5)

# # tokenized_dataset_dict = {
# #     'train': split_dataset['train'],
# #     'validation': test_valid_split['train'],
# #     'test': test_valid_split['test']
# # }

# # def convert_labels(example):
# #     # Adjust this mapping based on your actual label names
# #     mapping = {"__label__2": 0, "__label__1": 1}
# #     example["label"] = mapping[example["label"]]
# #     return example

# # tokenized_dataset_dict["train"] = tokenized_dataset_dict["train"].map(convert_labels)
# # tokenized_dataset_dict["validation"] = tokenized_dataset_dict["validation"].map(convert_labels)
# # tokenized_dataset_dict["test"] = tokenized_dataset_dict["test"].map(convert_labels)

# tokenized_dataset_dict["train"] = tokenized_dataset_dict["train"].rename_column("RATING", "labels")
# tokenized_dataset_dict["validation"] = tokenized_dataset_dict["validation"].rename_column("RATING", "labels")
# tokenized_dataset_dict["test"] = tokenized_dataset_dict["test"].rename_column("RATING", "labels")



In [21]:
# missing_labels = tokenized_dataset_dict["train"].filter(lambda x: x["labels"] is None)
# print("Examples with missing labels:", missing_labels)


In [22]:
# df_train = tokenized_dataset_dict["train"].to_pandas()
# df_train['labels'].isnull().sum()

In [23]:
# tokenized_dataset_dict["train"].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
# tokenized_dataset_dict["validation"].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
# tokenized_dataset_dict["test"].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [24]:
# from torch.utils.data import DataLoader
# train_dataloader = DataLoader(tokenized_dataset_dict["train"], batch_size=2)
# for batch in train_dataloader:
#     print(batch.keys())
#     break


In [25]:
# import os
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"


In [26]:
# # Define training arguments
# training_args = TrainingArguments(
#     output_dir="./results",              # Save directory
#     eval_strategy="epoch",        # Evaluate after every epoch
#     learning_rate=2e-5,                 # Initial learning rate
#     per_device_train_batch_size=4,     # Batch size per device
#     num_train_epochs=3,                 # Number of epochs
#     weight_decay=0.01,                  # Regularization
#     logging_dir="./logs",               # Log directory
#     save_total_limit=2,                 # Save only the last 2 checkpoints
# )

# # Initialize the Trainer
# trainer = Trainer(
#     model=model,                         # Your model
#     args=training_args,                  # Training arguments
#     train_dataset=tokenized_dataset_dict['train'],  # Training data
#     eval_dataset=tokenized_dataset_dict['validation'],  # Validation data
# )

# # Start fine-tuning
# trainer.train()

In [27]:
torch.cuda.empty_cache()
print("Memory allocated on current device:", torch.cuda.memory_allocated())
print("Memory reserved on current device:", torch.cuda.memory_reserved())

Memory allocated on current device: 0
Memory reserved on current device: 0


In [28]:
# import os
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True, max_split_size_mb:128"

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,         # smaller batch size
    gradient_accumulation_steps=2,           # accumulate gradients to simulate a batch size of 4
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps = 10,
    save_total_limit=2,
    fp16=False,
    dataloader_num_workers=4                             # enable mixed precision training
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_dict['train'],
    eval_dataset=tokenized_dataset_dict['validation'],
)

trainer.train()


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
import numpy as np

labels_train = np.array([example["labels"] for example in tokenized_dataset_dict["train"]])
print("Min label:", labels_train.min())
print("Max label:", labels_train.max())
