In [1]:
import torch
# Report up front whether PyTorch can see a CUDA device.
print(
    "GPU is available!"
    if torch.cuda.is_available()
    else "GPU not detected. Check your CUDA installation."
)

GPU is available!


In [2]:
import os
import pandas as pd
import numpy as np
import bz2
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler, TrainingArguments, Trainer, EarlyStoppingCallback, TrainerCallback
from huggingface_hub.inference_api import InferenceApi
from datasets import load_dataset, Dataset

from torch.utils.data import DataLoader

import sqlite3 
from sklearn.model_selection import train_test_split
import optuna


from customhead import CustomClassificationHead
import tensorboardX
import gc


In [None]:
#hugging face access token: 
# API_TOKEN = ""
# os.environ["HF_TOKEN"] = ""

# Authenticate with `huggingface-cli login`; paste the token via Edit -> Paste (CTRL+V does not work).



In [4]:
# inference = InferenceApi(repo_id="bert-base-uncased", token=API_TOKEN)
# response = inference(inputs="The goal of life is [MASK].", raw_response =True)
# print(response.json())
# Fetch the tokenizer published with the "describeai/gemini" checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "describeai/gemini",
)

In [None]:
# import os
# print(os.path.getsize('IMDB_Movies_2021.db'))

In [None]:
# connection = sqlite3.connect('IMDB_Movies_2021.db') 
# tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", connection)
# print(tables)

# query = "SELECT * FROM REVIEWS"
# df = pd.read_sql_query(query, connection)

# connection.close()

In [None]:
# Data loading block: read the reviews and derive zero-based class labels.

df = pd.read_csv("movies.csv")

# Drop every row that has a missing value in any column.
df = df.dropna()

# df.to_csv('movies.csv', index = False)

# df.shape #5450, 5

# Class ids must start at 0, so shift RATING down by one.
df['labels'] = df['RATING'].astype(int) - 1

# The model in this notebook is created with num_labels=10, so only labels
# 0..9 are representable. Fail here with a readable message instead of a
# cryptic error deep inside the loss computation.
assert df['labels'].between(0, 9).all(), (
    "labels must lie in [0, 9], got range "
    f"[{df['labels'].min()}, {df['labels'].max()}]"
)

# Preview the prepared frame; as the last expression of the cell it is rendered.
df.head()
# 

In [None]:
# type(df['RATING'][1])
# df['RATING'].isnull().sum()

# df['labels'].value_counts()

In [None]:
# train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
# train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [None]:
# def sample_data(df, num_samples, classes_to_keep):
#     # Sample rows, selecting num_samples of each Label.                                      
#     df = (
#         df.groupby("Label")[df.columns]
#         .apply(lambda x: x.sample(num_samples))
#         .reset_index(drop=True)
#     )
#     return df

In [None]:
# dropna() can leave gaps in the DataFrame index; a non-default index would be
# exported as an extra "__index_level_0__" column, so drop it explicitly.
dataset = Dataset.from_pandas(df, preserve_index=False)

In [None]:
def preprocess_function(examples):
    """Tokenize the 'REVIEW' field of a batch with the notebook-level tokenizer.

    The tokenizer is asked to truncate and to pad with the 'max_length'
    strategy at max_length=512; capping at 512 keeps GPU memory cost down.
    Returns whatever the tokenizer returns for the batch.
    """
    review_texts = examples['REVIEW']
    tokenizer_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 512,
    }
    return tokenizer(review_texts, **tokenizer_options)

# Run the tokenizer over the whole dataset, passing rows to
# preprocess_function in batches.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
)
# tokenized_dataset['label'][:1]


In [None]:
# Hold out 20% of the rows, then halve the hold-out into validation and test.
# A fixed seed makes the three splits identical after every kernel restart, so
# results are comparable between runs and test rows never drift into training.
SPLIT_SEED = 42

split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

test_valid_split = split_dataset['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)

tokenized_dataset_dict = {
    'train': split_dataset['train'],
    'validation': test_valid_split['train'],
    'test': test_valid_split['test']
}

# print(tokenized_dataset)

In [None]:
# # import torchvision
# # print("Version:", torchvision.__version__)
# # print("Location:", torchvision.__file__)

# import numpy as np

# labels_train = np.array([example["labels"] for example in tokenized_dataset_dict["train"]])
# print("Min label:", labels_train.min())
# print("Max label:", labels_train.max())


In [None]:


class EpochAccuracyLoggerCallback(TrainerCallback):
    """Append evaluation accuracy and loss to ``epoch_metrics.txt``.

    The Trainer hands the metrics dictionary to ``on_evaluate``; the
    ``on_epoch_end`` event is fired without metrics. Logging therefore happens
    in ``on_evaluate``. ``on_epoch_end`` is kept for callers that pass
    ``metrics`` explicitly and writes nothing when none are given, so the file
    no longer fills up with "Accuracy = None, Loss = None" lines.
    """

    # File the metric lines are appended to (relative to the working directory).
    LOG_PATH = "epoch_metrics.txt"

    def _append_metrics(self, state, metrics):
        """Write one line with the current epoch, eval accuracy and eval loss."""
        # state.epoch may be fractional, and may be None before training starts.
        epoch = state.epoch
        epoch_label = f"{epoch:.2f}" if epoch is not None else "n/a"
        eval_accuracy = metrics.get("eval_accuracy", None)
        eval_loss = metrics.get("eval_loss", None)

        # Trial information is not available here; when running a
        # hyperparameter search, lines of consecutive trials are simply appended.
        log_line = f"Epoch {epoch_label}: Accuracy = {eval_accuracy}, Loss = {eval_loss}\n"
        with open(self.LOG_PATH, "a") as f:
            f.write(log_line)

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Log the metrics produced by an evaluation pass; returns ``control``."""
        if metrics:
            self._append_metrics(state, metrics)
        return control

    def on_epoch_end(self, args, state, control, metrics=None, **kwargs):
        """Log only when ``metrics`` is passed explicitly; returns ``control``."""
        if metrics:
            self._append_metrics(state, metrics)
        return control


In [None]:
# Exploratory cell: load the checkpoint once with fixed settings and freeze the
# leading encoder blocks. The hyperparameter search builds its own models via
# model_init; this cell only inspects the architecture.
print("Memory allocated:", torch.cuda.memory_allocated())
print("Memory reserved:", torch.cuda.memory_reserved())


# Suggest hyperparameters
encoder_model = "describeai/gemini"
# pct_finetune = trial.suggest_float("pct_finetune", 0.0, 0.01, 0.1, 0.5 1.0)

# There is no Optuna trial in this cell, so use fixed default values
pct_finetune = 0.1
hidden_size = 512
num_hidden_layers = 2
# pct_finetune = trial.suggest_categorical("pct_finetune", [0.0, 0.01, 0.1, 0.5, 1.0])
#


# Load the base model
model = AutoModelForSequenceClassification.from_pretrained(encoder_model, num_labels=10)

# Set the problem type (if not already set)
model.config.problem_type = "single_label_classification"

# Customize dropout if desired
# NOTE(review): these config fields are assigned after the model has been
# built; confirm they actually influence the dropout used by this architecture.
model.config.hidden_dropout_prob = 0.1
model.config.attention_probs_dropout_prob = 0.1

model.gradient_checkpointing_enable()

# Locate the encoder. It may be a direct attribute, or nested inside a backbone
# attribute of a classification wrapper (looked up under `.transformer` here —
# TODO confirm the attribute name for this checkpoint).
encoder = getattr(model, "encoder", None)
if encoder is None:
    encoder = getattr(getattr(model, "transformer", None), "encoder", None)

if encoder is not None:
    print(encoder)


# Freeze a percentage of the encoder layers: the first
# int(total * (1 - pct_finetune)) blocks stop receiving gradient updates.
encoder_blocks = getattr(encoder, "block", None)
if encoder_blocks is None:
    # Report explicitly instead of silently skipping the freeze.
    print("Model does not have an encoder.block attribute")
else:
    total_layers = len(encoder_blocks)
    num_layers_to_freeze = int(total_layers * (1 - pct_finetune))
    for i, layer in enumerate(encoder_blocks):
        if i >= num_layers_to_freeze:
            break
        for param in layer.parameters():
            param.requires_grad = False

In [None]:
NUM_LABELS = 10

def model_init(trial=None):
    """Build a fresh classification model for one training run.

    Passed to the Trainer as ``model_init`` so that every hyperparameter
    trial starts from a newly loaded checkpoint.

    Args:
        trial: Optuna trial used to sample ``pct_finetune``, ``hidden_size``
            and ``num_hidden_layers``. When ``None``, the defaults 0.1, 512
            and 2 are used.

    Returns:
        The loaded model with its leading non-head parameters frozen and
        ``classification_head`` replaced by a ``CustomClassificationHead``.
    """
    # Release memory left over from a previous trial before loading again.
    gc.collect()
    torch.cuda.empty_cache()
    print("Before model init:")
    print("Memory allocated:", torch.cuda.memory_allocated())
    print("Memory reserved:", torch.cuda.memory_reserved())

    encoder_model = "describeai/gemini"

    # If trial is None, use default hyperparameter values
    if trial is None:
        pct_finetune = 0.1
        hidden_size = 512
        num_hidden_layers = 2
    else:
        pct_finetune = trial.suggest_categorical("pct_finetune", [0.0, 0.1, 0.5, 1.0])
        hidden_size = trial.suggest_categorical("hidden_size", [256, 512, 1024])
        num_hidden_layers = trial.suggest_int("num_hidden_layers", 1, 3)

    # Load the base model
    model = AutoModelForSequenceClassification.from_pretrained(encoder_model, num_labels=NUM_LABELS)

    # Set the problem type (if not already set)
    model.config.problem_type = "single_label_classification"

    # Customize dropout if desired
    # NOTE(review): these config fields are assigned after the model has been
    # built; confirm they actually influence the dropout used by this architecture.
    model.config.hidden_dropout_prob = 0.1
    model.config.attention_probs_dropout_prob = 0.1

    model.gradient_checkpointing_enable()

    if hasattr(model, "encoder"):
        print(model.encoder)

    # Freeze a share of the backbone. Parameters are taken in named_parameters()
    # order, skipping the classification head; the first (1 - pct_finetune)
    # fraction of those tensors is frozen. For example pct_finetune=0.1 freezes
    # 90% of the non-head parameter tensors.
    non_head_params = [
        (name, param)
        for name, param in model.named_parameters()
        if "classification_head" not in name
    ]
    num_total = len(non_head_params)
    num_to_freeze = int(num_total * (1 - pct_finetune))

    print(f"Total non-head parameters: {num_total}")
    print(f"Freezing {num_to_freeze} parameters out of {num_total}")

    for _, param in non_head_params[:num_to_freeze]:
        param.requires_grad = False

    # With gradient checkpointing, a frozen input embedding can leave the
    # checkpointed blocks without any grad-requiring input, so the still
    # trainable backbone layers would receive no gradients. Making the embedding
    # output require grad keeps the graph connected. Skipped when nothing or
    # everything in the backbone is frozen, where it is not needed.
    if 0 < num_to_freeze < num_total:
        try:
            model.enable_input_require_grads()
        except NotImplementedError:
            print("Warning: model exposes no input embeddings; "
                  "unfrozen backbone layers may not receive gradients "
                  "with gradient checkpointing enabled.")

    # Replace the default classification head with the custom one, sized by the
    # sampled hyperparameters. It is created after the freezing step and is not
    # part of the frozen parameter list.
    model.classification_head = CustomClassificationHead(
        input_dim=model.config.d_model,
        num_hidden_layers=num_hidden_layers,
        hidden_size=hidden_size,
        num_labels=NUM_LABELS,
    )

    return model


####################################################################################################################################################################################################################

def logging_callback(study, trial):
    """Report a finished trial on stdout and in ``trial_log.txt``.

    Prints the trial number and value, appends one line with the trial's
    parameters and value to ``trial_log.txt``, empties the CUDA cache and
    prints the current CUDA memory counters. ``study`` is accepted but unused.
    """
    trial_id = trial.number
    print(f"Trial {trial_id} finished with value: {trial.value}")

    record = f"Trial {trial.number}: {trial.params}, Value: {trial.value}\n"
    with open("trial_log.txt", "a") as log_file:
        log_file.write(record)

    # Clear GPU cache after each trial
    torch.cuda.empty_cache()
    print(f"After trial {trial_id}:")
    print("Memory allocated:", torch.cuda.memory_allocated())
    print("Memory reserved:", torch.cuda.memory_reserved())

####################################################################################################################################################################################################################

# Define training arguments
# NOTE(review): other (commented-out) cells in this notebook spell the first
# strategy argument `eval_strategy`; confirm which name the installed
# transformers version expects.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    fp16=False,
    dataloader_num_workers=3,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # Monitor "accuracy" for early stopping
    greater_is_better=True
)


def compute_metrics(eval_pred):
    """Return ``{"accuracy": ...}`` for one evaluation pass.

    ``eval_pred.predictions`` is either the logits array or, when the model
    returns additional outputs, a tuple whose first element is assumed to
    hold the logits. The evaluation loss is not computed here; the Trainer
    reports it itself.
    """
    logits = eval_pred.predictions
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    predicted_labels = np.argmax(logits, axis=-1)
    return {"accuracy": float((predicted_labels == eval_pred.label_ids).mean())}


trainer = Trainer(
    model_init=model_init,  # This function will be used to initialize a new model for each trial
    args=training_args,
    train_dataset=tokenized_dataset_dict["train"],
    eval_dataset=tokenized_dataset_dict["validation"],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2), EpochAccuracyLoggerCallback()]
)

# Run hyperparameter search using Optuna
best_trial = trainer.hyperparameter_search(
    backend="optuna",
    n_trials=10,
    direction="maximize"
)

print("Best trial:", best_trial)

In [None]:
# # Load the pre-trained Gemini model
# model = AutoModelForSequenceClassification.from_pretrained("describeai/gemini", num_labels=10) # num_labels must match the number of classes: 10 here (use 2 for binary classification)

# model.config.problem_type = "single_label_classification"

# # Customize the configuration
# model.config.hidden_dropout_prob = 0.1  # Reduce overfitting
# model.config.attention_probs_dropout_prob = 0.1

# print(model.config)

In [None]:
# print(model.classification_head)

In [None]:
# for name, param in model.named_parameters():
#     if "classification_head" in name:
#         print(name, param.shape)
# # 

In [None]:
# # Freeze all parameters first
# for param in model.parameters():
#     param.requires_grad = False

# # Unfreeze the classification head (if needed)
# for name, param in model.classification_head.named_parameters():
#     param.requires_grad = True

# # Unfreeze the last 2 encoder blocks (example: if the encoder blocks are named "encoder.block.X")
# # Adjust the key name pattern based on your model's naming convention
# for name, param in model.named_parameters():
#     if "encoder.block" in name:
#         # Extract block number if available (this depends on the naming convention)
#         block_number = int(name.split("encoder.block.")[1].split(".")[0])
#         if block_number >= (model.config.num_layers - 2):  # unfreeze the last 2 blocks
#             param.requires_grad = True

# # Check which parameters will be updated
# trainable_params = [name for name, param in model.named_parameters() if param.requires_grad]
# print("Trainable parameters:", trainable_params)


In [None]:


# # Optimizer
# optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)

# train_dataloader = DataLoader(split_dataset['train'], shuffle=True, batch_size=16)
# num_epochs = 3

# # Learning rate scheduler
# num_training_steps = len(train_dataloader) * num_epochs
# lr_scheduler = get_scheduler(
#     "linear", optimizer=optimizer, num_warmup_steps=500, num_training_steps=num_training_steps
# )

# print(f"Total training steps: {num_training_steps}")

In [None]:
# # split_dataset = tokenized_dataset.train_test_split(test_size=0.2)

# # test_valid_split = split_dataset['test'].train_test_split(test_size=0.5)

# # tokenized_dataset_dict = {
# #     'train': split_dataset['train'],
# #     'validation': test_valid_split['train'],
# #     'test': test_valid_split['test']
# # }

# # def convert_labels(example):
# #     # Adjust this mapping based on your actual label names
# #     mapping = {"__label__2": 0, "__label__1": 1}
# #     example["label"] = mapping[example["label"]]
# #     return example

# # tokenized_dataset_dict["train"] = tokenized_dataset_dict["train"].map(convert_labels)
# # tokenized_dataset_dict["validation"] = tokenized_dataset_dict["validation"].map(convert_labels)
# # tokenized_dataset_dict["test"] = tokenized_dataset_dict["test"].map(convert_labels)

# tokenized_dataset_dict["train"] = tokenized_dataset_dict["train"].rename_column("RATING", "labels")
# tokenized_dataset_dict["validation"] = tokenized_dataset_dict["validation"].rename_column("RATING", "labels")
# tokenized_dataset_dict["test"] = tokenized_dataset_dict["test"].rename_column("RATING", "labels")



In [None]:
# missing_labels = tokenized_dataset_dict["train"].filter(lambda x: x["labels"] is None)
# print("Examples with missing labels:", missing_labels)


In [None]:
# df_train = tokenized_dataset_dict["train"].to_pandas()
# df_train['labels'].isnull().sum()

In [None]:
# tokenized_dataset_dict["train"].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
# tokenized_dataset_dict["validation"].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
# tokenized_dataset_dict["test"].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
# from torch.utils.data import DataLoader
# train_dataloader = DataLoader(tokenized_dataset_dict["train"], batch_size=2)
# for batch in train_dataloader:
#     print(batch.keys())
#     break


In [None]:
# import os
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"


In [None]:
# # Define training arguments
# training_args = TrainingArguments(
#     output_dir="./results",              # Save directory
#     eval_strategy="epoch",        # Evaluate after every epoch
#     learning_rate=2e-5,                 # Initial learning rate
#     per_device_train_batch_size=4,     # Batch size per device
#     num_train_epochs=3,                 # Number of epochs
#     weight_decay=0.01,                  # Regularization
#     logging_dir="./logs",               # Log directory
#     save_total_limit=2,                 # Save only the last 2 checkpoints
# )

# # Initialize the Trainer
# trainer = Trainer(
#     model=model,                         # Your model
#     args=training_args,                  # Training arguments
#     train_dataset=tokenized_dataset_dict['train'],  # Training data
#     eval_dataset=tokenized_dataset_dict['validation'],  # Validation data
# )

# # Start fine-tuning
# trainer.train()

In [None]:
# torch.cuda.empty_cache()
# print("Memory allocated on current device:", torch.cuda.memory_allocated())
# print("Memory reserved on current device:", torch.cuda.memory_reserved())

In [None]:
# # import os
# # os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True, max_split_size_mb:128"

# from transformers import TrainingArguments, Trainer

# training_args = TrainingArguments(
#     output_dir="./results",
#     eval_strategy="epoch",
#     learning_rate=1e-5,
#     per_device_train_batch_size=8,         # smaller batch size
#     gradient_accumulation_steps=2,           # accumulate gradients over 2 steps: effective batch size of 16 per device (8 x 2)
#     num_train_epochs=5,
#     weight_decay=0.01,
#     logging_dir="./logs",
#     logging_steps = 10,
#     save_total_limit=2,
#     fp16=False,                              # mixed precision training disabled
#     dataloader_num_workers=4                 # worker processes for data loading
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset_dict['train'],
#     eval_dataset=tokenized_dataset_dict['validation'],
# )

# trainer.train()


In [None]:
# import numpy as np

# labels_train = np.array([example["labels"] for example in tokenized_dataset_dict["train"]])
# print("Min label:", labels_train.min())
# print("Max label:", labels_train.max())
