In [2]:
pip list

Package                      Version
---------------------------- ------------
absl-py                      2.1.0
accelerate                   0.30.1
aiohappyeyeballs             2.6.1
aiohttp                      3.13.3
aiosignal                    1.4.0
altair                       5.5.0
annotated-types              0.7.0
anyio                        4.6.2
argon2-cffi                  21.3.0
argon2-cffi-bindings         21.2.0
asttokens                    2.0.5
astunparse                   1.6.3
async-timeout                5.0.1
attrs                        24.3.0
backcall                     0.2.0
beautifulsoup4               4.12.3
bleach                       6.2.0
blinker                      1.9.0
Bottleneck                   1.4.2
Brotli                       1.0.9
cachetools                   5.5.1
catboost                     1.2.8
category-encoders            2.6.4
certifi                      2025.1.31
cffi                         1.17.1
charset-normalizer           3.4.1


In [1]:
import pandas as pd
import numpy as np
import torch

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)

  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]


In [2]:
print("CUDA Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


CUDA Available: True
GPU Name: NVIDIA GeForce RTX 2050


In [3]:
data = pd.read_csv(
    "movie_genre_train_data.txt",
    sep=":::",
    engine="python",
    names=["id", "title", "genre", "summary"]
)




In [4]:
max_per_class = 3000

df = (
    data.groupby("genre")
      .apply(lambda x: x.sample(min(len(x), max_per_class), random_state=42))
      .reset_index(drop=True)
)

print(df["genre"].value_counts())
print("New total samples:", len(df))

genre
documentary     3000
short           3000
drama           3000
comedy          3000
horror          2204
thriller        1591
action          1315
western         1032
reality-tv       884
family           784
adventure        775
music            731
romance          672
sci-fi           647
adult            590
crime            505
animation        498
sport            432
talk-show        391
fantasy          323
mystery          319
musical          277
biography        265
history          243
game-show        194
news             181
war              132
Name: count, dtype: int64
New total samples: 26985


  data.groupby("genre")


In [5]:
def merge_genres(genre):

    if genre in ["history", "war", "biography"]:
        return "historical"
    elif genre in ["game-show", "talk-show", "news"]:
        return "tv"
    elif genre in ["fantasy", "animation"]:
        return "fantasy"
    elif genre in ["musical", "music"]:
        return "music"
    elif genre in ["crime", "mystery"]:
        return "crime"
    else:
        return genre



In [6]:
df['genre']=df['genre'].str.replace(" ","")

In [7]:
df["genre"] = df["genre"].apply(merge_genres)

In [8]:

print("Total samples:", len(df))


Total samples: 26985


In [9]:
df.genre.value_counts()

genre
documentary    3000
short          3000
comedy         3000
drama          3000
horror         2204
thriller       1591
action         1315
western        1032
music          1008
reality-tv      884
crime           824
fantasy         821
family          784
adventure       775
tv              766
romance         672
sci-fi          647
historical      640
adult           590
sport           432
Name: count, dtype: int64

In [10]:
len(df)

26985

In [11]:
class DataPreprocessing(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()
        df["text"] = df["title"].fillna("") + " " + df["summary"].fillna("")
        df["text"] = df["text"].str.strip()
        return df["text"].tolist()


In [12]:
preprocessor = DataPreprocessing()
df["text"] = preprocessor.fit_transform(df)


In [13]:
label_encoder = LabelEncoder()


In [14]:
df["label"] = label_encoder.fit_transform(df["genre"])
num_labels = len(label_encoder.classes_)
print("Number of classes:", num_labels)


Number of classes: 20


In [15]:
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(df["label"]),
    y=df["label"]
)


In [16]:
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)


In [17]:
dataset = Dataset.from_dict({
    "text": df["text"].tolist(),
    "label": df["label"].tolist()
})

dataset = dataset.train_test_split(test_size=0.1, seed=42)


In [18]:
# import shutil
# import os

# cache_dir = os.path.expanduser("~/.cache/huggingface")
# shutil.rmtree(cache_dir, ignore_errors=True)

# print("Cache cleared")

In [19]:
# !pip install sentencepiece

In [20]:
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False
)

In [21]:
# !pip install \
# transformers==4.41.2 \
# huggingface_hub==0.23.2 \
# tokenizers==0.19.1 \
# datasets==2.19.1 \
# accelerate==0.30.1

In [23]:
def tokenize_function(example):
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=128
    )


In [24]:
dataset = dataset.map(tokenize_function, batched=True)
dataset = dataset.remove_columns(["text"])
dataset.set_format("torch")

data_collator = DataCollatorWithPadding(tokenizer)


Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 24286/24286 [00:15<00:00, 1561.88 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2699/2699 [00:01<00:00, 1447.31 examples/s]


In [25]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels
)

model.to(device)


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine

In [26]:

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs["labels"]
        outputs = model(**inputs)
        logits = outputs["logits"]

        loss_fct = torch.nn.CrossEntropyLoss(
            weight=class_weights
        )
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss



In [27]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "f1_weighted": f1_score(labels, preds, average="weighted"),
    }


In [28]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,   
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_ratio=0.1,
    fp16=True,                      
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    logging_steps=50,
    report_to=[]
)





In [29]:

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [30]:
torch.cuda.empty_cache()

In [31]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
0,1.5925,1.536625,0.525009,0.486155,0.508719
1,1.2444,1.388453,0.580956,0.555011,0.5807
2,1.0043,1.3832,0.585402,0.553894,0.58231


TrainOutput(global_step=4551, training_loss=1.4884355951471397, metrics={'train_runtime': 6716.4808, 'train_samples_per_second': 10.848, 'train_steps_per_second': 0.678, 'total_flos': 4306187702135088.0, 'train_loss': 1.4884355951471397, 'epoch': 2.9982706085810755})

In [66]:

trainer.save_model("deberta_movie_genre_model_gpu_v1")
tokenizer.save_pretrained("deberta_movie_genre_model_gpu_v1")

print("ðŸš€ GPU Training Completed Successfully!")

ðŸš€ GPU Training Completed Successfully!


In [70]:
import joblib

In [72]:
joblib.dump(label_encoder,'label_encoder_movie_genre_model_gpu_v1.pkl')

['label_encoder_movie_genre_model_gpu_v1.pkl']

In [68]:
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 1.3884531259536743, 'eval_accuracy': 0.5809559095961467, 'eval_f1_macro': 0.5550109319001181, 'eval_f1_weighted': 0.5807000930423225, 'eval_runtime': 59.777, 'eval_samples_per_second': 45.151, 'eval_steps_per_second': 22.584, 'epoch': 2.9982706085810755}


In [74]:
test=pd.read_csv('movie_genre_test_data.txt',sep=':::',engine='python',names=['id','title','genre','summary'])

In [84]:
test.sample(2)

Unnamed: 0,id,title,genre,summary
7722,7723,Aranyer Din Ratri (1970),drama,A group of four middle class workers in India...
3189,3190,Giorni e nuvole (2007),drama,Manager Michele lost his job but didn't tell ...


In [86]:
x_test=test.iloc[0:13]
y_test=test['genre'].iloc[0:13]

In [90]:
x_test.drop(['title','genre'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test.drop(['title','genre'],axis=1,inplace=True)


In [92]:
y_test

0         thriller 
1           comedy 
2      documentary 
3            drama 
4            drama 
5           horror 
6            drama 
7           comedy 
8      documentary 
9            drama 
10           drama 
11           drama 
12           drama 
Name: genre, dtype: object

In [78]:
from datasets import Dataset

text = "A detective investigates a mysterious murder."

predict_dataset = Dataset.from_dict({
    "text": [text]
})

In [80]:
def tokenize_function(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=256
    )

predict_dataset = predict_dataset.map(tokenize_function, batched=True)
predict_dataset = predict_dataset.remove_columns(["text"])
predict_dataset.set_format("torch")

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00,  5.88 examples/s]


In [81]:
predictions = trainer.predict(predict_dataset)

import numpy as np

pred_class = np.argmax(predictions.predictions, axis=1)[0]

print("Predicted class id:", pred_class)

Predicted class id: 4


In [96]:
import torch
import numpy as np

model.eval()

def predict_genre(summary_text):
    inputs = tokenizer(
        summary_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=256
    )

    with torch.no_grad():
        outputs = model(**inputs)

    predicted_class_id = torch.argmax(outputs.logits, dim=1).item()
    
    return label_classes[predicted_class_id]

In [None]:
def predict_genre(summary_text):
    inputs = tokenizer(
        summary_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=256
    )

    with torch.no_grad():
        outputs = trainer(**inputs)

    predicted_class_id = torch.argmax(outputs.logits, dim=1).item()
    
    return label_classes[predicted_class_id]