# Movie Genre Prediction

In [1]:
# Loading librarys
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder

## Loading the Data

In [2]:
df=pd.read_csv('train_data.txt', sep=' ::: ', header=None)
df.head()

  df=pd.read_csv('train_data.txt', sep=' ::: ', header=None)


Unnamed: 0,0,1,2,3
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


In [3]:
# Renaming the columns
df = df.drop(columns=[0])
df = df.rename(columns={1: "Title", 2: "Genre", 3: "Description"})
df.head()

Unnamed: 0,Title,Genre,Description
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


In [4]:
# Create a new dataframe with two columns
df1 = df[['Genre', 'Title', 'Description']].copy()

# Remove missing values (NaN)
df1 = df1[pd.notnull(df1['Description'])]
df1.shape

(54214, 3)

In [5]:
# make two smaller datasets for train and test
df2 = df1.sample(2000, random_state=10).copy()
df1 = df1.sample(10000, random_state=1).copy()

In [9]:
df2.head()

Unnamed: 0,Genre,Title,Description
50161,western,Great Day in the Morning (1956),After a card game Southerner Owen Pentecost fi...
31800,documentary,Inside the Kill Box: Fighting the Gulf War (2001),Drawn from 200 hours of interviews with over 6...
37562,drama,San Fran (????),"Levi, a struggling unemployed divorcee, reluct..."
47506,thriller,Black Butterfly (2017/II),Outside a mountain town grappling with a serie...
11609,comedy,Bollywood Beats (2009),"Raj, a lovable Indian guy, is down on his luck..."


In [15]:
#check for same data in train and test
idx = df1.index.intersection(df2.index)
print(len(idx))

356


Train and Test sets overlap by 356 entries, so the evaluation is not perfect

## Create Model
using pretrained BERT from Huggingface and finetune it

In [None]:
label_encoder = LabelEncoder()

# Encoding, Genre to numbers
y_train_encoded = label_encoder.fit_transform(df1['Genre'])
print(len(label_encoder.classes_))
y_test_encoded = label_encoder.fit_transform(df2['Genre'])
print(len(label_encoder.classes_))

# Model selection, use BERT
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
embedding_model = BertModel.from_pretrained(model_name)

# enable GPU use if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_model.to(device)

# Tokenizer & embeddings
max_length = 12
concatenated_text = df1['Description'] + " " + df1['Title']
concatenated_text2 = df2['Description'] + " " + df2['Title']
encoded_inputs = tokenizer(list(concatenated_text), padding='max_length', truncation=True, max_length=max_length, return_attention_mask=True)
encoded_tests = tokenizer(list(concatenated_text2), padding='max_length', truncation=True, max_length=max_length, return_attention_mask=True)

# Create DataLoader for train dataset 
batch_size= 16
train_dataset = TensorDataset(torch.tensor(encoded_inputs['input_ids']), torch.tensor(encoded_inputs['attention_mask']), torch.tensor(y_train_encoded))
test_dataset = TensorDataset(torch.tensor(encoded_inputs['input_ids']), torch.tensor(encoded_inputs['attention_mask']), torch.tensor(y_train_encoded))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

# NETWORK definition,,
class CustomClassifier(nn.Module):
    def __init__(self, embedding_model, num_classes):
        super(CustomClassifier, self).__init__()
        self.embedding_model = embedding_model
        self.fc = nn.Linear(embedding_model.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        embeddings = self.embedding_model(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0]
        logits = self.fc(embeddings)
        return logits

# Create the new model and move to GPU
num_classes = len(label_encoder.classes_)
model = CustomClassifier(embedding_model, num_classes)
model.to(device)

# Optimizer & loss
optimizer = optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = nn.CrossEntropyLoss()


## Training and Evaluation

Model is trained for 4 and 20 epochs, to see difference

In [None]:
# Training and Evaluation
epochs = 20
model.train()
for epoch in range(epochs):
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/20", leave=False)
    total_correct = 0
    total_samples = 0
    
    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = [item.type(torch.LongTensor).to(device) for item in batch]
        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        
        #  accuracy 
        _, predicted = torch.max(logits, 1)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)
        accuracy = total_correct / total_samples
        
        progress_bar.set_postfix({"loss": loss.item(), "accuracy": accuracy})

    # Accuracy in epochs during training
    print(f'Epoch {epoch + 1} - Accuracy: {accuracy:.4f}')


# Evaluation of model
model.eval()
total_correct = 0
total_samples = 0
with torch.no_grad():
    progress_bar = tqdm(test_loader, desc="Evaluating", leave=False)
    for batch in progress_bar:
        input_ids, attention_mask, labels = [item.type(torch.LongTensor).to(device) for item in batch]
        logits = model(input_ids, attention_mask)
        _, predicted = torch.max(logits, 1)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)
        progress_bar.set_postfix({"accuracy": total_correct / total_samples})

accuracy = total_correct / total_samples
print(f'Final Accuracy: {accuracy:.4f}') # Accuracy for Test

# Model trained for 4 Epochs
Epoch 2 - Accuracy: 0.4755
                                                                                         
Epoch 3 - Accuracy: 0.5416
                                                                                         
Epoch 4 - Accuracy: 0.6057
                                                                             
Final Accuracy: 0.7207

# Model trained for 20 Epochs
Epoch 2 - Accuracy: 0.4973
                                                                                         
Epoch 3 - Accuracy: 0.5755
                                                                                         
Epoch 4 - Accuracy: 0.6635
                                                                                         
Epoch 5 - Accuracy: 0.7525
                                                                                         
Epoch 6 - Accuracy: 0.8268
                                                                                          
Epoch 7 - Accuracy: 0.8775
                                                                                          
Epoch 8 - Accuracy: 0.9123
                                                                                          
Epoch 9 - Accuracy: 0.9371
                                                                                           
Epoch 10 - Accuracy: 0.9587
                                                                                           
Epoch 11 - Accuracy: 0.9688
                                                                                            
Epoch 12 - Accuracy: 0.9762
                                                                                            
Epoch 13 - Accuracy: 0.9841
                                                                                            
Epoch 14 - Accuracy: 0.9856
                                                                                            
Epoch 15 - Accuracy: 0.9885
                                                                                            
Epoch 16 - Accuracy: 0.9915
                                                                                            
Epoch 17 - Accuracy: 0.9921
                                                                                            
Epoch 18 - Accuracy: 0.9919
                                                                                            
Epoch 19 - Accuracy: 0.9917
                                                                                            
Epoch 20 - Accuracy: 0.9956
                                                                             
Final Accuracy: 0.9985

With a pretrainded model the training don't have to be that long, to achieve very good results