In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the IMDb export and keep only the label column and the text column
df = pd.read_csv("IMDb movies.csv").loc[:, ['genre', 'description']]
df.head()

Unnamed: 0,genre,description
0,Romance,The adventures of a female reporter in the 1890s.
1,"Biography, Crime, Drama",True story of notorious Australian outlaw Ned ...
2,Drama,Two men of high rank are both wooing the beaut...
3,"Drama, History",The fabled queen of Egypt's affair with Roman ...
4,"Adventure, Drama, Fantasy",Loosely adapted from Dante's Divine Comedy and...


In [4]:
# Report, per column, whether any value is missing (nothing is removed here)
df.isnull().any()

genre          False
description     True
dtype: bool

In [5]:
# Discard rows with any missing value, then renumber the remaining rows from 0
df = df.dropna().reset_index(drop=True)

In [6]:
# Turn the comma-separated genre string into a list of genre names.
# Spaces are stripped first so "Drama, History" becomes ["Drama", "History"].
# Vectorised on purpose: element-wise `df["genre"][i] = ...` is chained
# indexing, which may write to a temporary copy instead of `df`.
df["genre"] = df["genre"].str.replace(" ", "", regex=False).str.split(",")

In [7]:
# Simplify the multi-label problem into a multi-class one:
# count the genres attached to each movie and keep only single-genre movies.
# `.map(len)` replaces the element-wise chained assignment, which may write
# to a temporary copy instead of `df`.
df['nGenre'] = df['genre'].map(len)

df = df[df['nGenre'] == 1]

In [8]:
# Drop the helper count column, collapse each one-element genre list back
# into a plain string, and renumber the rows from 0
df = (
    df[['genre', 'description']]
    .assign(genre=lambda frame: frame['genre'].map(''.join))
    .reset_index(drop=True)
)

In [9]:
# Show the seven most frequent genres together with their row counts
df['genre'].value_counts().head(7)

Drama       12105
Comedy       7146
Horror       2241
Thriller     1217
Action        699
Western       588
Romance       415
Name: genre, dtype: int64

In [10]:
# Names of the seven most frequent genres, most frequent first
df['genre'].value_counts().index[:7]

Index(['Drama', 'Comedy', 'Horror', 'Thriller', 'Action', 'Western',
       'Romance'],
      dtype='object')

In [11]:
# Build one capped, re-indexed frame per top-7 genre and publish it as a
# module-level variable named df_<first three letters, lower-cased>
# (df_dra, df_com, ...); the concatenation cell reads these names directly.
top_genres = df['genre'].value_counts().index[:7]
for genre in top_genres:
    genre_rows = df.loc[df['genre'] == genre].reset_index(drop=True)
    # Keep at most the first 400 rows of each genre
    globals()['df_' + genre[:3].lower()] = genre_rows.iloc[:400]

In [12]:
# Stack the seven per-genre frames (each capped at 400 rows) into one frame
genre_frames = (df_dra, df_com, df_hor, df_thr, df_act, df_wes, df_rom)
data = pd.concat(genre_frames, axis=0)
data

Unnamed: 0,genre,description
0,Drama,Two men of high rank are both wooing the beaut...
1,Drama,Richard of Gloucester uses manipulation and mu...
2,Drama,After Dr. Friedrich's wife becomes mentally un...
3,Drama,Single mother is separated from her children d...
4,Drama,"Leslie Swayne, an adventurer, in order to obta..."
...,...,...
395,Romance,"Sato is 27 years old, lives in the northern pr..."
396,Romance,"A family entertainer, the story of Ammammagari..."
397,Romance,"Tej, a youngster who's highly attached to his ..."
398,Romance,The film is a rom-com which explores the life ...


In [13]:
# Lower-case every description and trim surrounding whitespace, then renumber
# the rows from 0 (the concatenated frame repeats each per-genre index)
data['description'] = data['description'].map(lambda text: str(text).lower().strip())
data = data.reset_index(drop=True)

In [14]:
# Keep only ASCII letters and spaces; digits, punctuation, control characters
# and every non-ASCII character are removed.
# A whitelist regex is used instead of a hand-built blacklist of code points:
# the blacklist skipped '[' (code point 91) and anything above U+00FF, and it
# wrote results back through chained indexing one character at a time.
data["description"] = (
    data["description"].astype(str).str.replace(r"[^A-Za-z ]", "", regex=True)
)

In [15]:
# Shared WordNet lemmatizer instance used by `lemmatizer` below
wordnet_lemmatizer = WordNetLemmatizer()


def lemmatizer(data):
    """Lemmatize every space-separated token of a text and re-join them.

    Each token goes through five successive passes: the lemmatizer's default
    part of speech, then verb ('v'), noun ('n'), adjective ('a') and
    adverb ('r'). The output of one pass is the input of the next.

    Parameters
    ----------
    data : object
        The text to process; it is converted with ``str()`` first.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    # NOTE(review): chaining passes can over-reduce a token; the displayed
    # output shows "uses" ending up as "u". Confirm this is acceptable.
    tokens = str(data).split(" ")
    tokens = [wordnet_lemmatizer.lemmatize(token) for token in tokens]
    for pos_tag in ('v', 'n', 'a', 'r'):
        tokens = [wordnet_lemmatizer.lemmatize(token, pos=pos_tag) for token in tokens]
    return ' '.join(tokens)


# Lemmatize every cleaned description
data['description'] = data['description'].map(lemmatizer)

In [16]:
# Display the frame after cleaning and lemmatization
data

Unnamed: 0,genre,description
0,Drama,two men of high rank be both woo the beautiful...
1,Drama,richard of gloucester u manipulation and murde...
2,Drama,after dr friedrichs wife become mentally unsta...
3,Drama,single mother be separate from her child due t...
4,Drama,leslie swayne an adventurer in order to obtain...
...,...,...
2795,Romance,sato be year old life in the northern prefect...
2796,Romance,a family entertainer the story of ammammagaril...
2797,Romance,tej a youngster who highly attach to his famil...
2798,Romance,the film be a romcom which explore the life of...


In [17]:
# Prepare features (description text) and targets (genre label)
X = data['description']
y = data['genre']

# Hold out 20% of the rows for testing.
# `random_state` makes the split repeatable across kernel restarts, and
# `stratify=y` keeps the genre proportions equal in both splits so that every
# genre is present in the training and the test set.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [18]:
# Re-attach each split's labels to its descriptions, side by side
train_dataset = pd.concat((x_train, y_train), axis="columns")
test_dataset = pd.concat((x_test, y_test), axis="columns")

In [19]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder

In [20]:
# Encoding: learn the genre -> integer mapping from the training labels only
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Model selection
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
embedding_model = BertModel.from_pretrained(model_name)

# Device selection
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_model.to(device)

# Tokenizer & embeddings
# Every description is padded or truncated to exactly `max_length` tokens.
max_length = 20

# The text is read from x_train / x_test rather than from train_dataset /
# test_dataset: those two names are rebound to TensorDataset objects below,
# so reading from them would make this cell fail when it is run a second time.
encoded_inputs = tokenizer(list(x_train), padding='max_length', truncation=True,
                           max_length=max_length, return_attention_mask=True)

train_dataset = TensorDataset(torch.tensor(encoded_inputs['input_ids']),
                              torch.tensor(encoded_inputs['attention_mask']),
                              torch.tensor(y_train_encoded))
train_loader = DataLoader(train_dataset, batch_size=12, shuffle=True)

encoded_inputs_test = tokenizer(list(x_test), padding='max_length', truncation=True,
                                max_length=max_length, return_attention_mask=True)
# Reuse the mapping fitted on the training labels. Re-fitting on the test
# labels would silently shift the integer codes whenever the test split lacks
# a genre; `transform` raises instead if it meets an unseen label.
y_test_encoded = label_encoder.transform(y_test)

test_dataset = TensorDataset(torch.tensor(encoded_inputs_test['input_ids']),
                             torch.tensor(encoded_inputs_test['attention_mask']),
                             torch.tensor(y_test_encoded))
# Evaluation only counts correct predictions, so the batches need no shuffling
test_loader = DataLoader(test_dataset, batch_size=12, shuffle=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
# Classification network: encoder followed by a linear head
class GenreClassifier(nn.Module):
    """Linear classification head on top of an encoder's first-token state.

    Parameters
    ----------
    embedding_model : nn.Module
        Encoder called as ``embedding_model(input_ids, attention_mask=...)``.
        Its result must expose ``last_hidden_state`` and the encoder itself
        must expose ``config.hidden_size``.
    num_classes : int
        Number of output logits.
    """

    def __init__(self, embedding_model, num_classes):
        super().__init__()
        self.embedding_model = embedding_model
        hidden_dim = embedding_model.config.hidden_size
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the logits computed from position 0 of the last hidden state."""
        encoder_output = self.embedding_model(input_ids, attention_mask=attention_mask)
        first_token_state = encoder_output.last_hidden_state[:, 0]
        return self.fc(first_token_state)

# One output logit per genre known to the label encoder
num_classes = len(label_encoder.classes_)

# Wrap the encoder in the classifier and place it on the selected device
model = GenreClassifier(embedding_model, num_classes).to(device)

# Cross-entropy loss; AdamW updates both the encoder and the linear head
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

In [22]:
# Train for 10 epochs, reporting the running training accuracy of each epoch
model.train()
for epoch in range(10):
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/10", leave=False)
    total_correct, total_samples = 0, 0

    for batch in progress_bar:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        # Standard optimisation step
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        # Running accuracy over the batches seen so far in this epoch;
        # the predicted class is the index of the largest logit
        predicted = logits.argmax(dim=1)
        total_correct += int((predicted == labels).sum())
        total_samples += len(labels)
        accuracy = total_correct / total_samples

        progress_bar.set_postfix({"loss": loss.item(), "accuracy": accuracy})

    # Accuracy accumulated over the whole epoch
    print(f'Epoch {epoch + 1} - Accuracy: {accuracy:.4f}')

                                                                                

Epoch 1 - Accuracy: 0.4223


                                                                                

Epoch 2 - Accuracy: 0.6759


                                                                                

Epoch 3 - Accuracy: 0.7772


                                                                                

Epoch 4 - Accuracy: 0.8638


                                                                                

Epoch 5 - Accuracy: 0.9313


                                                                                

Epoch 6 - Accuracy: 0.9670


                                                                                

Epoch 7 - Accuracy: 0.9884


                                                                                

Epoch 8 - Accuracy: 0.9955


                                                                                

Epoch 9 - Accuracy: 0.9982


                                                                                

Epoch 10 - Accuracy: 0.9982




In [23]:
# Score the trained model on the test loader with gradient tracking disabled
model.eval()
total_correct, total_samples = 0, 0
with torch.no_grad():
    progress_bar = tqdm(test_loader, desc="Evaluating", leave=False)
    for batch in progress_bar:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        logits = model(input_ids, attention_mask)
        # The predicted class is the index of the largest logit
        predicted = logits.argmax(dim=1)
        total_correct += int((predicted == labels).sum())
        total_samples += len(labels)
        progress_bar.set_postfix({"accuracy": total_correct / total_samples})

accuracy = total_correct / total_samples
print(f'Final Accuracy: {accuracy:.4f}')

                                                                                

Final Accuracy: 0.6429


