In [1]:
import pandas as pd
import tensorflow as tf
import nltk

In [2]:
# Input files, given as relative paths (opened in the loading cells below).
train_path = 'train_data.txt'
test_path = 'test_data.txt'

In [3]:
# Each usable line of the training file has four ' ::: '-separated fields.
train_fields = ('id', 'title', 'genre', 'description')

with open(train_path, 'r', encoding='utf-8') as handle:
    split_lines = (raw.strip().split(' ::: ') for raw in handle)
    # Lines that do not split into exactly four fields are skipped.
    train_data = [
        dict(zip(train_fields, fields))
        for fields in split_lines
        if len(fields) == 4
    ]

# Turn the list of per-movie records into a DataFrame.
train_df = pd.DataFrame(train_data)

In [4]:
# Each usable line of the test file has three ' ::: '-separated fields (no genre).
test_fields = ('id', 'title', 'description')

with open(test_path, 'r', encoding='utf-8') as handle:
    split_lines = (raw.strip().split(' ::: ') for raw in handle)
    # Lines that do not split into exactly three fields are skipped.
    test_data = [
        dict(zip(test_fields, fields))
        for fields in split_lines
        if len(fields) == 3
    ]

# Turn the list of per-movie records into a DataFrame.
test_df = pd.DataFrame(test_data)

In [5]:
# Preview the first rows of the parsed training frame (id, title, genre, description).
train_df.head()

Unnamed: 0,id,title,genre,description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


In [6]:
# Preview the first rows of the parsed test frame (id, title, description; no genre column).
test_df.head()

Unnamed: 0,id,title,description
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),Before he was known internationally as a marti...


## Preprocessing

In [7]:
import nltk
from nltk.corpus import stopwords
import string

In [8]:
# Fetch the NLTK stop-word corpus needed by the preprocessing step.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\deven\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Merge title and description into a single free-text column on each frame.
for frame in (train_df, test_df):
    frame['text'] = frame['title'] + ' ' + frame['description']

In [10]:
# Removing punctuation and stop words: build the set of English stop words
# (from the NLTK corpus) that preprocess() filters tokens against.
stop_words=set(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character.
# Built once here instead of on every preprocess() call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text, stopword_set=None):
    """Lower-case `text`, drop stop words, and strip ASCII punctuation.

    Stop words are matched both against the punctuation-free token and
    against the token with only its leading/trailing punctuation removed.
    The second check is what lets apostrophe-containing stop words such as
    "don't" or "he'll" be removed: once the apostrophe is deleted they become
    "dont" / "hell" and no longer match the stop-word list.

    Args:
        text: The raw string to clean.
        stopword_set: Optional collection of stop words to filter against.
            When omitted, the module-level `stop_words` set is used.

    Returns:
        The remaining tokens, punctuation removed, joined by single spaces.
        Tokens that consist only of punctuation are dropped.
    """
    if stopword_set is None:
        stopword_set = stop_words

    kept = []
    for token in text.lower().split():
        bare = token.translate(_PUNCT_TABLE)
        if not bare:
            # Token was pure punctuation (e.g. a free-standing dash).
            continue
        if bare in stopword_set:
            continue
        # Check the token with inner punctuation intact so contractions match.
        if token.strip(string.punctuation) in stopword_set:
            continue
        kept.append(bare)
    return ' '.join(kept)

In [11]:
# Clean the combined text column of both frames with preprocess().
for frame in (train_df, test_df):
    frame['text'] = frame['text'].apply(preprocess)

## Label Encoding

In [12]:
# Inspect the training frame again; the `text` column now holds the cleaned title + description.
train_df.head()

Unnamed: 0,id,title,genre,description,text
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...,oscar et la dame rose 2009 listening conversat...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...,cupid 1997 brother sister past incestuous rela...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...,young wild wonderful 1980 bus empties students...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...,secret sin 1915 help unemployed father make en...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...,unrecovered 2007 films title refers unrecovere...


In [13]:
from tensorflow.keras.layers import StringLookup

# Lookup layer that maps genre strings to integer ids.
# It is configured with num_oov_indices=0, i.e. no out-of-vocabulary slots.
# NOTE(review): per the Keras docs a genre not seen in adapt() would then raise - confirm.
genre_lookup=StringLookup(output_mode='int',num_oov_indices=0)
# Build the vocabulary from the distinct genre values in the training frame.
genre_lookup.adapt(train_df['genre'].unique())

# Store each row's integer genre id; this column is used as the label later on.
train_df['encoded_genre']=genre_lookup(train_df['genre'])

# Show the first few encoded labels as a sanity check.
print(train_df['encoded_genre'].head())

0    18
1     2
2    25
3    18
4    18
Name: encoded_genre, dtype: int64


## Tokenization and Padding

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [15]:
# Parameters: vocabulary cap for the tokenizer and fixed length of every padded sequence.
max_features, max_length = 5000, 500

# Tokenize: the word index is fitted on the training text only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_df['text'])

# Encode both splits with the fitted tokenizer.
X_train_sequences, X_test_sequences = (
    tokenizer.texts_to_sequences(frame['text']) for frame in (train_df, test_df)
)

# Padding: bring every sequence to exactly max_length entries.
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_length)
    for sequences in (X_train_sequences, X_test_sequences)
)


In [16]:
# Prepare labels: the integer genre ids stored in train_df['encoded_genre'].
y_train = train_df['encoded_genre']

## Creating Validation Set

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows as a validation set (fixed seed for reproducibility).
# The labels are read straight from train_df rather than from the `y_train`
# name that this cell rebinds. Otherwise re-running the cell would try to
# split the already-reduced labels against the full feature matrix and fail
# on the length mismatch.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_padded, train_df['encoded_genre'], test_size=0.2, random_state=42
)
print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("y_train shape:", y_train.shape)
print("y_val shape:", y_val.shape)

# One output unit per entry in the genre lookup vocabulary.
num_classes = len(genre_lookup.get_vocabulary())


X_train shape: (43371, 500)
X_val shape: (10843, 500)
y_train shape: (43371,)
y_val shape: (10843,)


In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding , LSTM , Dense , Dropout
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

In [19]:
def create_model(max_features,max_length,num_classes,learning_rate):
    """Build and compile the stacked-LSTM genre classifier.

    Args:
        max_features: Vocabulary size given to the embedding layer (input_dim).
        max_length: Number of token ids in each input sequence.
        num_classes: Number of units in the softmax output layer.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled Sequential model (sparse categorical cross-entropy loss,
        accuracy metric).
    """
    model = Sequential([
        # Declare the input shape explicitly. Embedding's `input_length`
        # keyword is deprecated in Keras 3 and only triggers a warning there.
        tf.keras.Input(shape=(max_length,), dtype='int32'),
        Embedding(input_dim=max_features, output_dim=128),
        # The first LSTM returns the full sequence so the second LSTM can consume it.
        LSTM(128, return_sequences=True),
        LSTM(64),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ])

    optimizer = Adam(learning_rate=learning_rate)
    # The sparse loss is used because the labels are integer class ids, not one-hot vectors.
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model


In [20]:
def train_model(model,X_train, y_train, X_val, y_val, epochs=10,batch_size=32):
    """Fit `model` on the training split and validate on (X_val, y_val).

    Returns whatever `model.fit` returns (the training history).
    """
    fit_options = {
        'epochs': epochs,
        'batch_size': batch_size,
        'validation_data': (X_val, y_val),
        'verbose': 1,
    }
    return model.fit(X_train, y_train, **fit_options)

In [21]:
def evaluate_model(model,X_test,y_test):
    """Evaluate `model` on (X_test, y_test), print both metrics, and return them.

    Returns:
        A (loss, accuracy) tuple taken from `model.evaluate`.
    """
    scores = model.evaluate(X_test, y_test, verbose=0)
    test_loss, test_accuracy = scores

    # One print call with a newline separator produces the same two output lines.
    print(f'Test Loss: {test_loss:.4f}', f'Test Accuracy: {test_accuracy:.4f}', sep='\n')

    return (test_loss, test_accuracy)

In [22]:
def plot_history(history):
    """Plot training vs. validation accuracy and loss side by side.

    `history` must expose a `.history` mapping with the keys 'accuracy',
    'val_accuracy', 'loss' and 'val_loss'.
    """
    curves = history.history
    plt.figure(figsize=(14,6))

    # (subplot position, train key, val key, train label, val label, title, y-axis label)
    panels = (
        (1, 'accuracy', 'val_accuracy', 'Train Accuracy', 'Val Accuracy', 'Model Accuracy', 'Accuracy'),
        (2, 'loss', 'val_loss', 'Train Loss', 'Val Loss', 'Model Loss', 'Loss'),
    )
    for position, train_key, val_key, train_label, val_label, title, y_label in panels:
        plt.subplot(1, 2, position)
        plt.plot(curves[train_key], label=train_label)
        plt.plot(curves[val_key], label=val_label)
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()

    plt.tight_layout()
    plt.show()

## Tuning Hyperparameters

In [23]:
# Training hyperparameters passed to create_model() and train_model() below.
learning_rate = 0.001   # Adam step size
epochs = 10             # full passes over the training split
batch_size = 32         # samples per gradient update

In [None]:
# Build and train the network with the hyperparameters chosen above.
model = create_model(max_features, max_length, num_classes, learning_rate)

history = train_model(model, X_train, y_train, X_val, y_val, epochs=epochs, batch_size=batch_size)

# The test file has no genre column (its rows are id ::: title ::: description),
# so no `y_test` exists and the original call raised NameError after training.
# Score the model on the held-out validation split instead. evaluate_model()
# still prints these figures under the "Test ..." labels.
test_loss, test_accuracy = evaluate_model(model, X_val, y_val)


plot_history(history)

Epoch 1/10
[1m 181/1356[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m5:30[0m 282ms/step - accuracy: 0.1934 - loss: 2.8455

In [24]:
import tensorflow as tf
# Report how many GPUs TensorFlow can see (0 means training runs on the CPU).
gpu_devices = tf.config.list_physical_devices('GPU')
print(len(gpu_devices))

0
