In [36]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [48]:
train_data=pd.read_csv( 'train_data.txt' , sep=':::',engine='python',names=['title','genre','description'])
train_data.head()

Unnamed: 0,title,genre,description
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [50]:
test_data=pd.read_csv( 'test_data.txt' , sep=':::',engine='python',names=['id', 'title','description'])
test_data.head()

Unnamed: 0,id,title,description
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),Before he was known internationally as a mart...


# Data Preprocessing

In [51]:
train_data.isnull().sum()

title          0
genre          0
description    0
dtype: int64

In [53]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54214 entries, 1 to 54214
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        54214 non-null  object
 1   genre        54214 non-null  object
 2   description  54214 non-null  object
dtypes: object(3)
memory usage: 1.7+ MB


In [54]:
from collections import Counter

Counter(train_df["genre"]).most_common()

[(' drama ', 13613),
 (' documentary ', 13096),
 (' comedy ', 7447),
 (' short ', 5073),
 (' horror ', 2204),
 (' thriller ', 1591),
 (' action ', 1315),
 (' western ', 1032),
 (' reality-tv ', 884),
 (' family ', 784),
 (' adventure ', 775),
 (' music ', 731),
 (' romance ', 672),
 (' sci-fi ', 647),
 (' adult ', 590),
 (' crime ', 505),
 (' animation ', 498),
 (' sport ', 432),
 (' talk-show ', 391),
 (' fantasy ', 323),
 (' mystery ', 319),
 (' musical ', 277),
 (' biography ', 265),
 (' history ', 243),
 (' game-show ', 194),
 (' news ', 181),
 (' war ', 132)]

In [55]:
train_data['length']=train_data['description'].apply(len)
train_data.head()

Unnamed: 0,title,genre,description,length
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...,546
2,Cupid (1997),thriller,A brother and sister with a past incestuous r...,184
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...,650
4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...,1082
5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...,625


In [56]:
from nltk.stem import LancasterStemmer
from nltk.corpus import stopwords
import re
import string

In [57]:
stemmer = LancasterStemmer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
   
    text = text.lower()                             
    text = re.sub('-',' ',text.lower())
    text = re.sub(f'[{string.digits}]',' ',text)
    text = ' '.join([stemmer.stem(word) for word in text.split() if word not in stop_words]) 
    text =  re.sub(r'@\S+', '',text) 
    text =  re.sub(r'http\S+', '',text) 
    text =  re.sub(r'pic.\S+', '',text) 
    text =  re.sub(r"[^a-zA-Z+']", ' ',text)     
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text+' ')   
    text = "".join([i for i in text if i not in string.punctuation])
    words = nltk.tokenize.word_tokenize(text,language="english", preserve_line=True)
    stopwords = nltk.corpus.stopwords.words('english') 
    text = " ".join([i for i in words if i not in stopwords and len(i)>2])
    text= re.sub("\s[\s]+", " ",text).strip() 
    return re.sub(f'[{re.escape(string.punctuation)}]','',text)


In [59]:
input_text = "Hello you look wonderful today"
print(input_text)
print(clean(input_text))

Hello you look wonderful today
hello look wond today


In [60]:
train_data['Text_cleaning'] = train_data.description.apply(clean_text)
test_data['Text_cleaning'] = test_data.description.apply(clean_text)

train_data.head()

Unnamed: 0,title,genre,description,length,Text_cleaning
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...,546,list convers doct parents year old osc learn n...
2,Cupid (1997),thriller,A brother and sister with a past incestuous r...,184,broth sist past incestu rel cur murd relations...
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...,650,bus empty stud field trip muse nat history lit...
4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...,1082,help unemploy fath mak end meet edi twin sist ...
5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...,625,films titl ref recov body ground zero also sta...


In [61]:
train_data['length_Text_cleaning']=train_data['Text_cleaning'].apply(len)
train_data.head()

Unnamed: 0,title,genre,description,length,Text_cleaning,length_Text_cleaning
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...,546,list convers doct parents year old osc learn n...,324
2,Cupid (1997),thriller,A brother and sister with a past incestuous r...,184,broth sist past incestu rel cur murd relations...,83
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...,650,bus empty stud field trip muse nat history lit...,325
4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...,1082,help unemploy fath mak end meet edi twin sist ...,616
5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...,625,films titl ref recov body ground zero also sta...,282


In [62]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [63]:
num_words = 50000
max_len = 250
tokenizer = Tokenizer(num_words=num_words, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(train_data['Text_cleaning'].values)

In [70]:
test_data_solution=pd.read_csv( 'test_data_solution.txt' , sep=':::',engine='python',names=['id','title','genre','description'])
test_data_solution.head()

Unnamed: 0,id,title,genre,description
0,1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),drama,Before he was known internationally as a mart...


Using TF-IDF

In [66]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [72]:
X = tokenizer.texts_to_sequences(train_data['Text_cleaning'].values)
X = pad_sequences(X, maxlen=max_len)
y = pd.get_dummies(train_data['genre']).values

X_test = tokenizer.texts_to_sequences(test_data['Text_cleaning'].values)
X_test = pad_sequences(X_test, maxlen=max_len)
y_test = pd.get_dummies(test_data_solution['genre']).values

# LSTM

In [75]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.models import Sequential

In [76]:
model = Sequential()
EMBEDDING_DIM = 100
model.add(Embedding(num_words, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.1, recurrent_dropout=0.2))
model.add(Dense(27, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [78]:
from tensorflow.keras.callbacks import EarlyStopping

In [79]:
Callbacks  = [EarlyStopping(monitor='val_loss', min_delta=0, patience=2, mode='auto')]
history = model.fit(X, y, epochs=6, batch_size=32,validation_data=(X_test,y_test), callbacks= Callbacks)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6


In [82]:
from sklearn.metrics import f1_score

In [94]:
y_pred = model.predict(X_test)
predicted_labels = y_pred.argmax(axis=1)
true_labels = y_test.argmax(axis=1)

f1 = f1_score(true_labels, predicted_labels, average='weighted')

print(f"F1 Score: {f1}")


F1 Score: 0.514288704951581
