### Importing Libraries

In [21]:
import numpy as np

In [22]:
import pandas as pd

In [23]:
import nltk

In [24]:
# Fetch the NLTK resources this notebook relies on: the stop-word corpus used
# for filtering and the 'punkt' data.
# NOTE(review): presumably 'punkt' is what nltk.word_tokenize needs — confirm.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\goura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\goura\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
from nltk.corpus import stopwords

In [26]:
from nltk.stem.porter import PorterStemmer

In [27]:
import string

In [28]:
from sklearn.preprocessing import LabelEncoder

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
from sklearn.naive_bayes import MultinomialNB

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
from sklearn.metrics import accuracy_score, precision_score

In [33]:
from gensim.models import Word2Vec

In [34]:
from sklearn.ensemble import RandomForestClassifier

### Dataframe creation

In [35]:
# Labels for the four ':::'-delimited fields of each record.
columns = ['ID', 'NAME', 'GENRE', 'MOVIE_TEXT']

# The file carries no header row, so it is read as raw fields and the labels
# are attached afterwards.
# NOTE(review): if the raw file pads ':::' with spaces, every field keeps that
# padding — confirm against train_data.txt.
df = pd.read_csv(
    'train_data.txt',
    sep=':::',
    header=None,
    engine='python',
)
df.columns = columns

In [37]:
# Display the loaded frame to check that the four columns parsed as expected.
df

Unnamed: 0,ID,NAME,GENRE,MOVIE_TEXT
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...
...,...,...,...,...
54209,54210,"""Bonino"" (1953)",comedy,This short-lived NBC live sitcom centered on ...
54210,54211,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION. The sist...
54211,54212,Ronald Goedemondt: Ze bestaan echt (2008),documentary,"Ze bestaan echt, is a stand-up comedy about g..."
54212,54213,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and hav...


### Label Encoding

In [38]:
# Map each genre string to an integer class id; the fitted encoder is kept so
# the ids can be translated back to genre names later.
label_encoder = LabelEncoder()
encoded_genres = label_encoder.fit_transform(df['GENRE'])
df['Label'] = encoded_genres

# Target vector for the classifiers, taken from the new column.
y = df['Label'].values

### Data Extraction

In [39]:
# Model input text: the description followed by the title, separated by a
# single space and lower-cased.
description_and_title = df['MOVIE_TEXT'] + " " + df['NAME']
df['Text_model'] = description_and_title.str.lower()

### Text Preprocessing and Word2Vec Features

In [40]:
# Single Porter stemmer instance, reused by transform_text for every token.
ps = PorterStemmer()

In [41]:
# Built once, so the stop-word list is not rebuilt and linearly scanned for
# every token; set membership makes each lookup constant-time.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def transform_text(text):
    """Tokenise ``text`` and return its stemmed content words.

    The text is lower-cased and split with ``nltk.word_tokenize``. Tokens that
    are not purely alphanumeric, or that are English stop words, are dropped;
    the remaining tokens are passed through the Porter stemmer ``ps``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Stemmed tokens, in the order they appear in ``text``.
    """
    tokens = nltk.word_tokenize(text.lower())
    return [
        ps.stem(token)
        for token in tokens
        if token.isalnum() and token not in ENGLISH_STOPWORDS
    ]

In [42]:
# Tokenise, filter and stem every document; each cell of the new column holds
# the token list returned by transform_text.
df['tokenized_text'] = df['Text_model'].apply(transform_text)

In [43]:
# Train Word2Vec on the token lists of every document in df (vector_size=100,
# window=5, min_count=2, workers=4).
# NOTE(review): training with several worker threads may make the embeddings
# differ between runs — confirm against the gensim docs if exact
# reproducibility of the accuracies below matters.
word2vec_model = Word2Vec(sentences=df['tokenized_text'], vector_size=100, window=5, min_count=2, workers=4)

In [44]:
def get_average_word2vec(tokens, model, size):
    """Return the mean embedding of the tokens that ``model`` knows.

    Parameters
    ----------
    tokens : iterable
        Tokens to look up in ``model.wv``.
    model : object
        Must expose ``wv`` supporting ``in`` tests and lookup by token.
    size : int
        Length of the returned vector.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors found for ``tokens``; a vector of
        zeros when none of the tokens is present in ``model.wv``.
    """
    total = np.zeros(size)
    hits = 0
    for token in tokens:
        # Tokens missing from the vocabulary contribute nothing.
        if token not in model.wv:
            continue
        total += model.wv[token]
        hits += 1

    # Nothing matched: hand back the zero vector and avoid dividing by zero.
    if hits == 0:
        return total

    total /= hits
    return total

In [45]:
# Take the vector length from the trained model instead of repeating the
# literal passed at training time, so the two can never drift apart.
embedding_dim = word2vec_model.vector_size

# One averaged document vector per row, built from that row's token list.
df['embeddings'] = df['tokenized_text'].apply(
    lambda tokens: get_average_word2vec(tokens, word2vec_model, embedding_dim)
)

In [46]:
# Stack the per-document vectors row-wise into one 2-D feature matrix.
document_vectors = list(df['embeddings'])
X = np.vstack(document_vectors)

In [47]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

### Logistic Regression

In [48]:
# Fit logistic regression on the training embeddings (iteration cap of 1000),
# then predict labels for the held-out rows.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [49]:
# Accuracy of the logistic-regression predictions on the held-out rows.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy (Word2Vec):", lr_accuracy)

Logistic Regression Accuracy (Word2Vec): 0.5373051738448769


### Random Forest Classifier

In [50]:
# Seed the forest (same value as the train/test split) so its randomised
# training, and therefore the reported accuracy, is repeatable across runs.
rfc = RandomForestClassifier(random_state=2)
rfc.fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred_rfc = rfc.predict(X_test)

In [51]:
# Accuracy of the random-forest predictions on the held-out rows.
rfc_accuracy = accuracy_score(y_test, y_pred_rfc)
print("Random Forest Classifier Accuracy (Word2Vec):", rfc_accuracy)

Random Forest Classifier Accuracy (Word2Vec): 0.5136954717329153
