**IMPORTING OF DATASET**

In [5]:
import pandas as pd
import kagglehub
import os
path = kagglehub.dataset_download("hijest/genre-classification-dataset-imdb")
txt_file = None
for root, _, files in os.walk(path):
    for file in files:
        if file.endswith(".txt"):
            txt_file = os.path.join(root, file)
            break
    if txt_file:
        break

if txt_file:
    df = pd.read_csv(txt_file, sep="\t", encoding="latin-1")
    print("Successfully loaded the TXT file.")
    display(df.head())
else:
    print("No TXT file found in the downloaded dataset directory.")
df = pd.read_csv(txt_file, sep="\t")
df.head()

Successfully loaded the TXT file.


Unnamed: 0,Train data:
0,ID ::: TITLE ::: GENRE ::: DESCRIPTION
1,ID ::: TITLE ::: GENRE ::: DESCRIPTION
2,ID ::: TITLE ::: GENRE ::: DESCRIPTION
3,ID ::: TITLE ::: GENRE ::: DESCRIPTION
4,Test data:


Unnamed: 0,Train data:
0,ID ::: TITLE ::: GENRE ::: DESCRIPTION
1,ID ::: TITLE ::: GENRE ::: DESCRIPTION
2,ID ::: TITLE ::: GENRE ::: DESCRIPTION
3,ID ::: TITLE ::: GENRE ::: DESCRIPTION
4,Test data:


**Parsing the FILE and Loading it into DF**

In [6]:
data = []
with open(txt_file, "r") as file:
    for line in file:
      parts = line.strip().split(" ::: ")
      if len(parts) == 4:
        id_, title, genre, plot = parts
        data.append({"id": id_, "title": title, "genre": genre, "plot": plot})

df = pd.DataFrame(data)
df.head()

Unnamed: 0,id,title,genre,plot
0,ID,TITLE,GENRE,DESCRIPTION
1,ID,TITLE,GENRE,DESCRIPTION
2,ID,TITLE,GENRE,DESCRIPTION
3,ID,TITLE,GENRE,DESCRIPTION


**ENCODE THE GENRES**
*using **LabelEncoder** to turn genres into numbers*

In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(df["genre"])

**USING TF-IDF to Vectorise the PLot Summaries**

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorize = TfidfVectorizer(
    max_features=5000,
    stop_words="english",
    ngram_range=(1, 2),
    sublinear_tf=True
)
X = vectorize.fit_transform(df["plot"])


**TRAIN/TEST SPLIT**

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**TRAIN A CLASSIFIER *Naive Baye*s**

In [None]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(X_train, y_train)

**EVALUATING THE MODEL**

In [None]:
from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=le.classes_))

              precision    recall  f1-score   support

      action       0.59      0.09      0.16       263
       adult       0.88      0.06      0.12       112
   adventure       0.33      0.04      0.06       139
   animation       0.00      0.00      0.00       104
   biography       0.00      0.00      0.00        61
      comedy       0.50      0.45      0.47      1443
       crime       0.00      0.00      0.00       107
 documentary       0.58      0.88      0.70      2659
       drama       0.46      0.83      0.59      2697
      family       1.00      0.01      0.01       150
     fantasy       0.00      0.00      0.00        74
   game-show       1.00      0.17      0.30        40
     history       0.00      0.00      0.00        45
      horror       0.71      0.36      0.47       431
       music       0.81      0.15      0.25       144
     musical       0.00      0.00      0.00        50
     mystery       0.00      0.00      0.00        56
        news       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# **EVALUTING THE MODEL**

In [None]:
plots = ["L.R. runs across the street to a convenience store to buy some more, a quick trip of no more than a few minutes.",
         "We Owe it to Our Children from the Union Film Enterprise, a family melodrama written and directed by renowned filmmaker Chin Chien (My Intimate Partners).",
         "Vasu Inamdar (Ina) suffers from a disorder where the pictures which he sees comes in his dreams and turn into reality.",
         "Four high school students embark on a terrifying journey through ShadowView Manor 2 years after a horrifying séance gone wrong. Intern Raven, decides to reconnect with her elementary school friends Kota, William, and Jessica by bringing them to her new workplace, ShadowView Manor for a bit of paranormal investigating. Hearing more forbidden secrets from the night janitor sends them into a dark descending spiral of terror."]
for i in range(len(plots)):
  plot_tfidf = vectorize.transform([plots[i]])
  pred = clf.predict(plot_tfidf)
  print("Predicted genre:", le.inverse_transform(pred))

plot_tfidf = vectorize.transform([plot])
pred = clf.predict(plot_tfidf)
print("Predicted genre:", le.inverse_transform(pred))


Predicted genre: ['comedy']
Predicted genre: ['documentary']
Predicted genre: ['drama']
Predicted genre: ['horror']
Predicted genre: ['documentary']
