In [1]:
import re
from pathlib import Path

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [2]:
# Directory that holds the three dataset files. Edit this single constant to
# run the notebook against a copy of the data stored somewhere else.
DATA_DIR = Path('E:/HRISHI/movie/Genre Classification Dataset')

# Separator placed between the fields of every record in the dataset files.
FIELD_DELIMITER = ' ::: '


def load_delimited(path, columns):
    """Read a ' ::: '-delimited text file into a DataFrame of strings.

    Parameters
    ----------
    path : str or Path
        File to read; it is opened as UTF-8 text.
    columns : list of str
        Column names, one per expected field, in file order.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line, with the given columns. Every value is
        kept as the string read from the file.

    Raises
    ------
    ValueError
        If a non-blank line contains fewer fields than ``columns``.
    """
    rows = []
    with open(path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing newline) is not a record.
                continue
            # Bound the number of splits so that a delimiter occurring inside
            # the last field stays part of that field instead of creating
            # extra columns that the DataFrame constructor would reject.
            fields = line.split(FIELD_DELIMITER, len(columns) - 1)
            if len(fields) != len(columns):
                raise ValueError(
                    f"{path}: line {line_number} has {len(fields)} field(s), "
                    f"expected {len(columns)}"
                )
            rows.append(fields)
    return pd.DataFrame(rows, columns=columns)


# Labelled training records.
train_data = load_delimited(DATA_DIR / 'train_data.txt', ['ID', 'Title', 'Genre', 'Plot'])

# Unlabelled test records.
test_data = load_delimited(DATA_DIR / 'test_data.txt', ['ID', 'Title', 'Plot'])

# Test records together with their genre labels.
test_solutions = load_delimited(DATA_DIR / 'test_data_solution.txt', ['ID', 'Title', 'Genre', 'Plot'])

In [3]:
# Make sure the NLTK stopword corpus is available locally, then keep the
# English list as a set so membership checks during preprocessing are cheap.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

# Function to preprocess text
def preprocess_text(text, stopword_set=None):
    """Normalise a piece of text for vectorisation.

    The text is lower-cased, every non-word character is replaced by a
    space, the result is split on whitespace, stopwords are dropped and the
    remaining tokens are joined with single spaces.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example ``None`` or NaN left by
        an incomplete input row) is treated as empty text.
    stopword_set : collection of str, optional
        Words to remove. Defaults to the notebook-level ``stop_words`` set.

    Returns
    -------
    str
        The cleaned text; ``''`` when nothing remains.
    """
    if not isinstance(text, str):
        # Missing values have no .lower(); report them as empty text rather
        # than failing in the middle of a column-wide apply.
        return ''
    if stopword_set is None:
        stopword_set = stop_words
    # str.split() with no argument already collapses runs of whitespace, so
    # no separate whitespace-squeezing pass is needed after the substitution.
    tokens = re.sub(r'\W', ' ', text.lower()).split()
    return ' '.join(token for token in tokens if token not in stopword_set)

# Add a 'cleaned_plot' column to both the training and the test frame by
# running every plot through preprocess_text.
for frame in (train_data, test_data):
    frame['cleaned_plot'] = frame['Plot'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\h1bor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Upper bound on the vocabulary size handed to the TF-IDF vectorizer.
TFIDF_MAX_FEATURES = 1000
tfidf = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)

train_corpus = train_data['cleaned_plot']
test_corpus = test_data['cleaned_plot']

# The vectorizer is fitted on the training plots only; the test plots are
# then projected with that same fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(train_corpus)
X_test_tfidf = tfidf.transform(test_corpus)


In [5]:
# Fit a logistic regression genre classifier on the TF-IDF training matrix,
# using the 'Genre' column of the training frame as the target.
genre_labels = train_data['Genre']
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, genre_labels)

In [6]:
# Predict on the test set
y_pred = model.predict(X_test_tfidf)

# The predictions follow the row order of test_data, while the ground-truth
# labels come from the separately loaded test_solutions. Comparing them
# position by position is only meaningful when both frames list the same IDs
# in the same order, so verify that before scoring.
if list(test_data['ID']) != list(test_solutions['ID']):
    raise ValueError(
        "test_data and test_solutions do not contain the same IDs in the same "
        "order; the predictions cannot be compared with the labels row by row"
    )
y_true = test_solutions['Genre']

# Evaluate the model
# NOTE(review): per the scikit-learn docs, zero_division=1 reports 1.0 for a
# metric whose denominator is zero (e.g. precision of a genre that is never
# predicted) - confirm this is the intended convention for the report.
print("Accuracy:", accuracy_score(y_true, y_pred))
print("Classification Report:\n", classification_report(y_true, y_pred, zero_division=1))

Accuracy: 0.54190036900369
Classification Report:
               precision    recall  f1-score   support

      action       0.36      0.21      0.26      1314
       adult       0.51      0.19      0.28       590
   adventure       0.32      0.07      0.12       775
   animation       0.37      0.05      0.09       498
   biography       1.00      0.00      0.00       264
      comedy       0.47      0.51      0.49      7446
       crime       0.26      0.05      0.08       505
 documentary       0.65      0.83      0.73     13096
       drama       0.51      0.73      0.60     13612
      family       0.35      0.08      0.14       783
     fantasy       0.46      0.04      0.07       322
   game-show       0.80      0.49      0.60       193
     history       0.00      0.00      0.00       243
      horror       0.54      0.45      0.49      2204
       music       0.59      0.43      0.50       731
     musical       0.15      0.02      0.04       276
     mystery       0.31      0