Step 1: Import Libraries and Load Data

In [4]:
!pip install kaggle



In [11]:
from google.colab import files
# Opens Colab's file-upload dialog. The recorded run uploaded the dataset archive
# (movies.zip), which the following cells read — not a kaggle.json credentials file.
files.upload()

Saving movies.zip to movies.zip


In [18]:
import zipfile

# List the archive's members, then unpack them into the working directory.
# Later cells open 'Genre Classification Dataset/train_data.txt' directly from
# disk, so the archive must be extracted for a fresh-kernel run to succeed.
with zipfile.ZipFile('movies.zip', 'r') as zip_ref:
    file_list = zip_ref.namelist()
    print(file_list)
    zip_ref.extractall()


['Genre Classification Dataset/description.txt', 'Genre Classification Dataset/test_data.txt', 'Genre Classification Dataset/test_data_solution.txt', 'Genre Classification Dataset/train_data.txt']


In [32]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import zipfile

# Download the NLTK resources used by the preprocessing step
# (tokenizer models, stop-word list, WordNet for lemmatization).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Preview the first five raw lines to confirm the field delimiter.
# The file is streamed (not read in full) and decoded explicitly as UTF-8 so the
# preview does not depend on the platform's default locale encoding.
with open('Genre Classification Dataset/train_data.txt', 'r', encoding='utf-8') as file:
    for i, line in enumerate(file):
        if i >= 5:
            break
        print(f"Line {i}: {line.strip()}")

# The preview shows fields separated by ' ::: '. A multi-character separator
# requires the python parsing engine. The file has no header row, so column
# names are supplied explicitly.
data = pd.read_csv(
    'Genre Classification Dataset/train_data.txt',
    delimiter=' ::: ',
    engine='python',
    header=None,
    names=['id', 'title', 'genre', 'plot'],
    encoding='utf-8',
)

# Display the first few rows to ensure correct loading
print(data.head())

# Save the DataFrame as a .csv file
data.to_csv('train_data.csv', index=False)
print("Data successfully converted to train_data.csv")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Line 0: 1 ::: Oscar et la dame rose (2009) ::: drama ::: Listening in to a conversation between his doctor and parents, 10-year-old Oscar learns what nobody has the courage to tell him. He only has a few weeks to live. Furious, he refuses to speak to anyone except straight-talking Rose, the lady in pink he meets on the hospital stairs. As Christmas approaches, Rose uses her fantastical experiences as a professional wrestler, her imagination, wit and charm to allow Oscar to live life and love to the full, in the company of his friends Pop Corn, Einstein, Bacon and childhood sweetheart Peggy Blue.
Line 1: 2 ::: Cupid (1997) ::: thriller ::: A brother and sister with a past incestuous relationship have a current murderous relationship. He murders the women who reject him and she murders the women who get too close to him.
Line 2: 3 ::: Young, Wild and Wonderful (1980) ::: adult ::: As the bus empties the students for their field trip to the Museum of Natural History, little does the tour 

Step 2: Data Preprocessing

In [33]:
# Preprocessing function
# Resources shared by every preprocess_text call, built lazily on first use so
# that defining the function does not itself require the NLTK data to be present.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return ``(stop_words, lemmatizer)``, creating them on the first call."""
    if not _PREPROCESS_RESOURCES:
        # A frozenset gives constant-time membership tests; the list returned by
        # stopwords.words() would be scanned linearly for every token.
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalize a plot summary for vectorization.

    The text is tokenized, each token is lowercased, English stop words are
    dropped, and the remaining tokens are lemmatized.

    Args:
        text: The raw text to clean.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    # Previously the stop-word list was rebuilt for every token and a new
    # lemmatizer was created for every call; both are now created once.
    stop_words, lemmatizer = _get_preprocess_resources()

    kept_tokens = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered not in stop_words:
            kept_tokens.append(lemmatizer.lemmatize(lowered))

    return ' '.join(kept_tokens)

# Clean every plot summary and keep the result next to the raw text
cleaned_plots = data['plot'].apply(preprocess_text)
data['processed_plot'] = cleaned_plots

# Quick visual check that the new column looks as expected
preview = data.head()
print(preview)

   id                             title     genre  \
0   1      Oscar et la dame rose (2009)     drama   
1   2                      Cupid (1997)  thriller   
2   3  Young, Wild and Wonderful (1980)     adult   
3   4             The Secret Sin (1915)     drama   
4   5            The Unrecovered (2007)     drama   

                                                plot  \
0  Listening in to a conversation between his doc...   
1  A brother and sister with a past incestuous re...   
2  As the bus empties the students for their fiel...   
3  To help their unemployed father make ends meet...   
4  The film's title refers not only to the un-rec...   

                                      processed_plot  
0  listening conversation doctor parent , 10-year...  
1  brother sister past incestuous relationship cu...  
2  bus empty student field trip museum natural hi...  
3  help unemployed father make end meet , edith t...  
4  film 's title refers un-recovered body ground ...  


Step 3: Feature Extraction

In [34]:
# Target labels: one genre string per movie
y = data['genre']

# Feature matrix: TF-IDF weights of the cleaned plots, vocabulary capped at 5000 terms.
# NOTE(review): the vectorizer is fitted on all rows before the train/test split
# that follows, so held-out documents influence the vocabulary and IDF weights.
# Consider fitting on the training rows only.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
corpus = data['processed_plot']
X = tfidf_vectorizer.fit_transform(corpus)


Step 4: Train-Test Split

In [35]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

Step 5: Model Training and Hyperparameter Tuning

In [None]:
# Define Logistic Regression model.
# The recorded run hit the solver's iteration cap with max_iter=1000 ("TOTAL NO.
# of ITERATIONS REACHED LIMIT"), so some cross-validation scores came from
# unconverged fits. The cap is raised so that weakly regularized candidates
# (large C) have room to converge.
log_reg = LogisticRegression(max_iter=5000)

# Hyperparameter tuning: C is the inverse regularization strength
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100]
}

# 5-fold cross-validated search, scored by accuracy
grid_search_log_reg = GridSearchCV(log_reg, param_grid, cv=5, scoring='accuracy')
grid_search_log_reg.fit(X_train, y_train)

print("Best parameters for Logistic Regression:", grid_search_log_reg.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
# Multinomial Naive Bayes, tuned over its smoothing parameter alpha
param_grid_nb = {'alpha': [0.01, 0.1, 1, 10]}
nb = MultinomialNB()

# 5-fold cross-validated search, scored by accuracy
grid_search_nb = GridSearchCV(
    estimator=nb,
    param_grid=param_grid_nb,
    scoring='accuracy',
    cv=5,
)
grid_search_nb.fit(X_train, y_train)

print("Best parameters for Naive Bayes:", grid_search_nb.best_params_)


In [None]:
# Support Vector Machine, tuned over the penalty C and the kernel type
param_grid_svc = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
}
svc = SVC()

# 5-fold cross-validated search over all 8 (C, kernel) combinations, scored by accuracy.
# NOTE(review): that is 40 SVC fits on the full training matrix — expect this
# cell to dominate the notebook's runtime.
grid_search_svc = GridSearchCV(
    estimator=svc,
    param_grid=param_grid_svc,
    scoring='accuracy',
    cv=5,
)
grid_search_svc.fit(X_train, y_train)

print("Best parameters for SVM:", grid_search_svc.best_params_)


Step 6: Model Evaluation

In [None]:
# Evaluate models: each entry is a fitted grid search, which predicts with its
# refitted best estimator.
models = {
    'Logistic Regression': grid_search_log_reg,
    'Naive Bayes': grid_search_nb,
    'SVM': grid_search_svc
}

for model_name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"Model: {model_name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

    # Plot confusion matrix.
    # Use the classes the model was trained on as the explicit row/column order
    # and as tick labels, so the axes show genre names instead of bare indices
    # and every trained class appears even if absent from the test split.
    class_labels = model.classes_
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)

    fig, ax = plt.subplots(figsize=(10, 7))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_title(f'Confusion Matrix for {model_name}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()
    # Release the figure so figures do not accumulate across loop iterations
    plt.close(fig)


Step 7: Save the Best Model

In [None]:
# Save the best model.
# Pick the search with the highest cross-validated score. The current best is
# kept only when it is strictly better, so ties go to the later candidate
# (Naive Bayes over Logistic Regression, SVM over both).
best_model = grid_search_log_reg
for challenger in (grid_search_nb, grid_search_svc):
    if not best_model.best_score_ > challenger.best_score_:
        best_model = challenger

joblib.dump(best_model, 'best_movie_genre_model.pkl')
print("Best model saved as 'best_movie_genre_model.pkl'")

# The classifier only accepts vectors produced by the fitted TF-IDF vectorizer,
# so persist the vectorizer too; without it the saved model cannot be applied
# to new text in a fresh session.
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
print("TF-IDF vectorizer saved as 'tfidf_vectorizer.pkl'")


Step 8: Load and Use the Model

In [None]:
# Restore the persisted model from disk.
# NOTE(review): joblib.load unpickles the file — only load artifacts you trust.
model_path = 'best_movie_genre_model.pkl'
loaded_model = joblib.load(model_path)

# Example input: an unseen plot summary
new_plot = "A group of friends discover a hidden treasure map and set out on an adventure to find the treasure."

# Apply the same cleaning and the already-fitted TF-IDF vectorizer used for training
processed_plot = preprocess_text(new_plot)
vectorized_plot = tfidf_vectorizer.transform([processed_plot])

# predict returns one label per input row; there is a single row here
predicted_genre = loaded_model.predict(vectorized_plot)

print("Predicted Genre:", predicted_genre[0])
