In [20]:
# Import necessary libraries
import pandas as pd

# Load the data
# train_data_path = '/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/train_data.txt'
# test_data_path = '/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/test_data_solution.txt'

train_data_path = r'C:\Users\Kunjal Thorat\Desktop\Genre Classification Dataset\train_data.txt'
test_data_path = r'C:\Users\Kunjal Thorat\Desktop\Genre Classification Dataset\test_data_solution.txt'

# Load the training data
train_df = pd.read_csv(train_data_path, sep=':::', engine='python', names=['Title', 'Genre', 'Description'])

# Load the test data
test_df = pd.read_csv(test_data_path, sep=':::', engine='python', names=['Title', 'Genre', 'Description'])

# Inspect the data
print("Training Data - First 5 rows:")
train_df.head()




Training Data - First 5 rows:


Unnamed: 0,Title,Genre,Description
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [21]:
print("\nTest Data - First 5 rows:")
test_df.head()


Test Data - First 5 rows:


Unnamed: 0,Title,Genre,Description
1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apar..."
2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty ch..."
3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family ...
4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with hi..."
5,Er nu zhai (1955),drama,Before he was known internationally as a mart...


In [22]:
# Data Cleaning
# Handling missing values (if any)
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

# Removing duplicates
train_df.drop_duplicates(inplace=True)
test_df.drop_duplicates(inplace=True)

# Converting text to lowercase
train_df['Description'] = train_df['Description'].str.lower()
test_df['Description'] = test_df['Description'].str.lower()

# Verify the changes
print("\nTraining Data - After Data Cleaning:")
train_df.head()



Training Data - After Data Cleaning:


Unnamed: 0,Title,Genre,Description
1,Oscar et la dame rose (2009),drama,listening in to a conversation between his do...
2,Cupid (1997),thriller,a brother and sister with a past incestuous r...
3,"Young, Wild and Wonderful (1980)",adult,as the bus empties the students for their fie...
4,The Secret Sin (1915),drama,to help their unemployed father make ends mee...
5,The Unrecovered (2007),drama,the film's title refers not only to the un-re...


In [23]:

print("\nTest Data - After Data Cleaning:")
test_df.head()


Test Data - After Data Cleaning:


Unnamed: 0,Title,Genre,Description
1,Edgar's Lunch (1998),thriller,"l.r. brane loves his life - his car, his apar..."
2,La guerra de papá (1977),comedy,"spain, march 1964: quico is a very naughty ch..."
3,Off the Beaten Track (2010),documentary,one year in the life of albin and his family ...
4,Meu Amigo Hindu (2015),drama,"his father has died, he hasn't spoken with hi..."
5,Er nu zhai (1955),drama,before he was known internationally as a mart...


In [6]:
# Import necessary libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string

In [7]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Kunjal
[nltk_data]     Thorat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Kunjal
[nltk_data]     Thorat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
# Text Preprocessing
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove punctuation and stop words
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    
    # Apply stemming
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    
    return ' '.join(stemmed_tokens)

# Apply text preprocessing to the 'Description' column
train_df['Description'] = train_df['Description'].apply(preprocess_text)
test_df['Description'] = test_df['Description'].apply(preprocess_text)

# Verify the changes
print("\nTraining Data - After Text Preprocessing:")
train_df.head()


Training Data - After Text Preprocessing:


Unnamed: 0,Title,Genre,Description
1,Oscar et la dame rose (2009),drama,listen convers doctor parent oscar learn nobod...
2,Cupid (1997),thriller,brother sister past incestu relationship curre...
3,"Young, Wild and Wonderful (1980)",adult,bu empti student field trip museum natur histo...
4,The Secret Sin (1915),drama,help unemploy father make end meet edith twin ...
5,The Unrecovered (2007),drama,film titl refer bodi ground zero also state na...


In [25]:
print("\nTest Data - After Text Preprocessing:")
test_df.head()


Test Data - After Text Preprocessing:


Unnamed: 0,Title,Genre,Description
1,Edgar's Lunch (1998),thriller,brane love life car apart job especi girlfrien...
2,La guerra de papá (1977),comedy,spain march quico naughti child three belong w...
3,Off the Beaten Track (2010),documentary,one year life albin famili shepherd north tran...
4,Meu Amigo Hindu (2015),drama,father die spoken brother year seriou cancer d...
5,Er nu zhai (1955),drama,known intern martial art superstar bruce lee a...


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Apply text preprocessing to the 'Description' column
train_df['Description'] = train_df['Description'].apply(preprocess_text)
test_df['Description'] = test_df['Description'].apply(preprocess_text)

# Feature Extraction (TF-IDF)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['Description'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['Description'])

# Verify the changes
print("\nTraining Data - After Feature Extraction (TF-IDF):")
X_train_tfidf.toarray()


Training Data - After Feature Extraction (TF-IDF):


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [27]:
print("\nTest Data - After Feature Extraction (TF-IDF):")
X_test_tfidf.toarray()


Test Data - After Feature Extraction (TF-IDF):


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

X = X_train_tfidf
y = train_df['Genre']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize and train the Naive Bayes classifier
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)

# Model Training
# The model is already trained in the previous step

# Model Evaluation
# Make predictions on the validation set
y_pred = naive_bayes.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
classification_rep = classification_report(y_val, y_pred, zero_division=0)

print("Model Evaluation (Naive Bayes):")
print(f"Accuracy: {accuracy}")
print(classification_rep)

Model Evaluation (Naive Bayes):
Accuracy: 0.5163700083002859
               precision    recall  f1-score   support

      action        0.47      0.05      0.10       263
       adult        0.64      0.08      0.14       118
   adventure        0.77      0.06      0.12       155
   animation        0.00      0.00      0.00       100
   biography        0.00      0.00      0.00        53
      comedy        0.51      0.42      0.46      1490
       crime        0.00      0.00      0.00       101
 documentary        0.56      0.88      0.68      2619
       drama        0.46      0.82      0.59      2723
      family        0.00      0.00      0.00       157
     fantasy        0.00      0.00      0.00        65
   game-show        1.00      0.05      0.10        39
     history        0.00      0.00      0.00        49
      horror        0.79      0.37      0.50       441
       music        0.67      0.05      0.10       146
     musical        0.00      0.00      0.00        55
   

In [28]:
# Predictions on the Test Data
# Make predictions on the test data using the trained model
test_predictions = naive_bayes.predict(X_test_tfidf)

# Create a DataFrame with the test predictions
test_results = pd.DataFrame({'Title': test_df['Title'], 'Predicted_Genre': test_predictions})

# Display the test predictions
print("\nTest Data Predictions:")
test_results.head()


Test Data Predictions:


Unnamed: 0,Title,Predicted_Genre
1,Edgar's Lunch (1998),drama
2,La guerra de papá (1977),drama
3,Off the Beaten Track (2010),documentary
4,Meu Amigo Hindu (2015),drama
5,Er nu zhai (1955),drama


In [29]:
# Predictions on the Test Data
# Make predictions on the test data using the trained model
test_predictions = naive_bayes.predict(X_test_tfidf)

# Example plot summary
example_plot_summary = [{'description': "Whether it's blocking up mouse holes, running from Landlords or making puppet shows in the bath, it's never a dull moment for The Young Professionals. Desperate to break into the online world and escape the terrors of temping, Natalie presents the lives of six housemates struggling to get on the career ladder after uni and pay their rent on time. Which is all helped along with Keara - the one with the 'real' job."}]

# Extract the description from the example
example_description = example_plot_summary[0]['description']

# Preprocess the description
example_description = example_description.lower()  # Convert to lowercase
example_description = preprocess_text(example_description)  # Apply text preprocessing

# Transform the example description using the TF-IDF vectorizer
example_description_tfidf = tfidf_vectorizer.transform([example_description])

# Make predictions
example_predictions = naive_bayes.predict(example_description_tfidf)

# Print the predicted genre
print("Predicted Genre:", example_predictions[0])

Predicted Genre:  comedy 


In [30]:
# New example plot summary
new_example_plot_summary = [{'description': 'A great leader was killed at the end of the 20th century. His name was Fame Douglas, and he was renowned as the sponsor of the legendary Dead or Alive World Combat Championship. Since his death, and in the absence of his charisma and leadership, the world has become chaotic! Yet something appears to be transpiring.'}]

# Extract the description from the new example
new_example_description = new_example_plot_summary[0]['description']

# Preprocess the description
new_example_description = new_example_description.lower()  # Convert to lowercase
new_example_description = preprocess_text(new_example_description)  # Apply text preprocessing

# Transform the new example description using the TF-IDF vectorizer
new_example_description_tfidf = tfidf_vectorizer.transform([new_example_description])

# Make predictions
new_example_predictions = naive_bayes.predict(new_example_description_tfidf)

# Print the predicted genre
print("Predicted Genre:", new_example_predictions[0])

Predicted Genre:  documentary 
