In [205]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [206]:
train_path = r"C:\Users\gowth\OneDrive\Desktop\ProgrammingProjects\CodSoft\movie_classification\Genre Classification Dataset\train_data.txt"
train_data = pd.read_csv(train_path, sep=':::', names=['Title', 'Genre', 'Description'], engine ='python')

In [207]:
print(train_data.describe())

                                               Title    Genre  \
count                                          54214    54214   
unique                                         54214       27   
top      Nature's Fury: Storm of the Century (2006)    drama    
freq                                               1    13613   

                                              Description  
count                                               54214  
unique                                              54086  
top      Grammy - music award of the American academy ...  
freq                                                   12  


In [208]:
print(train_data.info())

<class 'pandas.core.frame.DataFrame'>
Index: 54214 entries, 1 to 54214
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        54214 non-null  object
 1   Genre        54214 non-null  object
 2   Description  54214 non-null  object
dtypes: object(3)
memory usage: 1.7+ MB
None


In [209]:
test_path = r"C:\Users\gowth\OneDrive\Desktop\ProgrammingProjects\CodSoft\movie_classification\Genre Classification Dataset\test_data.txt"
test_data = pd.read_csv(test_path, sep=':::', names=['Id','Title','Description'], engine='python')
test_data.head()

Unnamed: 0,Id,Title,Description
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),Before he was known internationally as a mart...


In [210]:
stemmer = LancasterStemmer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'@\S+', '', text)  # Remove @mentions
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'pic.\S+', '', text)  # Remove pic links
    text = re.sub(r"[^a-zA-Z\s]", '', text)  # Keep only letters and spaces
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)  # Remove single characters
    text = "".join([i for i in text if i not in string.punctuation])  # Remove punctuation
    words = nltk.word_tokenize(text)  # Tokenize the text
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    words = [stemmer.stem(word) for word in words]  # Stem the words
    text = " ".join(words)  # Join the words back into a single string
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra whitespace
    return text

train_data['Text_cleaning'] = train_data['Description'].apply(clean_text)
test_data['Text_cleaning'] = test_data['Description'].apply(clean_text)


In [211]:
print(len(train_data['Text_cleaning']))
print(len(train_data['Genre']))

54214
54214


In [212]:
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(train_data['Text_cleaning'])
X_test = tfidf_vectorizer.transform(test_data['Text_cleaning'])

In [213]:
print(train_data['Text_cleaning'])

1        list convers doct par yearold osc learn nobody...
2        broth sist past incestu rel cur murd rel murd ...
3        bus empty stud field trip muse nat hist littl ...
4        help unemploy fath mak end meet edi twin sist ...
5        film titl ref unrecov body ground zero also st...
                               ...                        
54210    shortl nbc liv sitcom cent bonino worldfam con...
54211    next gen exploit sist kap bay sor hous mystery...
54212    ze besta echt standup comedy grow fac fear fre...
54213    walt viv liv country difficult tim keep serv w...
54214    lab day weekend intens hur ev mak landfal amer...
Name: Text_cleaning, Length: 54214, dtype: object


In [214]:
y = train_data['Genre']
X_train, X_val, y_train, y_val = train_test_split(X_train, y, test_size=0.8, random_state=42)

classifier = MultinomialNB()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.430784838144425


In [215]:
X_test_predict = classifier.predict(X_test)
test_data['Predicted_Genre'] = X_test_predict

In [216]:
test_data.to_csv('predicted_genres.csv', index = False)
print(test_data)

          Id                             Title  \
0          1             Edgar's Lunch (1998)    
1          2         La guerra de papá (1977)    
2          3      Off the Beaten Track (2010)    
3          4           Meu Amigo Hindu (2015)    
4          5                Er nu zhai (1955)    
...      ...                               ...   
54195  54196   "Tales of Light & Dark" (2013)    
54196  54197      Der letzte Mohikaner (1965)    
54197  54198              Oliver Twink (2007)    
54198  54199                Slipstream (1973)    
54199  54200        Curitiba Zero Grau (2010)    

                                             Description  \
0       L.R. Brane loves his life - his car, his apar...   
1       Spain, March 1964: Quico is a very naughty ch...   
2       One year in the life of Albin and his family ...   
3       His father has died, he hasn't spoken with hi...   
4       Before he was known internationally as a mart...   
...                                    

In [219]:
def classify_genre(description):
    cleaned_description = clean_text(description)
    X = tfidf_vectorizer.transform([cleaned_description])
    predicted_genre = classifier.predict(X)
    return predicted_genre


In [220]:
description = input("Enter the description of your movie")
predicted_genre = classify_genre(description)
print(f"Predicted Genre: {predicted_genre}")


Enter the description of your movie  A brother and sister with a past incestuous relationship have a current murderous relationship. He murders the women who reject him and she murders the women who get too close to him.


Predicted Genre: [' drama ']
