### ***Movie Genre Classification Using Machine Learning Models***

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load and preprocess the training dataset
train_path = "O:/virtual_intern/Codsoft-Machine Learning/Movie Gerne Prediction/Genre Classification Dataset/train_data.txt"

train_df = pd.read_csv(train_path, sep=':::', names=['Title', 'Genre', 'Description'], engine='python')

train_df.shape

(54214, 3)

In [3]:
train_df

Unnamed: 0,Title,Genre,Description
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...
...,...,...,...
54210,"""Bonino"" (1953)",comedy,This short-lived NBC live sitcom centered on ...
54211,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION. The sist...
54212,Ronald Goedemondt: Ze bestaan echt (2008),documentary,"Ze bestaan echt, is a stand-up comedy about g..."
54213,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and hav...


In [4]:
# Combine title and description into a single text field
train_df['text'] = train_df['Title'] + ' ' + train_df['Description']

In [5]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(train_df['text'], train_df['Genre'], test_size=0.2, random_state=42)

print(X_train.shape,y_train.shape,X_test.shape)

(43371,) (43371,) (10843,)


In [6]:
# Feature extraction using TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)  # Limiting features for better performance
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.fit_transform(X_test)

# Encode target labels
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

In [8]:
# Define classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    'Support Vector Machine': SVC(kernel='linear')
}


In [9]:
# Train and evaluate each classifier
best_clf = None
best_accuracy = 0

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"=== {name} ===")
    print(f"Accuracy: {accuracy*100:.2f}%")
    print("Classification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))
    print()
    
    if (accuracy > best_accuracy):
        best_accuracy = accuracy
        best_clf = clf

# Print the best classifier
print(f"The best classifier is: {best_clf}")
print(f"With an accuracy of: {best_accuracy*100:.2f}%")

=== Logistic Regression ===
Accuracy: 58.48%
Classification Report:
                precision    recall  f1-score   support

      action        0.50      0.25      0.33       263
       adult        0.72      0.23      0.35       112
   adventure        0.46      0.15      0.23       139
   animation        0.64      0.09      0.15       104
   biography        0.00      0.00      0.00        61
      comedy        0.52      0.58      0.55      1443
       crime        0.43      0.03      0.05       107
 documentary        0.67      0.85      0.75      2659
       drama        0.54      0.78      0.64      2697
      family        0.40      0.08      0.13       150
     fantasy        0.00      0.00      0.00        74
   game-show        0.95      0.45      0.61        40
     history        0.00      0.00      0.00        45
      horror        0.67      0.60      0.63       431
       music        0.61      0.48      0.54       144
     musical        0.67      0.04      0.08      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


=== Support Vector Machine ===
Accuracy: 59.23%
Classification Report:
                precision    recall  f1-score   support

      action        0.45      0.35      0.39       263
       adult        0.69      0.44      0.54       112
   adventure        0.42      0.20      0.27       139
   animation        0.45      0.13      0.21       104
   biography        0.00      0.00      0.00        61
      comedy        0.52      0.61      0.56      1443
       crime        0.14      0.02      0.03       107
 documentary        0.69      0.83      0.75      2659
       drama        0.56      0.75      0.64      2697
      family        0.32      0.09      0.14       150
     fantasy        0.33      0.04      0.07        74
   game-show        0.85      0.55      0.67        40
     history        0.00      0.00      0.00        45
      horror        0.65      0.63      0.64       431
       music        0.64      0.58      0.61       144
     musical        0.71      0.10      0.18   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Best Classifier and Accuracy

The best performing classifier for movie genre classification is the Support Vector Machine (SVM) with a linear kernel.

- **Classifier:** SVC(kernel='linear')
- **Accuracy:** 59.23%

This model achieved the highest accuracy among the evaluated classifiers (Logistic Regression, Naive Bayes, and SVM) when predicting movie genres based on combined title and description text data.

The classification report summarizes the performance of the SVM classifier (with linear kernel) for predicting movie genres based on title and description.

- **Precision**: Precision indicates the proportion of correctly predicted instances among the predicted instances for each genre.
  - Genres with higher precision values (e.g., documentary, music, western) were predicted accurately when the classifier identified them.

- **Recall**: Recall represents the proportion of correctly predicted instances among the actual instances of each genre.
  - Genres with higher recall values (e.g., documentary, drama, horror) indicate that the classifier effectively captured a large portion of these genres in its predictions.

Overall, the SVM model achieved an accuracy of 59.23%, demonstrating better performance in genres such as documentary and drama. However, genres like biography, fantasy, and romance were challenging for the model to predict accurately, as indicated by lower precision and recall scores.

In [12]:
# Load new dataset for predictions
predict_path = "O:/virtual_intern/Codsoft-Machine Learning/Movie Gerne Prediction/Genre Classification Dataset/test_data.txt"
predict_df = pd.read_csv(predict_path, sep=':::', names=['Title', 'Description'], engine='python')
predict_df

Unnamed: 0,Title,Description
1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
5,Er nu zhai (1955),Before he was known internationally as a mart...
...,...,...
54196,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Da..."
54197,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their...
54198,Oliver Twink (2007),A movie 169 years in the making. Oliver Twist...
54199,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard..."


In [13]:
# Combine title and description into a single text field
predict_df['text'] = predict_df['Title'] + ' ' + predict_df['Description']

In [14]:
# Transform the new data using the same TF-IDF vectorizer
X_predict = vectorizer.fit_transform(predict_df['text'])

In [15]:
# Predict genres for the new data
predictions = best_clf.predict(X_predict)
predicted_genres = le.inverse_transform(predictions)

In [18]:
# Add predicted genres to the dataframe
predict_df['Predicted Genre'] = predicted_genres

predict_df.drop(['text'],axis=1, inplace=True)


In [19]:
predict_df

Unnamed: 0,Title,Description,Predicted Genre
1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar...",comedy
2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch...",comedy
3,Off the Beaten Track (2010),One year in the life of Albin and his family ...,short
4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi...",documentary
5,Er nu zhai (1955),Before he was known internationally as a mart...,drama
...,...,...,...
54196,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Da...",documentary
54197,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their...,drama
54198,Oliver Twink (2007),A movie 169 years in the making. Oliver Twist...,short
54199,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard...",documentary


In [20]:
predict_df.to_csv("O:/virtual_intern/Codsoft-Machine Learning/Movie Gerne Prediction/genre_prediction_on_new data.csv")