In [1]:
!pip install pandas numpy scikit-learn matplotlib




In [2]:
# Ask for a file through Colab's upload helper. The recorded output of this
# cell shows `archive.zip` being saved into the working directory.
from google.colab import files
# `uploaded` keeps whatever files.upload() returns; none of the visible later
# cells read it (they open 'archive.zip' from disk instead).
uploaded = files.upload()


Saving archive.zip to archive.zip


In [5]:
import os
import zipfile

# Unpack every member of the uploaded archive into the current working
# directory (the same target extractall() falls back to when given no path).
with zipfile.ZipFile('archive.zip') as zip_ref:
    zip_ref.extractall(path=os.getcwd())


In [8]:
import os

# Show what the working directory contains after extraction.
print(os.listdir('.'))


['.config', 'Genre Classification Dataset', 'archive.zip', 'sample_data']


In [9]:
import os

# List files in the 'Genre Classification Dataset' directory
dataset_dir = 'Genre Classification Dataset'
dataset_files = os.listdir(dataset_dir)
print(dataset_files)


['test_data.txt', 'test_data_solution.txt', 'description.txt', 'train_data.txt']


**Reading and Inspecting the Files**

In [10]:
# Function to read and display the first few lines of a file
def read_file(file_path, num_lines=5):
    """Print up to ``num_lines`` leading lines of a text file.

    Args:
        file_path: Path of the file to preview.
        num_lines: Maximum number of lines to print (default 5).

    Each line is printed with surrounding whitespace stripped. A file shorter
    than ``num_lines`` is printed in full and nothing is printed past its end.
    """
    # Decode explicitly as UTF-8 so the preview does not depend on the
    # platform's default text encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        # zip() stops at whichever runs out first: the line budget or the file,
        # so short files no longer produce trailing blank lines.
        for _, line in zip(range(num_lines), file):
            print(line.strip())

# Display the first few lines of each file
previews = [
    ("Train Data:", 'Genre Classification Dataset/train_data.txt'),
    ("\nTest Data:", 'Genre Classification Dataset/test_data.txt'),
    ("\nTest Data Solution:", 'Genre Classification Dataset/test_data_solution.txt'),
    ("\nDescription:", 'Genre Classification Dataset/description.txt'),
]

# Print each heading followed by the preview of its file, in the order above.
for heading, path in previews:
    print(heading)
    read_file(path)


Train Data:
1 ::: Oscar et la dame rose (2009) ::: drama ::: Listening in to a conversation between his doctor and parents, 10-year-old Oscar learns what nobody has the courage to tell him. He only has a few weeks to live. Furious, he refuses to speak to anyone except straight-talking Rose, the lady in pink he meets on the hospital stairs. As Christmas approaches, Rose uses her fantastical experiences as a professional wrestler, her imagination, wit and charm to allow Oscar to live life and love to the full, in the company of his friends Pop Corn, Einstein, Bacon and childhood sweetheart Peggy Blue.
2 ::: Cupid (1997) ::: thriller ::: A brother and sister with a past incestuous relationship have a current murderous relationship. He murders the women who reject him and she murders the women who get too close to him.
3 ::: Young, Wild and Wonderful (1980) ::: adult ::: As the bus empties the students for their field trip to the Museum of Natural History, little does the tour guide suspec

**Loading the Data**

In [23]:
import pandas as pd

# Read the train data.
# Each record is one line of the form "id ::: title ::: genre ::: plot".
train_data = []
with open('Genre Classification Dataset/train_data.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # Split on the first three delimiters only, so a plot that itself
        # contains ' ::: ' stays in one piece instead of yielding extra fields
        # and being discarded by the length check below.
        parts = line.strip().split(' ::: ', 3)
        # Lines with fewer than four fields (e.g. blank lines) are skipped.
        if len(parts) == 4:
            train_data.append(parts)

# Convert to DataFrame (all four columns hold strings, including 'id')
df_train = pd.DataFrame(train_data, columns=['id', 'title', 'genre', 'plot'])

# Display the first few rows
df_train.head()


Unnamed: 0,id,title,genre,plot
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Horizontal bar chart of how many training records each genre has.
# The data comes from `df_train` rather than `_df_27`, which no visible cell
# defines, and is grouped by 'genre' because grouping by the free-text 'plot'
# column yields one bar per distinct description.
fig, ax = plt.subplots(figsize=(8, 8))
df_train.groupby('genre').size().plot(
    kind='barh', color=sns.palettes.mpl_palette('Dark2'), ax=ax
)
ax.set(title='Training records per genre', xlabel='Number of records', ylabel='Genre')
# Drop the top and right frame lines to reduce visual clutter.
ax.spines[['top', 'right']].set_visible(False)

**Preprocessing the Data**

In [14]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def preprocess_text(text):
    """Normalise a plot description before vectorisation.

    Steps, in order: lowercase the text, delete digit runs, collapse each
    whitespace run to a single space, delete every character that is neither
    a word character nor whitespace, then drop tokens found in
    ENGLISH_STOP_WORDS. The surviving tokens are returned joined by single
    spaces.
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', without_digits)
    without_punct = re.sub(r'[^\w\s]', '', single_spaced)
    # split() with no argument also discards any leftover empty tokens.
    kept_words = filter(
        lambda word: word not in ENGLISH_STOP_WORDS,
        without_punct.split(),
    )
    return ' '.join(kept_words)

# Clean every plot description, overwriting the 'plot' column with the result.
df_train['plot'] = df_train['plot'].map(preprocess_text)


**Feature Extraction**

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn each preprocessed plot into a TF-IDF vector, with the vocabulary capped
# at 5000 terms.
tfidf = TfidfVectorizer(max_features=5000)
# Keep X as the sparse matrix that fit_transform returns. A dense copy would
# store rows x 5000 floats that are almost all zero, and the later
# train_test_split / LogisticRegression steps accept sparse input directly.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split that follows, so held-out rows influence the vocabulary and IDF weights.
X = tfidf.fit_transform(df_train['plot'])


**Encoding Labels**

In [16]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of genre names first, then map every row's genre to its
# integer code with the fitted encoder.
genres = df_train['genre']
le = LabelEncoder().fit(genres)
y = le.transform(genres)


**Train-Test Split**

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


**Training the Model**

In [18]:
from sklearn.linear_model import LogisticRegression

# Build the classifier (iteration cap raised to 1000) and fit it on the
# training split in one expression; fit() hands back the estimator itself.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
# Leave the fitted estimator as the cell's displayed value.
model


**Evaluating the Model**

In [19]:
from sklearn.metrics import classification_report, accuracy_score

# Score the fitted model on the held-out split.
y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
# Some genres are never predicted, which makes their precision undefined.
# zero_division=0 reports those entries as 0.0 explicitly instead of emitting
# an undefined-metric warning for each averaging mode.
print(classification_report(y_test, y_pred, target_names=le.classes_, zero_division=0))


Accuracy: 0.5758553905745643
              precision    recall  f1-score   support

      action       0.54      0.27      0.36       263
       adult       0.73      0.21      0.33       112
   adventure       0.44      0.14      0.21       139
   animation       0.67      0.10      0.17       104
   biography       0.00      0.00      0.00        61
      comedy       0.51      0.58      0.54      1443
       crime       0.33      0.02      0.04       107
 documentary       0.66      0.84      0.74      2659
       drama       0.54      0.77      0.63      2697
      family       0.41      0.08      0.13       150
     fantasy       0.00      0.00      0.00        74
   game-show       0.95      0.47      0.63        40
     history       0.00      0.00      0.00        45
      horror       0.62      0.56      0.59       431
       music       0.66      0.48      0.56       144
     musical       0.50      0.02      0.04        50
     mystery       0.00      0.00      0.00        5

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Saving the Model**

In [20]:
import joblib

# Persist the three fitted objects needed for inference, one file each,
# written in the order listed.
artifacts = {
    'genre_classification_model.pkl': model,
    'tfidf_vectorizer.pkl': tfidf,
    'label_encoder.pkl': le,
}
saved_paths = [joblib.dump(obj, filename) for filename, obj in artifacts.items()]
# Show the return value of the final dump as the cell's output.
saved_paths[-1]


['label_encoder.pkl']

**Loading and Predicting**

In [21]:
# Load the model and vectorizer (and the label encoder) back from disk,
# in the same order they are unpacked.
artifact_files = (
    'genre_classification_model.pkl',
    'tfidf_vectorizer.pkl',
    'label_encoder.pkl',
)
model, tfidf, le = map(joblib.load, artifact_files)

# Example prediction
new_plot = ["A young wizard discovers his magical heritage and attends a magical school."]
# Apply the same cleaning used on the training plots before vectorising.
new_plot_preprocessed = list(map(preprocess_text, new_plot))
new_plot_tfidf = tfidf.transform(new_plot_preprocessed).toarray()
predicted_genre = model.predict(new_plot_tfidf)
# Map the predicted integer code back to its genre name.
predicted_genre_label = le.inverse_transform(predicted_genre)

print(predicted_genre_label)


['short']
