In [12]:
# download necessary NLTK data
import nltk
nltk.download(['punkt', 'wordnet'])

# import libraries
import nltk
import numpy as np
import pandas as pd
import pickle
import re
import sqlalchemy as sa
import sys

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

[nltk_data] Downloading package punkt to /home/draylson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/draylson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Load the genre-labelled descriptions from the local SQLite database.
engine = sa.create_engine('sqlite:///../data/IMDB.db')
try:
    df = pd.read_sql_table('GenreByDescription', engine)
finally:
    # Release the engine's connections even when the read fails, so re-running
    # this cell after an error does not leave handles open on the database file.
    engine.dispose()
df.head()

Unnamed: 0,description,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,...,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
0,A documentary that focuses on a dangerously le...,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Following five years in the life and career of...,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,Dr. Steven Greer examines details surrounding ...,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Two brothers owe money to the wrong guy at the...,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Blind climber Jesse Dufton&apos;s ascent of th...,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Model input is the description text; targets are every column after the first.
X = df['description'].to_numpy()
Y = df.iloc[:, 1:].to_numpy()
# Column labels of the target matrix, in the same order as Y's columns.
category_names = df.columns[1:].tolist()

In [6]:
# Inspect the description texts extracted from df (NumPy abbreviates long arrays).
X

array(['A documentary that focuses on a dangerously legendary water park and its slew of injuries and crimes along with child safety concerns.',
       'Following five years in the life and career of an independent filmmaker, supported by dozens of interviews, posing one question: how does an indie filmmaker survive in the current film business?',
       'Dr. Steven Greer examines details surrounding his alien-visitation &quot;disclosure&quot; movement.',
       ...,
       'Confronted by France&apos;s conservative surrogacy laws, a gay couple, travel to Las Vegas to start a family through surrogacy. This film shows the lengths many gay couples go to have children, examining the controversial surrogacy ind...',
       'On the first day of freedom, a grizzled ex-con must reconnect with his troubled son before his violent past catches up with them.',
       'Two lost souls find each other in desperate and harsh circumstances.'],
      dtype=object)

In [7]:
# Inspect the target matrix built from every df column after the first.
Y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [8]:
# List the target column names; their order matches the columns of Y.
category_names

['Action',
 'Adult',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Short',
 'Sport',
 'Thriller',
 'War',
 'Western']

In [13]:
# Hold out 20% of the samples for evaluation. The fixed seed makes the split,
# and therefore every downstream metric, reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [15]:
def tokenize(text):
    '''
    Extract normalized, lemmatized tokens from a text.

    HTML entities are decoded before tokenizing, because the descriptions
    contain escaped characters such as &apos; and &quot; that would otherwise
    be split into junk tokens ('&', 'apos', ';').

    Each token is lower-cased and stripped BEFORE it is lemmatized, so the
    lemmatizer always sees the lower-case form and the result does not depend
    on whether the caller already lower-cased the text.
    NOTE(review): the WordNet lemma lookup appears to be case-sensitive,
    which is why the order matters - confirm against the NLTK docs.

    INPUT:
    text - Text from which tokens will be extracted
    OUTPUT:
    clean_tokens - List of resulting tokens (empty list for empty text).
    '''
    import html  # stdlib; imported locally so this cell stands on its own

    lemmatizer = WordNetLemmatizer()
    clean_tokens = [
        lemmatizer.lemmatize(tok.lower().strip())
        for tok in word_tokenize(html.unescape(text))
    ]
    return clean_tokens

In [16]:
# Text-classification pipeline: token counts -> TF-IDF weights -> one random
# forest per target column (MultiOutputClassifier fits one estimator per output).
pipeline = Pipeline([
    # token_pattern=None avoids the "token_pattern will not be used" warning
    # that CountVectorizer raises when a custom tokenizer is supplied.
    ('vect', CountVectorizer(tokenizer=tokenize, token_pattern=None)),
    ('tfidf', TfidfTransformer()),
    # Fixed seed so repeated runs of the search are comparable.
    ('clf', MultiOutputClassifier(RandomForestClassifier(random_state=42)))
])
parameters = {
    'clf__estimator__n_estimators': [50, 100, 200],
    # 'auto' was merely an alias of 'sqrt' for classifiers (so the old grid
    # fitted the same configuration twice) and is rejected by recent
    # scikit-learn releases; 'log2' gives a genuinely different candidate.
    'clf__estimator__max_features': ['sqrt', 'log2'],
    'clf__estimator__criterion': ['entropy', 'gini']
}
# n_jobs=-1 spreads the candidate/fold fits over all available cores; the
# search is otherwise serial and very slow on this grid.
model = GridSearchCV(pipeline, param_grid=parameters, n_jobs=-1)

In [17]:
# Run the grid search on the training split: the pipeline is fitted for each
# parameter combination in `parameters` under cross-validation (slow).
model.fit(X_train, Y_train)

KeyboardInterrupt: 

In [None]:
# Score the fitted model on the held-out split
Y_pred = model.predict(X_test)

# Walk the prediction matrix one output column at a time and report on each
for idx, column_pred in enumerate(Y_pred.T):
    column_true = Y_test[:, idx]

    # Precision/recall/F1 summary for this single target column
    report = classification_report(column_true, column_pred)

    # Label the report with the matching category name
    print(f"Classification report for {category_names[idx]}:")
    print(report)
    print()