In [2]:
import ast
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [3]:
# There are multiple genres per movie
df = pd.read_csv("dataset/movies_metadata.csv",usecols=['genres', 'overview'])
# Convert genres string type to list
df['genres'] = df['genres'].apply(lambda x: ast.literal_eval(x)).apply(lambda x: [d['name'] for d in x])
df.head()

Unnamed: 0,genres,overview
0,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ..."
1,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...
2,"[Romance, Comedy]",A family wedding reignites the ancient feud be...
3,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom..."
4,[Comedy],Just when George Banks has recovered from his ...


In [4]:
# Some movies have no overview, remove them
df['overview'].isnull().sum()

954

In [5]:
# Some movies have no labels, remove them
df['genres'].apply(lambda x: len(x)).describe()

count    45466.000000
mean         2.003827
std          1.131100
min          0.000000
25%          1.000000
50%          2.000000
75%          3.000000
max          8.000000
Name: genres, dtype: float64

In [6]:
# Analyze labels
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(df['genres'])
classes = mlb.classes_
label_df = pd.DataFrame(labels, columns=classes)

# Some labels appear only once and are clearly not movie genres, remove them
labels_count = label_df.sum().sort_values(ascending=False)
labels_count

Drama                                    20265
Comedy                                   13182
Thriller                                  7624
Romance                                   6735
Action                                    6596
Horror                                    4673
Crime                                     4307
Documentary                               3932
Adventure                                 3496
Science Fiction                           3049
Family                                    2770
Mystery                                   2467
Fantasy                                   2313
Animation                                 1935
Foreign                                   1622
Music                                     1598
History                                   1398
War                                       1323
Western                                   1042
TV Movie                                   767
Odyssey Media                                1
Pulser Produc

In [7]:
filtered_labels = labels_count[labels_count != 1].index.to_list()
filtered_labels

['Drama',
 'Comedy',
 'Thriller',
 'Romance',
 'Action',
 'Horror',
 'Crime',
 'Documentary',
 'Adventure',
 'Science Fiction',
 'Family',
 'Mystery',
 'Fantasy',
 'Animation',
 'Foreign',
 'Music',
 'History',
 'War',
 'Western',
 'TV Movie']

In [8]:
mlb = MultiLabelBinarizer(classes=filtered_labels)
labels = mlb.fit_transform(df['genres'])
classes = mlb.classes_



In [9]:
# drop entries with no labels, entries with labels that we chose to discard and entries with no overview
binarized_df = pd.DataFrame(labels, columns=classes)
binarized_df['overview'] = df['overview']
binarized_df = binarized_df[binarized_df.loc[:, binarized_df.columns != 'overview'].sum(axis=1) != 0].dropna(subset = ['overview']).reset_index(drop=True)
binarized_df = binarized_df.dropna(subset = ['overview']).reset_index(drop=True)

X, y = binarized_df['overview'], binarized_df.loc[:, binarized_df.columns != 'overview']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=3)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features = 1000, stop_words = 'english', lowercase = True)
X_train_vec = vectorizer.fit_transform(X_train)
X_valid_vec = vectorizer.transform(X_valid)

In [11]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train_vec, y_train)

pred_train, pred_val = model.predict(X_train_vec), model.predict(X_valid_vec)

from sklearn.metrics import classification_report, accuracy_score
print("Classification Report")
print("Training")
print(classification_report(y_true = y_train, y_pred = pred_train, target_names = classes))
print("Accuracy ", accuracy_score(y_true = y_train, y_pred = pred_train))
print()
print("Validation")
print(classification_report(y_true = y_valid, y_pred = pred_val, target_names = classes))
print("Accuracy ", accuracy_score(y_true = y_valid, y_pred = pred_val))

Classification Report
Training
                 precision    recall  f1-score   support

          Drama       1.00      0.99      1.00     14014
         Comedy       0.99      1.00      1.00      8955
       Thriller       1.00      1.00      1.00      5338
        Romance       1.00      1.00      1.00      4640
         Action       1.00      1.00      1.00      4687
         Horror       1.00      1.00      1.00      3242
          Crime       1.00      1.00      1.00      3031
    Documentary       1.00      1.00      1.00      2732
      Adventure       1.00      1.00      1.00      2472
Science Fiction       1.00      1.00      1.00      2105
         Family       1.00      0.99      1.00      1893
        Mystery       1.00      1.00      1.00      1719
        Fantasy       1.00      0.99      1.00      1577
      Animation       1.00      0.99      0.99      1300
        Foreign       1.00      0.99      0.99      1125
          Music       1.00      0.99      1.00      1075

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_train_vec, y_train)

pred_train, pred_val = model.predict(X_train_vec), model.predict(X_valid_vec)

from sklearn.metrics import classification_report, accuracy_score
print("Classification Report")
print("Training")
print(classification_report(y_true = y_train, y_pred = pred_train, target_names = classes))
print("Accuracy ", accuracy_score(y_true = y_train, y_pred = pred_train))
print()
print("Validation")
print(classification_report(y_true = y_valid, y_pred = pred_val, target_names = classes))
print("Accuracy ", accuracy_score(y_true = y_valid, y_pred = pred_val))

Classification Report
Training
                 precision    recall  f1-score   support

          Drama       1.00      1.00      1.00     14014
         Comedy       0.99      1.00      1.00      8955
       Thriller       1.00      1.00      1.00      5338
        Romance       1.00      0.99      1.00      4640
         Action       1.00      1.00      1.00      4687
         Horror       1.00      1.00      1.00      3242
          Crime       1.00      1.00      1.00      3031
    Documentary       1.00      1.00      1.00      2732
      Adventure       1.00      1.00      1.00      2472
Science Fiction       1.00      1.00      1.00      2105
         Family       1.00      0.99      0.99      1893
        Mystery       1.00      1.00      1.00      1719
        Fantasy       1.00      0.99      1.00      1577
      Animation       1.00      0.99      0.99      1300
        Foreign       1.00      0.98      0.99      1125
          Music       1.00      0.99      1.00      1075

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(verbose = True, 
                      early_stopping=True,
                      validation_fraction=0.3,
                      max_iter = 150, 
                      hidden_layer_sizes=(100))
model.fit(X_train_vec, y_train)

pred_train, pred_val = model.predict(X_train_vec), model.predict(X_valid_vec)

from sklearn.metrics import classification_report, accuracy_score
print("Classification Report")
print("Training")
print(classification_report(y_true = y_train, y_pred = pred_train, target_names = classes))
print("Accuracy ", accuracy_score(y_true = y_train, y_pred = pred_train))
print()
print("Validation")
print(classification_report(y_true = y_valid, y_pred = pred_val, target_names = classes))
print("Accuracy ", accuracy_score(y_true = y_valid, y_pred = pred_val))

Iteration 1, loss = 9.31938872
Validation score: 0.019689
Iteration 2, loss = 5.81794817
Validation score: 0.085284
Iteration 3, loss = 5.43152859
Validation score: 0.089671
Iteration 4, loss = 5.04831381
Validation score: 0.118587
Iteration 5, loss = 4.76907115
Validation score: 0.141427
Iteration 6, loss = 4.58432986
Validation score: 0.153578
Iteration 7, loss = 4.45083366
Validation score: 0.162466
Iteration 8, loss = 4.35046803
Validation score: 0.165729
Iteration 9, loss = 4.27471528
Validation score: 0.168317
Iteration 10, loss = 4.21601015
Validation score: 0.170680
Iteration 11, loss = 4.16871617
Validation score: 0.171242
Iteration 12, loss = 4.12843861
Validation score: 0.171692
Iteration 13, loss = 4.09348575
Validation score: 0.172255
Iteration 14, loss = 4.06335705
Validation score: 0.173830
Iteration 15, loss = 4.03589272
Validation score: 0.174392
Iteration 16, loss = 4.01059094
Validation score: 0.174280
Iteration 17, loss = 3.98746658
Validation score: 0.173717
Iterat

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
vect=Tokenizer()
vect.fit_on_texts(X_train)
vocab_size = len(vect.word_index) + 1
print(vocab_size)

69584


In [15]:
max_len = X_train.apply(lambda x: len(x.split())).max()

In [16]:
encoded_docs_train = vect.texts_to_sequences(X_train)
max_length = vocab_size
padded_docs_train = pad_sequences(encoded_docs_train, maxlen=max_len, padding='post')
print(padded_docs_train)

[[16253     2    41 ...     0     0     0]
 [    2  9166   868 ...     0     0     0]
 [ 2557    67  4529 ...     0     0     0]
 ...
 [   17     1  1445 ...     0     0     0]
 [12652  2593  9866 ...     0     0     0]
 [    2  1889     5 ...     0     0     0]]


In [20]:
padded_docs_train.max()

69583

In [19]:
X_train

17984    NELU, a man in his forties, works as a securit...
25661    A sociopathic stranger all but destroys a smal...
6505     Julia finds 300 million pesetas hidden in a de...
18271    1981, Morocco. A village in the Atlas mountain...
20623    After losing contact with Earth, Astronaut Lee...
                               ...                        
39168    As a valiant Chinese general and his men battl...
25544    Four young outsiders teleport to a dangerous u...
11513    As the kingdom of Devigarh comes apart at the ...
1688     Dimwitted, somewhat misanthropic Oslo mail car...
5994     A collection of magical tales based upon the a...
Name: overview, Length: 29626, dtype: object

In [17]:
encoded_docs_test =  vect.texts_to_sequences(X_valid)
padded_docs_test = pad_sequences(encoded_docs_test, maxlen=max_len, padding='post')
encoded_docs_cv = vect.texts_to_sequences(X_valid)
padded_docs_cv = pad_sequences(encoded_docs_cv, maxlen=max_len, padding='post')

In [357]:
from tensorflow.keras.layers import LSTM

model = Sequential()
# Configuring the parameters
model.add(Embedding(vocab_size, output_dim=50, input_length=max_len))
model.add(LSTM(128, return_sequences=True))  
# Adding a dropout layer
model.add(Dropout(0.5))
model.add(LSTM(64))
model.add(Dropout(0.5))
# Adding a dense output layer with sigmoid activation
model.add(Dense(len(classes), activation='sigmoid'))
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 187, 50)           3479200   
                                                                 
 lstm_6 (LSTM)               (None, 187, 128)          91648     
                                                                 
 dropout_7 (Dropout)         (None, 187, 128)          0         
                                                                 
 lstm_7 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_8 (Dropout)         (None, 64)                0         
                                                                 
 dense_4 (Dense)             (None, 20)                1300      
                                                                 
Total params: 3621556 (13.82 MB)
Trainable params: 362

In [358]:
model.compile(optimizer='adam', loss='binary_crossentropy')
history = model.fit(padded_docs_train, y_train,
                    epochs=5,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=[])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [362]:
predictions = model.predict([padded_docs_test])



In [364]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

thresholds=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for val in thresholds:
    pred=predictions.copy()
  
    pred[pred>=val]=1
    pred[pred<val]=0

    accuracy = accuracy_score(y_valid, pred)
    precision = precision_score(y_valid, pred, average='micro')
    recall = recall_score(y_valid, pred, average='micro')
    f1 = f1_score(y_valid, pred, average='micro')
   
    # print("Micro-average quality numbers")
    print("Threshold: {:.4f}, Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(val, accuracy, precision, recall, f1))

Threshold: 0.1000, Accuracy: 0.0000, Precision: 0.2289, Recall: 0.6469, F1-measure: 0.3381
Threshold: 0.2000, Accuracy: 0.0365, Precision: 0.3883, Recall: 0.3658, F1-measure: 0.3767
Threshold: 0.3000, Accuracy: 0.0365, Precision: 0.3883, Recall: 0.3658, F1-measure: 0.3767
Threshold: 0.4000, Accuracy: 0.1179, Precision: 0.4732, Recall: 0.2229, F1-measure: 0.3031


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Threshold: 0.5000, Accuracy: 0.0000, Precision: 0.0000, Recall: 0.0000, F1-measure: 0.0000
Threshold: 0.6000, Accuracy: 0.0000, Precision: 0.0000, Recall: 0.0000, F1-measure: 0.0000
Threshold: 0.7000, Accuracy: 0.0000, Precision: 0.0000, Recall: 0.0000, F1-measure: 0.0000
Threshold: 0.8000, Accuracy: 0.0000, Precision: 0.0000, Recall: 0.0000, F1-measure: 0.0000
Threshold: 0.9000, Accuracy: 0.0000, Precision: 0.0000, Recall: 0.0000, F1-measure: 0.0000


  _warn_prf(average, modifier, msg_start, len(result))


In [403]:
from tensorflow.keras.layers import Flatten

model = Sequential()
model.add(Embedding(vocab_size, output_dim=50, input_length=max_len))
model.add(Conv1D(64, 3, activation='sigmoid'))
# model.add(Conv1D(100, 3, activation='sigmoid'))
# model.add(Conv1D(100, 3, activation='sigmoid'))
# model.add(Dropout(0.70))
model.add(Conv1D(48, 3, activation='sigmoid'))
model.add(Flatten())
model.add(Dense(len(classes), activation='sigmoid'))

model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy')

Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_13 (Embedding)    (None, 187, 50)           3479200   
                                                                 
 conv1d_29 (Conv1D)          (None, 185, 64)           9664      
                                                                 
 conv1d_30 (Conv1D)          (None, 183, 48)           9264      
                                                                 
 flatten_6 (Flatten)         (None, 8784)              0         
                                                                 
 dense_11 (Dense)            (None, 20)                175700    
                                                                 
Total params: 3673828 (14.01 MB)
Trainable params: 3673828 (14.01 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [404]:
history = model.fit(padded_docs_train, y_train,
                        epochs=3,
                        validation_data=(padded_docs_test, y_valid),
                        batch_size=16)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [405]:
predictions = model.predict([padded_docs_test])



In [406]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

thresholds=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for val in thresholds:
    pred=predictions.copy()
  
    pred[pred>=val]=1
    pred[pred<val]=0

    accuracy = accuracy_score(y_valid, pred)
    precision = precision_score(y_valid, pred, average='micro')
    recall = recall_score(y_valid, pred, average='micro')
    f1 = f1_score(y_valid, pred, average='micro')
   
    # print("Micro-average quality numbers")
    print("Threshold: {:.4f}, Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(val, accuracy, precision, recall, f1))

Threshold: 0.1000, Accuracy: 0.0166, Precision: 0.2782, Recall: 0.6722, F1-measure: 0.3935
Threshold: 0.2000, Accuracy: 0.0343, Precision: 0.3705, Recall: 0.5043, F1-measure: 0.4272
Threshold: 0.3000, Accuracy: 0.0662, Precision: 0.4341, Recall: 0.3795, F1-measure: 0.4050
Threshold: 0.4000, Accuracy: 0.0854, Precision: 0.4903, Recall: 0.2745, F1-measure: 0.3520
Threshold: 0.5000, Accuracy: 0.0777, Precision: 0.5388, Recall: 0.1844, F1-measure: 0.2747
Threshold: 0.6000, Accuracy: 0.0553, Precision: 0.5772, Recall: 0.1106, F1-measure: 0.1857
Threshold: 0.7000, Accuracy: 0.0295, Precision: 0.6157, Recall: 0.0589, F1-measure: 0.1075
Threshold: 0.8000, Accuracy: 0.0108, Precision: 0.6355, Recall: 0.0222, F1-measure: 0.0429
Threshold: 0.9000, Accuracy: 0.0024, Precision: 0.6325, Recall: 0.0039, F1-measure: 0.0077


6.0

In [407]:
pd.read_csv("https://storage.googleapis.com/kaggle-data-sets/3405/6663/compressed/movies_metadata.csv.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20230602%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20230602T222944Z&X-Goog-Expires=259200&X-Goo")

HTTPError: HTTP Error 400: Bad Request