<a href="https://colab.research.google.com/github/chooj202/movie_genre_prediction/blob/master/notebooks/howard_stacking_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set Up and Load Raw Data

In [29]:
!pip install -q scikit-multilearn

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/89.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/89.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.4/89.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
import pandas as pd
import numpy as np

## Load Zipped Raw Data and Preprocessed Image Data From Google Drive

In [2]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem so the zipped dataset can be read below.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import zipfile
import io

# Archive holding the cleaned CSV and the preprocessed image array (path is relative to the
# current working directory, so the Drive mount must be reachable from it).
ZIP_FILE_PATH = "drive/MyDrive/personal_projects/movie_genre_prediction/clean_data.zip"

# Use a context manager so the archive's file handle is closed once extraction finishes,
# even if extractall() raises part-way through.
with zipfile.ZipFile(ZIP_FILE_PATH, "r") as zf:
    zf.extractall()

In [13]:
# Read the cleaned metadata, then discard the 'Unnamed: 0' column
# (presumably a row index that was written out with the CSV).
big_data_df = pd.read_csv('raw_data/clean_data/clean_big_data.csv')
big_data_df = big_data_df.drop(columns=['Unnamed: 0'])
big_data_df.head(1)

Unnamed: 0,imdb_id,genre,plot
0,tt1517268,"['Adventure', 'Comedy', 'Fantasy']",barbie suffers a crisis that leads her to ques...


In [6]:
# Load the preprocessed image array saved as .npy and show its dimensions.
IMAGE_ARRAY_PATH = 'raw_data/clean_data/clean_image_array.npy'
image_array = np.load(IMAGE_ARRAY_PATH)
image_array.shape

(23140, 256, 256, 3)

In [8]:
# Sanity check: the metadata table and the image array must have one entry per movie,
# i.e. the same number of rows along their first axis.
assert len(big_data_df) == len(image_array)

## Preprocess Genres and Split to Train, Test, Validation Datasets

In [14]:
def preprocess_genre(df: pd.DataFrame):
    """
    Turn the "genre" column from stringified lists into real lists of stripped genre names.

    Each cell such as "['Adventure', 'Comedy']" is parsed into ['Adventure', 'Comedy'] and
    surrounding whitespace is removed from every genre. Cells that are already lists (e.g.
    when this cell is re-run on an already-processed dataframe) are only stripped, so the
    function is safe to call more than once.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe with a "genre" column. The column is overwritten in place.

    Returns
    -------
    pd.DataFrame
        The same dataframe object, with "genre" holding lists of strings.

    Raises
    ------
    ValueError, SyntaxError
        If a string cell is not a valid Python literal.
    """
    import ast

    def _to_genre_list(raw):
        # literal_eval only accepts Python literals, so text coming from the CSV
        # cannot execute arbitrary code the way eval() would.
        genres = ast.literal_eval(raw) if isinstance(raw, str) else raw
        return [genre.strip() for genre in genres]

    df["genre"] = df["genre"].apply(_to_genre_list)
    return df

In [15]:
# Parse the stringified genre lists into real Python lists, then peek at the first row.
big_data_df = big_data_df.pipe(preprocess_genre)
big_data_df.head(1)

Unnamed: 0,imdb_id,genre,plot
0,tt1517268,"[Adventure, Comedy, Fantasy]",barbie suffers a crisis that leads her to ques...


In [18]:
from sklearn.preprocessing import MultiLabelBinarizer

# Fit the binarizer on the per-movie genre lists so it learns the full set of genres
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(big_data_df['genre'])

# One column per genre, in the order the binarizer stores its classes
genre_names = multilabel_binarizer.classes_

# Encode the target: one row per movie, one 0/1 column per genre
y = multilabel_binarizer.transform(big_data_df['genre'])

# Attach each genre indicator to the dataframe under the genre's own name
for i, genre_name in enumerate(genre_names):
    big_data_df[f"{genre_name}"] = y[:, i]

print(y.shape, big_data_df.shape)

(23140, 22) (23140, 25)


In [19]:
# Show the first row to confirm the per-genre indicator columns were added.
big_data_df.head(n=1)

Unnamed: 0,imdb_id,genre,plot,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,...,Music,Musical,Mystery,Reality-TV,Romance,Sci-Fi,Sport,Thriller,War,Western
0,tt1517268,"[Adventure, Comedy, Fantasy]",barbie suffers a crisis that leads her to ques...,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [62]:
from skmultilearn.model_selection import iterative_train_test_split

def balanced_split(df: pd.DataFrame, labels: np.ndarray, image_array: np.ndarray, test_size=0.5):
    """
    Split a dataframe, its label matrix and its image array into two aligned parts.

    The split is delegated to ``iterative_train_test_split``, which considers the
    distribution of the possible labels when assigning rows to each part. Only row
    positions are passed to it; the dataframe and the image array are then sliced with
    the positions it returns, so the three outputs of each part stay row-aligned.

    Parameters
    ----------
    df : pd.DataFrame
        One row per sample.
    labels : np.ndarray
        Label matrix with one row per sample of ``df``.
    image_array : np.ndarray
        Array whose first axis has one entry per sample of ``df``.
    test_size : float, default 0.5
        Forwarded to ``iterative_train_test_split`` as the size of the second part.

    Returns
    -------
    tuple
        ``(train_df, train_image_array, y_train, test_df, test_image_array, y_test)``.

    Raises
    ------
    ValueError
        If ``df``, ``labels`` and ``image_array`` do not have the same number of rows.
    """
    n_rows = len(df)
    # Mismatched inputs would otherwise be sliced with the same positions and silently
    # pair rows, labels and images that do not belong together.
    if len(labels) != n_rows or len(image_array) != n_rows:
        raise ValueError(
            "df, labels and image_array must have the same number of rows, "
            f"got {n_rows}, {len(labels)} and {len(image_array)}"
        )

    # Split row positions instead of the data itself; shaped (n_rows, 1) so it can be
    # passed as a single-column feature matrix.
    index_array = np.arange(n_rows).reshape(-1, 1)
    train_index, y_train, test_index, y_test = iterative_train_test_split(index_array, labels, test_size)

    train_rows, test_rows = train_index.ravel(), test_index.ravel()
    train_image_array = np.take(image_array, train_rows, axis=0)
    test_image_array = np.take(image_array, test_rows, axis=0)
    return df.iloc[train_rows], train_image_array, y_train, df.iloc[test_rows], test_image_array, y_test

In [63]:
# First pass: hold out 30% of the data as a combined test+validation pool
# (that pool is split into test and validation in the next step).
(
    train_df,
    train_image_array,
    y_train,
    test_val_df,
    test_val_image_array,
    y_test_val,
) = balanced_split(big_data_df, y, image_array, 0.3)

first_split_parts = (train_df, y_train, train_image_array, test_val_df, test_val_image_array, y_test_val)
print(*(part.shape for part in first_split_parts))

(16274, 25) (16274, 22) (16274, 256, 256, 3) (6866, 25) (6866, 256, 256, 3) (6866, 22)


In [64]:
# Second pass: divide the held-out pool into test and validation sets,
# using balanced_split's default test_size.
(
    test_df,
    test_image_array,
    y_test,
    val_df,
    val_image_array,
    y_val,
) = balanced_split(test_val_df, y_test_val, test_val_image_array)

second_split_parts = (test_df, test_image_array, y_test, val_df, val_image_array, y_val)
print(*(part.shape for part in second_split_parts))

(3433, 25) (3433, 256, 256, 3) (3433, 22) (3433, 25) (3433, 256, 256, 3) (3433, 22)


**RECAP**

Let's recap what we have up to this point: balanced train, test, and validation datasets, each consisting of:
1. A DataFrame with `imdb_id`, `genre`, `plot`, and one binary indicator column per genre (`train_df`, `test_df`, `val_df`)
2. A preprocessed image array (`train_image_array`, `test_image_array`, `val_image_array`)
3. A multi-hot label matrix with one column per genre (`y_train`, `y_test`, `y_val`)

# Data Preprocessing