In [73]:
import ast
import numpy as np
import pandas as pd
import random
from skimage import io as skio
from torch.utils.data import Dataset, DataLoader, sampler
from torchvision import transforms as tr
from os.path import join as pjoin
import torch
import torch.nn as nn
from torchvision.models import resnet34
import torch.optim as opt
import tqdm

In [74]:
movies = pd.read_csv('movies/movies_metadata.csv')

  interactivity=interactivity, compiler=compiler, result=result)


Remove `/` prefix from poster path for easier join with local listed files

In [75]:
movies['poster_path'] = movies.poster_path.str.lstrip('/')

## Write poster URLs to file

In [76]:
POSTER_SIZES = [
  "w92",
  "w154",
  "w185",
  "w342",
  "w500",
  "w780",
  "original"
]

Poster URL example:`https://image.tmdb.org/t/p/w500/8uO0gUM8aNqYLs1OsTBQiXu0fEv.jpg`

In [79]:
POSTER_URL_BASE = "https://image.tmdb.org/t/p/"
POSTER_DFLT_SIZE = 3
def poster_url(path):
    return f"{POSTER_URL_BASE}{POSTER_SIZES[POSTER_DFLT_SIZE]}/{path}"
poster_url(movies.poster_path.iloc[0])

'https://image.tmdb.org/t/p/w342/rhIRbceoE9lR4veEXuwCC2wARtG.jpg'

Write poster URLs to files for fetching

In [81]:
poster_urls = movies.poster_path.apply(poster_url)
poster_urls.to_csv('poster_urls.txt', header=False, index=False)

In [82]:
!head poster_urls.txt

https://image.tmdb.org/t/p/w342/rhIRbceoE9lR4veEXuwCC2wARtG.jpg
https://image.tmdb.org/t/p/w342/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg
https://image.tmdb.org/t/p/w342/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg
https://image.tmdb.org/t/p/w342/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg
https://image.tmdb.org/t/p/w342/e64sOI48hQXyru7naBFyssKFxVd.jpg
https://image.tmdb.org/t/p/w342/zMyfPUelumio3tiDKPffaUpsQTD.jpg
https://image.tmdb.org/t/p/w342/jQh15y5YB7bWz1NtffNZmRw0s9D.jpg
https://image.tmdb.org/t/p/w342/sGO5Qa55p7wTu7FJcX4H4xIVKvS.jpg
https://image.tmdb.org/t/p/w342/eoWvKD60lT95Ss1MYNgVExpo5iU.jpg
https://image.tmdb.org/t/p/w342/5c0ovjT41KnYIHYuF4AWsTe3sKh.jpg


In [83]:
!wc -l poster_urls.txt

45466 poster_urls.txt


Run this command to actually fetch the files:

```sh
aria2c -i poster_urls.txt -j 16 -d movies/posters/<width>
```

## Movies from top genres

In [4]:
movies['genres_list'] = \
    movies.genres.apply(ast.literal_eval).apply(lambda l: list(map(lambda x: x['name'], l)))
genres = movies.genres_list.apply(pd.Series).stack()

In [5]:
genre_counts = genres.value_counts()
genre_counts = genre_counts[genre_counts > 1]
genre_counts

Drama              20265
Comedy             13182
Thriller            7624
Romance             6735
Action              6596
Horror              4673
Crime               4307
Documentary         3932
Adventure           3496
Science Fiction     3049
Family              2770
Mystery             2467
Fantasy             2313
Animation           1935
Foreign             1622
Music               1598
History             1398
War                 1323
Western             1042
TV Movie             767
dtype: int64

In [6]:
NUM_GENRES = 6
top_genres = genre_counts.head(NUM_GENRES).index
top_genres

Index(['Drama', 'Comedy', 'Thriller', 'Romance', 'Action', 'Horror'], dtype='object')

In [7]:
movies['labels'] = movies.genres_list.apply(lambda l: [(0, 1)[g in l] for g in top_genres])
movies_with_top_genres_idx = genres[genres.isin(top_genres)].index.droplevel(1).unique()
movies_with_top_genres = movies.loc[movies_with_top_genres_idx]

In [8]:
len(movies_with_top_genres)

36886

## Join with available posters

In [9]:
poster_base = 'movies/posters/w342/'

In [19]:
downloaded_posters = !ls {poster_base}
len(downloaded_posters)

45077

In [20]:
movies_with_top_genres_and_poster = \
  movies_with_top_genres[movies_with_top_genres.poster_path.isin(set(downloaded_posters))]
len(movies_with_top_genres_and_poster)

36788

In [59]:
f = tr.Compose([tr.ToPILImage(), tr.ToTensor()])
invalid = []
alphas = []
for x in tqdm.tqdm_notebook(movies_with_top_genres_and_poster.poster_path):
    try:
        if f(skio.imread(f"{poster_base}{x}")).shape[0] > 3:
            alphas.append(x)
    except:
        invalid.append(x)
len(invalid)

HBox(children=(IntProgress(value=0, max=36788), HTML(value='')))

6

In [60]:
alphas

['1VzCr94M1Zz0DfhU2I1tpZVgsVb.png']

In [22]:
invalid

['oFWvF7OJfT2ydAAatlnsgChV4FP.jpg',
 'vPwWgX1SzLRm6Tqhj9102Mg5kkz.jpg',
 '5eZgNYn8oHdJlBhBuF7fpDoe13R.jpg',
 '3rXML2PCnZOYialJj7GGM1pXlIt.jpg',
 'zhJAHdhjSazDiOrVwOgfhHCLqI8.jpg',
 'k7f6FhWeAZNxlhwVt5kDrVGc9hP.jpg']

In [62]:
invalid += alphas
invalid_idx = movies_with_top_genres_and_poster.poster_path.isin(invalid)
X = movies_with_top_genres_and_poster[~invalid_idx][['poster_path', 'labels']]
X.head()

Unnamed: 0,poster_path,labels
0,rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[0, 1, 0, 0, 0, 0]"
2,6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[0, 1, 0, 1, 0, 0]"
3,16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,"[1, 1, 0, 1, 0, 0]"
4,e64sOI48hQXyru7naBFyssKFxVd.jpg,"[0, 1, 0, 0, 0, 0]"
5,zMyfPUelumio3tiDKPffaUpsQTD.jpg,"[1, 0, 1, 0, 1, 0]"


## Data preps

In [63]:
class MoviePostersDataset(Dataset):
    def __init__(self, df, base_dir, transform):
        self.df = df
        self.base_dir = base_dir
        self.transform = transform
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        name = self.df.iloc[idx].poster_path
        path = pjoin(self.base_dir, name)
        img = skio.imread(path)
        lbl = self.df.iloc[idx].labels
        if self.transform:
            img = self.transform(img)
        return img, lbl

### Train-Test Split

In [64]:
VALID_RATE = 0.1

In [65]:
num_samples = len(X)
indices = list(range(num_samples))
valid_idx = random.choices(indices, k=int(num_samples*VALID_RATE))
train_idx = list(set(indices) - set(valid_idx))
valid_samp = sampler.SubsetRandomSampler(valid_idx)
train_samp = sampler.SubsetRandomSampler(train_idx)

In [66]:
valid_trans = tr.Compose([
    tr.ToPILImage(),
    tr.CenterCrop(224),
    tr.ToTensor(),
    tr.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225])
])

In [67]:
train_trans = tr.Compose([
    tr.ToPILImage(),
    tr.CenterCrop(224),
    # tr.RandomVerticalFlip(),
    tr.ToTensor(),
    tr.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225])
])

In [68]:
valid_ds = MoviePostersDataset(X.iloc[valid_idx], poster_base, valid_trans)
train_ds = MoviePostersDataset(X.iloc[train_idx], poster_base, train_trans)
#valid_ld = DataLoader(valid_ds, sampler=valid_samp, batch_size=16, num_workers=2, drop_last=True)
#train_ld = DataLoader(train_ds, sampler=train_samp, batch_size=36, num_workers=4, drop_last=True)
valid_ld = DataLoader(valid_ds, shuffle=True, batch_size=16, num_workers=2, drop_last=True)
train_ld = DataLoader(train_ds, shuffle=True, batch_size=36, num_workers=4, drop_last=True)
(len(train_ld), len(valid_ld))

(924, 229)

## Modeling

In [69]:
net = resnet34(pretrained=True)
layers = list(net.children())
last_layer = layers[-1]
last_layer.in_features, last_layer.out_features
for p in net.parameters(): p.requires_grad = False
net.fc = nn.Linear(in_features=last_layer.in_features, out_features=NUM_GENRES)

In [70]:
criterion = nn.BCEWithLogitsLoss()
optimizer = opt.SGD(net.fc.parameters(), lr=0.01)

### Train!

In [71]:
net = net.cuda()

In [72]:
for epoch in range(2):  # loop over the dataset multiple times
    running_loss = 0.0
    net.train()
    for data in tqdm.tqdm_notebook(train_ld):
        inputs, labels = data
        labels = torch.stack(labels).transpose(1,0).float()
        inputs, labels = inputs.cuda(), labels.cuda()
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(running_loss)

HBox(children=(IntProgress(value=0, max=924), HTML(value='')))

464.7597334086895


HBox(children=(IntProgress(value=0, max=924), HTML(value='')))

434.96938905119896
