#### Marc Stamp - Assessment 4 Setup 

In [None]:
from tensorflow.keras.applications import VGG16
import os
import cv2
import keras
import pandas as pd
from sklearn.preprocessing import OneHotEncoder 
import numpy as np
import pickle 
from sklearn.model_selection import train_test_split 
from keras.preprocessing.image import ImageDataGenerator
import keras.applications as ka
from tensorflow.keras.preprocessing import image_dataset_from_directory

In [None]:
height  = 410
width = 280

VGG = VGG16(weights='imagenet', include_top = False, input_shape = (height, width, 3))
VGG.summary()

In [None]:
# apply to all txt files in setup - apply same O-H encoding and then save output as csv 

all_dict = {}

foldername = '../groundtruth/'

for s in os.listdir(foldername):
    #print(s)
    if s in ['1981.txt', '1980.txt']:
        enc = 'utf-8'
    else: 
        enc = 'utf-16'
    #    continue
        
    
    filename = foldername + s
    #print(filename)
    with open(filename, encoding=enc) as fh:
    
        for l in fh: # issue with looping through file

            str_val = l.strip().split(':', 1)

            if (len(str_val) == 1) & (str_val[0] == '{'):
                film_dict = {}
            elif (len(str_val) == 1) & (str_val[0] == '}'):

                #print({film_dict['imdbID'] : film_dict['Genre']})
                all_dict[film_dict['imdbID']] = film_dict['Genre']
            elif (len(str_val) == 2):
                key = str_val[0]
                value = str_val[1]
                film_dict[key.replace("\"", "").strip()] = value.replace("\"", "").replace(",", "").strip()
    

all_genres = pd.DataFrame({'id': [], 'genres': []})
all_genres
for ids, genres in all_dict.items():
    genre_list = genres.split()
    ids_list = [ids] * len(genre_list)
    all_genres = pd.concat([all_genres, pd.DataFrame({'id': ids_list, 'genres': genre_list})])

all_genres.reset_index(drop = True, inplace = True)

onehotencoder = OneHotEncoder() 

ohf = onehotencoder.fit(all_genres[['genres']])
genre_widen = pd.DataFrame(ohf.transform(all_genres[['genres']]).toarray(),columns = ohf.categories_)
genre_widen.drop(['N/A'], axis = 1, inplace = True)

genres_widened = pd.concat([all_genres['id'], genre_widen], axis = 1)
genres_widened = genres_widened.groupby('id', as_index=False).sum()
genres_widened.columns = [(lambda x : x if (x[0] == 'i')  else x[0])(x) for x in genres_widened.columns]

In [None]:
# get order in which to provide labels - alphanumeric order 
rootdir = '../Movie_Poster_Dataset/'

walk_files = []

for i in os.walk(rootdir):
    #print(i)
    walk_files.append(i)
    
order_files = []

for n in range(len(walk_files)):
    if n == 0:
        continue
    
    files = walk_files[n][2]
    order_files += files
    
ids = genres_widened['id']
ids
genre_order_index = [ids[ids == x.replace('.jpg','')].index[0] for x in order_files]
genre_order_index

ordered_array = np.array(genre_widen.iloc[genre_order_index])

In [None]:
img_genre_train = image_dataset_from_directory(directory = rootdir,
                            labels=list(ordered_array),
                            color_mode="rgb",
                            validation_split=0.15,
                            subset = 'training',
                            seed = 1234,
                            batch_size=32,
                            image_size=(height, width),
                            )

img_genre_valid = image_dataset_from_directory(directory = rootdir,
                            labels=list(ordered_array),
                            color_mode="rgb",
                            validation_split=0.15,
                            subset = 'validation',
                            seed = 1234,
                            batch_size=32,
                            image_size=(height, width),
                            )

#### VGG Development

In [None]:
from keras.layers import Activation, Dropout, Dense, Flatten
from keras import Input, Model

inputs = Input(shape=(410,280, 3))

VGG_ = VGG(inputs, training=False)

flat = Flatten()

VGG_ = flat(VGG_)

# first FC layer
d1 = Dense(128, activation="relu")
VGG_ = d1(VGG_)

drp1 = Dropout(0.5)

VGG_ = drp1(VGG_)

# second FC layer
d2 = Dense(128, activation="relu")
VGG_ = d2(VGG_)

drp2 = Dropout(0.5)

VGG_ = drp2(VGG_)

# output layer
output_layer = Dense(27, activation="sigmoid")
output = output_layer(VGG_)

VGG_model = Model(inputs, output)

VGG_model.summary()

In [None]:
# mark some layers as not trainable
training_list = ['input_1','flatten','dense','dropout','dense_1','dropout_1','dense_2']

for l in VGG_model.layers:
    
    if l.name in training_list:
        continue
    else:
        #print(l.name)
        VGG_model.get_layer(l.name).trainable = False

In [None]:
VGG_model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

#### Apply Model training

In [None]:
# new method - from data flow
epochs = 5

VGG_model.fit(img_genre_train, epochs=epochs, validation_data=img_genre_valid)