# Movie Score Predictions

## Training Script

The purpose of this script is to tie together the input generator and neural model scripts to train a neural network.

### Import Scripts and Libraries:

In [1]:
# Import the desired functions from the input generator and neural model scripts:
from imdb_input_generator import *
from imdb_neural_model import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [None]:
# Import other required libraries:
import pickle as pkl
import h5py
import os
import pandas as pd
import pdb

In [None]:
# Import Keras:
import keras

### Function Definitions: 

In [None]:
# Function for checking if weight folder exists:
def weight_folder_check(subject, mainfolder, subfolders):
    """Make sure the main weight folder and every subfolder exist.

    Each folder is checked individually, so a subfolder that is missing
    underneath an already-existing main folder is still created.

    Args:
        subject: Label used in the progress messages (e.g. 'auto-encoder').
        mainfolder: Path of the top-level weight folder.
        subfolders: Iterable of full paths of the per-training-set folders.
    """
    created_any = False
    for folder in [mainfolder] + list(subfolders):
        if os.path.exists(folder):
            continue
        if not created_any:
            print('Creating folders for %s weights...' % (subject))
            created_any = True
        if folder != mainfolder:
            print('Creating folder: %s' % folder)
        # makedirs also creates missing parents, so a subfolder can be
        # created even when it is not listed after its parent.
        os.makedirs(folder)
    if not created_any:
        print('Folders for %s training weights already exist' % (subject))

In [None]:
# Function for checking the latest training epoch number:
def epoch_check(count, path, name_template, ext):
    """Return the highest epoch index found among the weight files in a folder.

    A file is considered when its name contains ``name_template``; the epoch
    index is the text between the template and ``ext``. Files whose index is
    not an integer (e.g. a manually saved '..._best.h5') are skipped rather
    than aborting the scan.

    Args:
        count: Starting value; returned unchanged when no file has a
            larger index.
        path: Folder to scan.
        name_template: File name prefix that precedes the epoch index.
        ext: File extension that follows the epoch index.

    Returns:
        The larger of ``count`` and the highest epoch index found.
    """
    for filename in os.listdir(path):
        if name_template not in filename:
            continue
        print('Epoch name: %s' % filename)
        remainder = filename.split(name_template)[1]
        index_text = remainder.split(ext)[0]
        try:
            index = int(index_text)
        except ValueError:
            # Not an epoch-numbered weight file; ignore it.
            continue
        if index > count:
            count = index
    return count

In [None]:
# Function for checking latest epoch number for multiple training sets:
def epoch_mult_check(folders, name_template, name_ext):
    """Return the latest epoch index for each training-set weight folder.

    Folder number ``i`` (1-based) is scanned for files named
    ``name_template + 't<i>_' + <epoch> + name_ext``. The search restarts
    from -1 for every folder, so one training set's result never leaks into
    the next.

    Args:
        folders: Weight folders, one per training set, in training-set order.
        name_template: Common weight file name prefix.
        name_ext: Weight file extension.

    Returns:
        A list with one entry per folder: the highest epoch index found, or
        -1 when that folder holds no weight files yet.
    """
    ep_counts = []
    for idx, path in enumerate(folders):
        set_number = idx + 1
        weight_name_mod = name_template + 't%s_' % set_number
        latest = -1
        if os.listdir(path):
            latest = epoch_check(latest, path, weight_name_mod, name_ext)
            print('Most up-to-date weight file (epoch) for training set %s is indexed: %s'
                  % (set_number, latest))
        else:
            print('No weight files have been generated yet for training set %s' % set_number)
        ep_counts.append(latest)
    return ep_counts

In [None]:
# Function to generate autoencoder weights:
def autoencoder_weight_generate(tpaths, name_template, epoch_counts, input_vector_path, model_len, ae_folders,
                                weight_name_ext, epoch_num=3):
    """Train the auto-encoder on each training set, saving weights after every epoch.

    Uses the module-level ``autoencoder`` model and the ``autoencoder_generator``
    function, both of which must already be defined when this is called.

    Args:
        tpaths: Paths of the training index files, one per training set.
        name_template: Common weight file name prefix.
        epoch_counts: Latest saved epoch index per training set (-1 when
            nothing has been saved); training resumes at the next index.
        input_vector_path: Path of the input vector file handed to the generator.
        model_len: Value passed to ``fit_generator`` as ``samples_per_epoch``.
        ae_folders: Output folder per training set (paths end with a separator).
        weight_name_ext: Weight file extension.
        epoch_num: Exclusive upper bound on the epoch index to train.

    Returns:
        A new list holding the latest saved epoch index per training set;
        ``epoch_counts`` itself is not modified.
    """
    latest_epochs = list(epoch_counts)
    for idx, path in enumerate(tpaths):
        set_number = idx + 1
        print('Training auto-encoder weights on training set %s' % set_number)
        weight_name_mod = name_template + 't%s_' % set_number
        for num in range(epoch_counts[idx] + 1, epoch_num):
            print('Auto-encoder weight generating on epoch %s' % num)
            # One epoch per call so that a weight file is written after every epoch.
            autoencoder.fit_generator(autoencoder_generator(path, input_vector_path),
                                      samples_per_epoch=model_len, nb_epoch=1)
            fresh_weight_name = ae_folders[idx] + weight_name_mod + str(num) + weight_name_ext
            autoencoder.save_weights(fresh_weight_name)
            latest_epochs[idx] = num
        print('Weight generation for auto-encoder complete on training set %s' % set_number)
    return latest_epochs

### Defining Folder and File Structure:

**IMPORTANT: Remember to update the cell below so that it matches the main project directory / training data directory structure you have chosen.**

In [None]:
# Root project directory, and the training-data folder located inside it:
parent_path = '/Users/cheng-haotai/Documents/Projects_Data/IMDb_Predictor/'
data_name = 'training_data/'
data_path = '%s%s' % (parent_path, data_name)

The cells below specify file names and directory structures in relation to the parent path defined above. They do not need to be altered.

In [None]:
# Index files of the three saved training sets:
train1_name = 'train1_index.pickle'
train2_name = 'train2_index.pickle'
train3_name = 'train3_index.pickle'

# Full paths of those index files, collected in training-set order:
tpaths = [data_path + train1_name, data_path + train2_name, data_path + train3_name]
train1_path, train2_path, train3_path = tpaths

# Per-training-set subfolder names (reused for the weight folders):
train1_folder, train2_folder, train3_folder = 'train1/', 'train2/', 'train3/'

In [None]:
# Saved input vectors (HDF5 file) and its full path:
input_vector_name = 'input_vectors.h5'
input_vector_path = '%s%s' % (data_path, input_vector_name)

In [None]:
# Saved output data (pickle file) and its full path:
output_vector_name = 'output_data.pickle'
output_vector_path = '%s%s' % (data_path, output_vector_name)

In [None]:
# Saved test-set index file (pickle) and its full path:
test_name = 'test_index.pickle'
test_name_path = '%s%s' % (data_path, test_name)

In [None]:
# Inputs for the imdb_regression model: the reduced-dimension datasets.
# Folder that holds them:
transformed_data_folder = 'transformed_data/'
transdata_folder = data_path + transformed_data_folder

# One HDF5 file per training set:
transformed_data_t1 = 'transformed_data_t1.h5'
transformed_data_t2 = 'transformed_data_t2.h5'
transformed_data_t3 = 'transformed_data_t3.h5'
trans_paths = [transdata_folder + transformed_data_t1,
               transdata_folder + transformed_data_t2,
               transdata_folder + transformed_data_t3]
transdata_path1, transdata_path2, transdata_path3 = trans_paths

# Dataset name used inside each of those HDF5 files (same order as trans_paths):
transformed_data_dict = ['trans_data_1', 'trans_data_2', 'trans_data_3']
transdata1, transdata2, transdata3 = transformed_data_dict

In [None]:
# CSV file that the score predictions are written to, and its full path:
prediction_data_name = 'score_predictions.csv'
prediction_data_path = '%s%s' % (data_path, prediction_data_name)

### Training Models and Running Predictions:

The following code is partitioned by function. Each segment can be turned on or off by setting the corresponding flag in the cell below to 1 or 0.

In [None]:
# Flags that switch the training / prediction segments on (1) or off (any other value):
test_regression = 0  # 1: train IMDb regression weights; otherwise run predictions on the test set
test_ae = 1          # 1: generate (train) auto-encoder weights
ae = 1               # 1: load the latest auto-encoder weights into the encoder and transform the inputs
regression = 1       # 1: run the IMDb regression cell (test_regression picks training vs. prediction)

#### Auto-Encoder Portion:

In [None]:
# Auto-encoder weight files: naming scheme and folder layout.
weight_name_template = 'autoen_weights'
weight_name_ext = '.h5'
weight_folder = '%s%s' % (data_path, 'autoen_training_weights/')

# One weight subfolder per training set, in training-set order:
ae_folders = [weight_folder + train1_folder,
              weight_folder + train2_folder,
              weight_folder + train3_folder]
ae_train1_folder, ae_train2_folder, ae_train3_folder = ae_folders

In [None]:
# Read the whole input dataset into memory and record its dimensions:
with h5py.File(input_vector_path, 'r') as input_file:
    input_data = input_file['input_dataset'][:]
model_len, row_len = len(input_data), len(input_data[0])
# Both sizes wrapped as tuples of size 1:
model_size, row_size = (model_len,), (row_len,)

In [None]:
# Create the auto-encoder weight folder structure unless it is already there:
weight_folder_check(subject='auto-encoder', mainfolder=weight_folder, subfolders=ae_folders)

In [None]:
# Make sure the folder for the reduced-dimension datasets is present:
if os.path.exists(transdata_folder):
    print('Folder for reduced-dimensionality datasets already exist')
else:
    print('Creating folder for reduced-dimensionality datasets...')
    os.mkdir(transdata_folder)

In [None]:
# Look up the latest saved auto-encoder epoch for every training set
# (the counter starts at -1, meaning no weights have been generated yet):
epoch_counts = epoch_mult_check(folders=ae_folders,
                                name_template=weight_name_template,
                                name_ext=weight_name_ext)
print(epoch_counts)

In [None]:
# Build the encoder and auto-encoder model objects for the input row size:
print('Instantiating model for encoder and auto-encoder')
encoder, autoencoder = auto_encoder(row_size)

The preceding cells in the auto-encoder portion have created the necessary folder structure and collected the information needed to train the auto-encoder model and reduce the dataset's dimensionality. Now the auto-encoder training will begin. Once weights from auto-encoder training have been generated, the most recent weight file will be loaded into the encoder model for dimensionality reduction.

In [None]:
# Train the auto-encoder on every training set, writing one weight file per epoch.
if test_ae == 1:
    print('Generating auto-encoder weights...')
    # Specify maximum number of epochs to process (exclusive upper bound on the epoch index):
    epoch_num = 3
    for idx, path in enumerate(tpaths):
        print('Training auto-encoder weights on training set %s' % (idx + 1))
        train_idx = 't%s_' % (idx + 1)
        weight_name_mod = weight_name_template + train_idx
        # Resume right after the latest epoch that already has a weight file.
        # NOTE(review): previously saved weights are not loaded into `autoencoder` before
        # resuming, and the same model object is reused across training sets -- confirm
        # that this is intended.
        for num in range(epoch_counts[idx] + 1, epoch_num):
            print('Auto-encoder weight generating on epoch %s' % num)
            # One epoch per call so that a weight file is written after every epoch.
            autoencoder.fit_generator(autoencoder_generator(path, input_vector_path),
                                      samples_per_epoch=model_len, nb_epoch=1)
            fresh_weight_name = ae_folders[idx] + weight_name_mod + str(num) + weight_name_ext
            autoencoder.save_weights(fresh_weight_name)
            # Keep epoch_counts current: the encoding cell builds the name of the weight
            # file to load from epoch_counts[idx].
            epoch_counts[idx] = num
        print('Weight generation for auto-encoder complete on training set %s' % (idx + 1))
else:
    print('No further auto-encoder weight generation required')

In [None]:
# For every training set: load its latest auto-encoder weights into the encoder and,
# unless it was done before, save the encoded (reduced-dimension) inputs to HDF5.
if ae == 1:
    for idx, folder in enumerate(ae_folders):
        train_idx = 't%s_' % (idx + 1)
        weight_name_mod = weight_name_template + train_idx
        # Loading weights trained from autoencoder | Encoder can "see" weights because by_name = True
        # NOTE(review): when epoch_counts[idx] is -1 (no weights yet) this file name does not exist.
        print('Loading latest weight file for training set %s into encoder model...' % (idx + 1))
        latest_weight = folder + weight_name_mod + str(epoch_counts[idx]) + weight_name_ext
        encoder.load_weights(latest_weight, by_name=True)

        # Use auto_encoder to encode data into small dimension (utilizing encoder layer):
        if not os.path.exists(trans_paths[idx]):
            print('Transforming input data into lower dimensionality...')
            transformed_data = []
            for row in input_data:
                # The encoder is fed one row at a time, as a batch of size 1.
                placeholder_input = np.zeros((1, row_len))
                placeholder_input[0] = row
                # predict() returns one result per batch row; keep the single row's result.
                en_predict = encoder.predict(placeholder_input)[0]
                transformed_data.append(en_predict)
            # Save transformed inputs into h5 file; the with-block closes (and thereby
            # flushes) the file, which the original never did.
            print('Saving transformed data into h5py format...')
            with h5py.File(trans_paths[idx], 'w') as transformed_data_file:
                transformed_data_file.create_dataset(transformed_data_dict[idx], data=transformed_data)
            print('Transformed data has been successfully saved for training set %s!' % (idx + 1))
        else:
            print('Data has already been transformed in dimensionality for training set %s!' % (idx + 1))

#### Regression Portion:

In [None]:
# IMDb regression weight files: naming scheme and folder layout.
imdb_weight_name_template = 'imdb_weights'
imdb_weight_name_ext = '.h5'
imdb_weight_folder = '%s%s' % (data_path, 'imdb_training_data/')

# One weight subfolder per training set, in training-set order:
imdb_folders = [imdb_weight_folder + train1_folder,
                imdb_weight_folder + train2_folder,
                imdb_weight_folder + train3_folder]
imdb_train1_folder, imdb_train2_folder, imdb_train3_folder = imdb_folders

In [None]:
# List out paths of transformed data.
# This is a bare expression: it changes nothing and is only here so that the
# notebook shows the value of trans_paths as the cell's output.
trans_paths

In [None]:
# Load every transformed (reduced-dimension) dataset and derive the sizes used by the regression model:
print('Loading transformed data...')
loaded_transdata = []
for idx, path in enumerate(trans_paths):
    with h5py.File(path, 'r') as load_transformed:
        # [:] copies the dataset into memory so it remains usable after the file is closed.
        loaded_transdata.append(load_transformed[transformed_data_dict[idx]][:])
# Accumulate row counts and row lengths over all datasets:
transformed_len = 0
trans_row_len = 0
for data in loaded_transdata:
    transformed_len += len(data)
    trans_row_len += len(data[0])
# Get 'averaged' data size. Floor division keeps both values integers on Python 2 and 3;
# they are later used as a sample count and as array / input dimensions.
transformed_len = transformed_len // len(loaded_transdata)
trans_row_len = trans_row_len // len(loaded_transdata)
trans_row_size = (trans_row_len,)  # Length 1 tuple

In [None]:
# Create the IMDb regression weight folder structure unless it is already there:
weight_folder_check(subject='IMDb regression', mainfolder=imdb_weight_folder, subfolders=imdb_folders)

In [None]:
# Check how many epochs have been processed for the IMDb regression weights:
# Counter = -1 if no weights have been generated yet
# Uses the IMDb-specific extension rather than the auto-encoder's weight_name_ext, so this
# cell keeps working if the two settings ever differ.
imdb_epoch_counts = epoch_mult_check(imdb_folders, imdb_weight_name_template, imdb_weight_name_ext)
print(imdb_epoch_counts)

In [None]:
# Train the IMDb regression model (test_regression == 1) or use its latest weights to
# predict scores for the test set and save them to CSV (any other value).
if regression == 1:

    # Create model object of imdb regression model:
    imdb_reg = imdb_regression(trans_row_size)

    # This cell saves and loads its weight files directly inside imdb_weight_folder, named
    # <imdb_weight_name_template><epoch><imdb_weight_name_ext>, so the latest epoch index is
    # looked up in that folder. imdb_counter stays at -1 when no such file exists.
    # NOTE(review): imdb_counter was never defined in the notebook; confirm this lookup is
    # the intended source (imdb_epoch_counts scans the per-training-set subfolders instead).
    imdb_counter = epoch_check(-1, imdb_weight_folder, imdb_weight_name_template, imdb_weight_name_ext)

    # Define name of imdb weight file:
    imdb_weight_file_name = imdb_weight_folder + imdb_weight_name_template + str(imdb_counter) + imdb_weight_name_ext
    # Load imdb weight file if it exists:
    imdb_weights_loaded = os.path.exists(imdb_weight_file_name)
    if imdb_weights_loaded:
        print('Latest imdb weight file already exists! Loading...')
        # by_name allows for old weight files to be used with new models with new structures
        imdb_reg.load_weights(imdb_weight_file_name, by_name=True)
    else:
        print('No imdb weights have been generated yet')

    if test_regression == 1:
        # Specify maximum number of epochs to process in imdb weight training:
        imdb_epoch_num = 1000
        for num in range(imdb_counter + 1, imdb_epoch_num):
            print('IMDb regression weight training on epoch: %s' % num)
            # NOTE(review): the original passed an undefined name (transformed_data_path);
            # the transformed file of training set 1 is used to match train1_path -- confirm.
            imdb_reg.fit_generator(input_generator(train1_path, output_vector_path, transdata_path1),
                                   samples_per_epoch=transformed_len, nb_epoch=1)
            fresh_imdb_weight_name = imdb_weight_folder + imdb_weight_name_template + str(num) + imdb_weight_name_ext
            imdb_reg.save_weights(fresh_imdb_weight_name)

    else:
        # The latest weights were loaded above; without them the model is untrained and
        # its predictions would be meaningless, so stop with an explicit error.
        if not imdb_weights_loaded:
            raise IOError('No IMDb regression weight file found: %s' % imdb_weight_file_name)

        # Run predictions on test data:
        # Load test index file and output data file:
        with open(test_name_path, 'rb') as test_set:
            test_data = pkl.load(test_set)
        with open(output_vector_path, 'rb') as output_set:
            output_data = pkl.load(output_set)

        # Check whether or not predictions have been run:
        if not os.path.exists(prediction_data_path):
            print('Running predictions on test data...')
            # NOTE(review): the original indexed `transformed_data`, which only exists when
            # the encoding cell re-ran in this session; the loaded dataset of training set 1
            # is used instead, matching the training call above -- confirm.
            test_features = loaded_transdata[0]
            prediction_vec = []
            score_vec = []
            # Predict a score for every test row:
            for index in test_data:
                # Pull necessary data using index in test data:
                test_row = test_features[index]
                score_val = output_data[index]
                # The model is fed one row at a time, as a batch of size 1:
                placeholder_input = np.zeros((1, trans_row_len))
                placeholder_input[0] = test_row
                # NOTE(review): the raw predict() result is stored as-is; confirm whether
                # a plain scalar is wanted in the CSV.
                imdb_predict = imdb_reg.predict(placeholder_input)
                prediction_vec.append(imdb_predict)
                score_vec.append(score_val)

            print('Predictions have completed! Proceeding to save data...')
            final_results = pd.DataFrame()
            final_results['Real_Score'] = score_vec
            final_results['Predicted_Score'] = prediction_vec
            final_results.to_csv(prediction_data_path)
            print('Prediction data has been saved!')
        else:
            print('Predictions for IMDb scores have already been run!')
            final_results = pd.read_csv(prediction_data_path)