In [12]:
import os
from pathlib import Path
#import tempfile
import cv2
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import MiniBatchSparsePCA
from sklearn.cluster import KMeans
from skimage.filters.rank import entropy
from skimage.morphology import disk
from PIL import Image, ImageStat
#from skimage import img_as_float
import pandas as pd
#from multiprocessing import Process
import matplotlib.image as img
from scipy.cluster.vq import whiten
from scipy.cluster.vq import kmeans
import matplotlib.pyplot as plt
import matplotlib.image as img
import statistics
from scipy import spatial


<h1> Imported datasets </h1>

In [13]:
#ratings = pd.read_csv('./ml-20m/ratings.csv')
#movie_list = pd.read_csv('movie_assets_sampled.csv')

# In case I do not have time to run the program
# The outputted features have been added to a CSV file 
#movie_df = pd.read_csv('output.csv')


<h1> Visual features </h1>

In [16]:
# Calculates brightness by splitting HSV color space into 
# hue, saturation, and value. The value is synonymous with brightness.
def get_brightness(img):
    image = img.copy()
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    #cv2.imshow('Image', hsv)
    _, _, v = cv2.split(hsv)
    sum = np.sum(v, dtype=np.float32)
    num_of_pixels = v.shape[0] * v.shape[1]
    return (sum * 100.0) / (num_of_pixels * 255.0)

# Calculates saturation by splitting HSV color space into 
# hue, saturation, and value. Saturation is extracted and represents
# saturation
def get_saturation(img):
    image = img.copy()
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    #cv2.imshow('Image', hsv)
    _, s, _ = cv2.split(hsv)
    sum = np.sum(s, dtype = np.float32)
    num_of_pixels = s.shape[0] * s.shape[1]
    return (sum * 100.0) / (num_of_pixels * 255.0)

# Calculates entropy
def get_entropy(img):
    image = img.copy()
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    entropy_img = entropy(gray,disk(5))
    all_sum = np.sum(entropy_img, dtype = np.float32)
    num_of_pixels = entropy_img.shape[0] * entropy_img.shape[1]
    return all_sum / num_of_pixels

# Calculates image sharpness by the variance of the Laplacian
def get_sharpness(img):
    image = img.copy()
    img2gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return cv2.Laplacian(img2gray, cv2.CV_64F).var()

# Return contrast (RMS contrast)
def get_contrast(img):
    image = img.copy()
    img_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return img_gray.std()


def get_colorfulness(img):
    image = img.copy()
    # split the image into its respective RGB components
    (B, G, R) = cv2.split(image.astype("float"))
    # compute rg = R - G
    rg = np.absolute(R - G)
    # compute yb = 0.5 * (R + G) - B
    yb = np.absolute(0.5 * (R + G) - B)
    # compute the mean and standard deviation of both `rg` and `yb`
    (rbMean, rbStd) = (np.mean(rg), np.std(rg))
    (ybMean, ybStd) = (np.mean(yb), np.std(yb))
    # combine the mean and standard deviations
    stdRoot = np.sqrt((rbStd ** 2) + (ybStd ** 2))
    meanRoot = np.sqrt((rbMean ** 2) + (ybMean ** 2))
    # derive the "saturation" metric and return it
    return stdRoot + (0.3 * meanRoot)

def get_dominant_color(img):
    image = img.copy()
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
 
    r = []
    g = []
    b = []
    for row in image:
        for temp_r, temp_g, temp_b in row:
            r.append(temp_r)
            g.append(temp_g)
            b.append(temp_b)
    
    image_df = pd.DataFrame({'red' : r,
                            'green' : g,
                            'blue' : b})
    
    image_df['scaled_color_red'] = whiten(image_df['red'])
    image_df['scaled_color_blue'] = whiten(image_df['blue'])
    image_df['scaled_color_green'] = whiten(image_df['green'])
    
    cluster_centers, _ = kmeans(image_df[['scaled_color_red',
                                        'scaled_color_blue',
                                        'scaled_color_green']], 3)
    
    dominant_colors = []
    
    red_std, green_std, blue_std = image_df[['red',
                                            'green',
                                            'blue']].std()
    
    for cluster_center in cluster_centers:
        red_scaled, green_scaled, blue_scaled = cluster_center
        dominant_colors.append((
            red_scaled * red_std / 255,
            green_scaled * green_std / 255,
            blue_scaled * blue_std / 255
        ))
    return str(dominant_colors)


<h1> Helper functions </h1>

In [17]:
# Averages a list
def average(l):
    return sum(l) / len(l)

# Makes a list of unique values from a list
def unique(list1):
    # Init null list
    unique_list = []

    for x in list1:
        if x not in unique_list:
            unique_list.append(x)
            #print(x)
    return unique_list

<h1> Collecting visual features </h1>

In [18]:
# Adds values for each picture to a list so they can be later averaged and 
# made into a dataframe. 
def get_features(image_folder, movie_id):
    movie_dict = {}
    df = pd.DataFrame()
    saturation_list = []
    brightness_list = []
    entropy_list = []
    sharpness_list = []
    contrast_list = []
    colorfulness_list = []
    dominant_color_list = []
    frame_list = []

    for i in range(len(image_folder)):
        #try:
            img = cv2.imread(os.path.join(folder_path, image_folder[i]))
            frame_brightness = get_brightness(img)
            frame_saturation = get_saturation(img)
            frame_entropy = get_entropy(img)
            frame_sharpness = get_sharpness(img)
            frame_contrast = get_contrast(img)
            frame_colorfulness = get_colorfulness(img)
            frame_domcolor = get_dominant_color(img)

            saturation_list.append(frame_saturation)
            brightness_list.append(frame_brightness)
            entropy_list.append(frame_entropy)
            sharpness_list.append(frame_sharpness)
            contrast_list.append(frame_contrast)
            colorfulness_list.append(frame_colorfulness)
            dominant_color_list.append(frame_domcolor)
            frame_list.append(image_folder[i][:-4])
        #except Exception:
           # print("Failure at frame_nr:", i)

    # Create movie dictionary
    movie_dict = {
            'saturation': saturation_list,
            'brightness': brightness_list, 
            'entropy': entropy_list, 
            'sharpness': sharpness_list, 
            'contrast': contrast_list,
            'frame_nr': frame_list,
            'colorfulness': colorfulness_list,
            'dom_col': dominant_color_list,
            'movie_id': movie_id
            }

    df = pd.DataFrame(movie_dict)
    df['frame_nr'] = pd.to_numeric(df['frame_nr'], downcast='integer')
    #df = df.set_index('frame_nr')
    df = df.sort_values(by = ['frame_nr', 'movie_id'], ascending=[True, True])
    return df

        

In [21]:
# Main function to extract visual features from the movies
if __name__ == "__main__":
    movie_df = pd.DataFrame()
    trailers = './movie_keyframes/movie_trailers/tiny_Test/'
    trailers_read = os.listdir(trailers)
    #print(trailers_read)
    
    for x in trailers_read:
        folder_path = os.path.join(trailers,x)
        img_folder = os.listdir(folder_path)
        
        movie_id = x #path.replace('./movie_keyframes/trailers/', '').replace('/','')
        feature_dict = get_features(img_folder, movie_id)
        movie_df = movie_df.append(feature_dict)
    
    

In [22]:
#movie_df['frame_nr'] = pd.to_numeric(movie_df['frame_nr'], downcast='integer')
#movie_df = movie_df.sort_values(by=['frame_nr'], axis=0, ascending=True)
#movie_df.dtypes
#movie_df.to_csv('output.csv')
movie_df.head(100)

Unnamed: 0,saturation,brightness,entropy,sharpness,contrast,frame_nr,colorfulness,dom_col,movie_id
2,0.000000,0.580431,0.136885,50.896806,11.475877,0,0.000000,"[(0.0004702694753756598, 0.0004702694753756598...",tt0105690.mp4
55,21.252817,26.976065,3.718764,257.117550,25.342720,48,9.218765,"[(0.3167931935264684, 0.31380663787928753, 0.3...",tt0105690.mp4
4,7.168827,30.356037,4.289692,262.512469,28.535563,101,5.582645,"[(-0.3942233310747467, -0.3779269332396935, -0...",tt0105690.mp4
0,8.898330,33.539167,4.460732,343.555235,33.616063,161,7.385653,"[(0.3774313898176223, 0.3600278803783049, 0.36...",tt0105690.mp4
36,28.654527,25.723623,4.478536,737.720709,44.284965,211,18.353962,"[(-0.3046666558485848, -0.30827596790080875, -...",tt0105690.mp4
...,...,...,...,...,...,...,...,...,...
43,37.997600,7.120933,1.298146,69.632614,18.998112,2328,6.315043,"[(0.010385437699950364, 0.014726938622916694, ...",tt0105690.mp4
44,81.566552,11.708530,1.213764,357.462778,55.147209,2388,24.829325,"[(-0.2506942429198065, -0.17393846639905988, -...",tt0105690.mp4
45,83.487963,11.127115,1.113452,417.167207,55.066666,2448,24.058931,"[(0.3656850434461838, 0.06878865721582993, 0.2...",tt0105690.mp4
47,15.129509,26.756648,3.281468,6068.359892,87.200338,2508,6.597199,"[(0.2964485565674506, 0.28355247327447164, 0.3...",tt0105690.mp4


In [None]:
# Natural log and transform all numbers to positive numbers
movie_df['saturation'] = np.log(movie_df['saturation'] + 1 - min(movie_df['saturation']))
movie_df['brightness'] = np.log(movie_df['brightness'] + 1 - min(movie_df['brightness']))
movie_df['entropy'] = np.log(movie_df['entropy'] + 1 - min(movie_df['entropy']))
movie_df['sharpness'] = np.log(movie_df['sharpness'] + 1 - min(movie_df['sharpness']))
movie_df['contrast'] = np.log(movie_df['contrast'] + 1 - min(movie_df['contrast']))

movie_df.head()


<h1> Condensing all features to single rows per movie </h1>

In [None]:
# Deprecated function  
def polynomial_regression(id, df):
    temp_df = df
    temp_df['frame_nr'] = pd.to_numeric(temp_df['frame_nr'], downcast='integer')
    temp_df = temp_df.sort_values(by=['frame_nr'], axis=0, ascending=True)


    frame_nr = temp_df['frame_nr'][temp_df['movie_id'] == id]
    saturation = temp_df['saturation'][temp_df['movie_id'] == id]
    brightness = temp_df['brightness'][temp_df['movie_id'] == id]
    entropy = temp_df['entropy'][temp_df['movie_id'] == id]
    sharpness = temp_df['sharpness'][temp_df['movie_id'] == id]
    contrast = temp_df['contrast'][temp_df['movie_id'] == id]


    saturation_model_1st = np.poly1d(np.polyfit(frame_nr, saturation, 1))
    brightness_model_1st = np.poly1d(np.polyfit(frame_nr, brightness, 1))
    entropy_model_1st = np.poly1d(np.polyfit(frame_nr, entropy, 1))
    sharpness_model_1st = np.poly1d(np.polyfit(frame_nr, sharpness, 1))
    contrast_model_1st = np.poly1d(np.polyfit(frame_nr, contrast, 1))


    saturation_model_2nd = np.poly1d(np.polyfit(frame_nr, saturation, 2))
    brightness_model_2nd = np.poly1d(np.polyfit(frame_nr, brightness, 2))
    entropy_model_2nd = np.poly1d(np.polyfit(frame_nr, entropy, 2))
    sharpness_model_2nd = np.poly1d(np.polyfit(frame_nr, sharpness, 2))
    contrast_model_2nd= np.poly1d(np.polyfit(frame_nr, contrast, 2))


    
    polynomial_dict = {
        'saturation_model_1st': saturation_model_1st, 
        'brightness_model_1st': brightness_model_1st,
        'entropy_model_1st': entropy_model_1st,
        'sharpness_model_1st': sharpness_model_1st,
        'contrast_model_1st': contrast_model_1st,
        'saturation_model_2nd': saturation_model_2nd,
        'brightness_model_2nd': brightness_model_2nd,
        'entropy_model_2nd': entropy_model_2nd,
        'sharpness_model_2nd': sharpness_model_2nd,
        'contrast_model_2nd': contrast_model_2nd
    }

    print(polynomial_dict)
    #return pd.DataFrame(polynomial_dict)



    



#polynomial_regression('2', movie_df)
#poly_test.head()
#plt.scatter(frame_nr, brightness)
#plt.show

In [None]:
# Polynomial regression on the movies
# Currently all values are formatted as strings (might be changed later)
def polynomial_regression_alt(df, id):
    temp_df = df
    temp_df['frame_nr'] = pd.to_numeric(temp_df['frame_nr'], downcast='integer')
    temp_df = temp_df.sort_values(by=['frame_nr'], axis=0, ascending=True)
    output = pd.DataFrame()


    frame_nr = temp_df['frame_nr'][temp_df['movie_id'] == id]
    saturation = temp_df['saturation'][temp_df['movie_id'] == id]
    brightness = temp_df['brightness'][temp_df['movie_id'] == id]
    entropy = temp_df['entropy'][temp_df['movie_id'] == id]
    sharpness = temp_df['sharpness'][temp_df['movie_id'] == id]
    contrast = temp_df['contrast'][temp_df['movie_id'] == id]

    saturation_model_1st = str(np.poly1d(np.polyfit(frame_nr, saturation, 1)))
    brightness_model_1st = str(np.poly1d(np.polyfit(frame_nr, brightness, 1)))
    entropy_model_1st = str(np.poly1d(np.polyfit(frame_nr, entropy, 1)))
    sharpness_model_1st = str(np.poly1d(np.polyfit(frame_nr, sharpness, 1)))
    contrast_model_1st = str(np.poly1d(np.polyfit(frame_nr, contrast, 1)))

    saturation_model_2nd = str(np.poly1d(np.polyfit(frame_nr, saturation, 2)))
    brightness_model_2nd = str(np.poly1d(np.polyfit(frame_nr, brightness, 2)))
    entropy_model_2nd = str(np.poly1d(np.polyfit(frame_nr, entropy, 2)))
    sharpness_model_2nd = str(np.poly1d(np.polyfit(frame_nr, sharpness, 2)))
    contrast_model_2nd = str(np.poly1d(np.polyfit(frame_nr, contrast, 2)))


    
    polynomial_dict = {
        'movie_id': str(id),
        'saturation_model_1st': saturation_model_1st, 
        'brightness_model_1st': brightness_model_1st,
        'entropy_model_1st': entropy_model_1st,
        'sharpness_model_1st': sharpness_model_1st,
        'contrast_model_1st': contrast_model_1st,
        'saturation_model_2nd': saturation_model_2nd,
        'brightness_model_2nd': brightness_model_2nd,
        'entropy_model_2nd': entropy_model_2nd,
        'sharpness_model_2nd': sharpness_model_2nd,
        'contrast_model_2nd': contrast_model_2nd
    }

    output = output.append(polynomial_dict, ignore_index=True)
    return output


In [None]:
# New dataframe condensing each movie into a single row 
def movie_matrix(df, id):
    matrix = pd.DataFrame()

    avg_brightness = average(df['brightness'][df['movie_id'] == id])
    avg_saturation = average(df['saturation'][df['movie_id'] == id])
    avg_entropy = average(df['entropy'][df['movie_id'] == id])
    avg_sharpness = average(df['sharpness'][df['movie_id'] == id])
    avg_contrast = average(df['contrast'][df['movie_id'] == id])


    stdev_brightness = statistics.stdev((df['brightness'][df['movie_id'] == id]))
    stdev_saturation = statistics.stdev((df['saturation'][df['movie_id'] == id]))
    stdev_entropy = statistics.stdev((df['entropy'][df['movie_id'] == id]))
    stdev_sharpness = statistics.stdev((df['sharpness'][df['movie_id'] == id]))
    stdev_contrast = statistics.stdev((df['contrast'][df['movie_id'] == id]))

    mean_brightness = statistics.mean((df['brightness'][df['movie_id'] == id]))
    mean_saturation = statistics.mean((df['saturation'][df['movie_id'] == id]))
    mean_entropy = statistics.mean((df['entropy'][df['movie_id'] == id]))
    mean_sharpness = statistics.mean((df['sharpness'][df['movie_id'] == id]))
    mean_contrast = statistics.mean((df['contrast'][df['movie_id'] == id]))


    matrix_dict = {
        'movie_id': str(id),
        'avg_brightness': avg_brightness,
        'avg_saturation': avg_saturation,
        'avg_entropy': avg_entropy,
        'avg_sharpness': avg_sharpness,
        'avg_contrast': avg_contrast,
        'stdev_brightness': stdev_brightness,
        'stdev_saturation': stdev_saturation,
        'stdev_entropy': stdev_entropy,
        'stdev_sharpness': stdev_sharpness,
        'stdev_contrast': stdev_contrast,
        'mean_brightness': mean_brightness,
        'mean_saturation': mean_saturation,
        'mean_entropy': mean_entropy,
        'mean_sharpness': mean_sharpness,
        'mean_contrast': mean_contrast
    }

    matrix = matrix.append(matrix_dict, ignore_index = True)
    return matrix

In [None]:
# Test cell 
numbers = [1,1,2,3,4,5,5,5,5,5,6]
unique_numbers = unique(numbers)
print(unique_numbers)

In [None]:
# Makes sure there is no overlap between movie_id's 
unique_movie_id_list = unique(movie_df['movie_id'])
print(unique_movie_id_list)

In [None]:
# Makes two dataframes
# One containing polynomials, and one containing averages etc. 
poly_df = pd.DataFrame()
matrix_df = pd.DataFrame()

for x in unique_movie_id_list:
    poly_df = poly_df.append(polynomial_regression_alt(movie_df, x))
    
    matrix_df = matrix_df.append(movie_matrix(movie_df, x))
final_matrix = pd.merge(poly_df, matrix_df, on = 'movie_id')
final_matrix.head(10)


In [None]:
def quantile_normalize(df):
    """
    input: dataframe with numerical columns
    output: dataframe with quantile normalized values
    """
    df_sorted = pd.DataFrame(np.sort(df.values,
                                     axis=0), 
                             index=df.index, 
                             columns=df.columns)
    df_mean = df_sorted.mean(axis=1)
    df_mean.index = np.arange(1, len(df_mean) + 1)
    return df.rank(method="min").stack().astype(int).map(df_mean).unstack()

<h1> Cosine similarity </h1>

In [None]:
# Cosine similarity between two rows
def compute_cos_sim(array1, array2):
    return 1 - spatial.distance.cosine(array1, array2)

# Cosine similarity between all rows
def compute_cos_sim_all(my_array):
    n_rows = my_array.shape[0]
    cos_sim_array = np.zeros((n_rows,n_rows))
    for row1 in range(n_rows):
        for row2 in range(n_rows):
            cos_sim_array[row1,row2] = \
            compute_cos_sim(my_array[row1, :],\
                            my_array[row2, :])
    return cos_sim_array

In [None]:
# Drops the movie_id value. Can be retrieved with iloc 
matrix_no_id = matrix_df.drop('movie_id', 1)
#matrix_no_id = quantile_normalize(matrix_no_id)
# Converts dataframe to numpy array
matrix_no_id_array = matrix_no_id.to_numpy()
# Calculates cosine similarity between the movies 
cos_sim_values = compute_cos_sim_all(matrix_no_id_array)

In [None]:
# Draws heatmap of the similarities between movies 
fig = plt.figure()
ax = plt.imshow(cos_sim_values, cmap='hot')
cbar = fig.colorbar(ax)
plt.show

In [None]:
print(movie_list.iloc[28])
print(movie_list.iloc[18])

In [None]:
# Test cell for implementing polynomials in heatmap 
# Drops the movie_id value. The movie_id be retrieved with iloc 
matrix_no_id = final_matrix.drop('movie_id', 1)
# Converts dataframe to numpy array
matrix_no_id_array = matrix_no_id.to_numpy()
# Calculates cosine similarity between the movies 
#cos_sim_values = compute_cos_sim_all(matrix_no_id_array)

<h1>Data analytics</h1>

In [None]:
# Standardising movie data
standardised_movie_data = StandardScaler().fit_transform(movie_df)


In [None]:
# Elbow method
distortions = []
for i in range(1, 11):
    km = KMeans(n_clusters = i, init='k-means++', 
                n_init=10, max_iter=300, 
                random_state=0)
    km.fit(matrix_no_id)
    distortions.append(km.inertia_)

In [None]:
# KMeans clustering using output from Elbow method
km_plus = KMeans(n_clusters = 8, init='k-means++', 
                n_init=10, max_iter=300, 
                random_state=0)
movie_fit = km_plus.fit_predict(movie_df)

In [None]:
# MiniBatchSparcePCA 
transformer = MiniBatchSparsePCA(n_components=5, batch_size=50,
                                 random_state=0)
movie_mini_pca_fit = MiniBatchSparsePCA.fit_transform()

In [None]:
# PCA 
standardised_movie_data = StandardScaler().fit_transform(movie_df)
pca = PCA(n_components = 4)
movie_pca = pca.fit_transform(standardised_movie_data)

<h1> TO DO </h1>
<p> 
    <ul>
        <li>Clustering</li>
        <li>Add deep visual features (object / action detection)</li>
        <li>Normalize all features, using log or </li>
        <li>Recommendation libraries: librec (DeepFM, FM, Wide & Deep, Youtube - recommendation) </li>
        <li> Train-test split (5 fold cross validation) </li>
        <li> precision > RMSE </li>
        <li> Correlation analysis movies ( pearson correlation) </li>
        <li> histogram difference between features </li>
        <li> more standard deviation == good!!! </li>
        <li> low cost == no ratings, high cost == more ratings </li>
        <li> high correlation, remove one </li>
        <li> random forrest, PCA </li>
</p>