In [10]:
import pandas as pd
import numpy as np
import io
import sys
import os.path
import urllib.request
from tqdm import tqdm
from os import listdir
from PIL import Image
import glob

pd.set_option('display.max_colwidth', -1)
np.set_printoptions(threshold=sys.maxsize)

In [4]:
df = pd.read_csv("MovieGenre_final.csv",encoding='ISO-8859-1')
df.head()

Unnamed: 0,Image_Paths,imdbId,Genre,Title
0,Posters/27428.jpg,27428,Crime|Mystery|Drama,The Case of the Black Cat (1936)
1,Posters/972555.jpg,972555,Crime|Drama|Thriller,Exhibit A (2007)
2,Posters/1424062.jpg,1424062,Documentary,The Thorn in the Heart (2009)
3,Posters/1185371.jpg,1185371,Documentary,Corman's World: Exploits of a Hollywood Rebel (2011)
4,Posters/113369.jpg,113369,Drama,I Shot a Man in Vegas (1995)


In [5]:
df.shape

(36918, 4)

In [8]:
# There exists a genre value for all images
Genre_list = df['Genre']
print(len(Genre_list))

36918


In [9]:
# Breaks "Genre" into the constituting individual genres
def find_genres(genre):
    
    start = 0
    set_of_genre = []
    for i in range(len(genre)):
        
        k=0
        substring = ""
        if (genre[i]=='|'):
            substring = genre[start:i]
            start = i+1
            k = 1
        
        if(i==len(genre)-1):
            substring = genre[start:i+1]
            k = 1
            
        if (k==1):
            set_of_genre.append(substring)         
    
    return (set_of_genre)

In [12]:
# Extract list of genre values for each image
all_genre = []
Genre_list = df['Genre']

for i in range (len(Genre_list)):
    
    set_of_genre = find_genres(Genre_list[i])
    
    for j in range (len(set_of_genre)):
        all_genre.append(set_of_genre[j])
        
uniq, counts = np.unique(all_genre, return_counts=True)
print("Number of unique genres:", len(uniq))
print("Unique genres are:", uniq)
dict(zip(uniq, counts))

Number of unique genres: 28
Unique genres are: ['Action' 'Adult' 'Adventure' 'Animation' 'Biography' 'Comedy' 'Crime'
 'Documentary' 'Drama' 'Family' 'Fantasy' 'Film-Noir' 'Game-Show'
 'History' 'Horror' 'Music' 'Musical' 'Mystery' 'News' 'Reality-TV'
 'Romance' 'Sci-Fi' 'Short' 'Sport' 'Talk-Show' 'Thriller' 'War' 'Western']


{'Action': 4960,
 'Adult': 8,
 'Adventure': 3563,
 'Animation': 1609,
 'Biography': 1852,
 'Comedy': 11779,
 'Crime': 4874,
 'Documentary': 3451,
 'Drama': 18632,
 'Family': 1949,
 'Fantasy': 1888,
 'Film-Noir': 359,
 'Game-Show': 1,
 'History': 1331,
 'Horror': 3751,
 'Music': 1194,
 'Musical': 762,
 'Mystery': 2229,
 'News': 78,
 'Reality-TV': 2,
 'Romance': 5742,
 'Sci-Fi': 1866,
 'Short': 872,
 'Sport': 650,
 'Talk-Show': 6,
 'Thriller': 4512,
 'War': 1092,
 'Western': 783}

In [13]:
# Prepare multi-hot-encoded-labels for the various genres
def multi_hot_encoded_labels(img_id, genre):
    
    col_names =  ['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
                  'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Game-Show', 'History',
                  'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance',
                  'Sci-Fi', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western']
    
    set_of_genre = find_genres(genre)
    
    row=[]
    row.append(img_id)
    
    for i in range(len(col_names)):
        
        found = 0
        for j in range (len(set_of_genre)):
            if (set_of_genre[j]==col_names[i]):
                found = 1
                break
        
        row.append(found)
    
    row.append(genre) #add the overall combined genre for record purposes
        
    return row  

In [16]:
# Perform the encoding of the labels and save data in the format :
# Img-ID <multi-hot-encoded-labels> overall_genre

df = pd.read_csv("MovieGenre_final.csv", encoding="ISO-8859-1")
all_data = []

for index, row in tqdm(df.iterrows()):
    
    path = row['Image_Paths']
    genre = row['Genre']
    row = multi_hot_encoded_labels(path, genre)
    
    all_data.append(row)

col_names =  ['Img-paths', 'Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
                  'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Game-Show', 'History',
                  'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance',
                  'Sci-Fi', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western', 'Genre']

np.savetxt("Multi_hot_encoded_data.csv", np.asarray(all_data), fmt='%s', delimiter=" ")   
np.savetxt("Encoded_data_column_lookup.csv", np.asarray(col_names), fmt='%s', delimiter=" ")

36918it [00:04, 7589.71it/s]


In [15]:
df_encoded = pd.read_csv("Multi_hot_encoded_data.csv", delimiter=" ", 
                  names =  ['Img-paths', 'Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
                  'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Game-Show', 'History',
                  'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance',
                  'Sci-Fi', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western', 'Genre'])

df_encoded.head()

Unnamed: 0,Img-paths,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,...,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,Genre
0,Posters/27428.jpg,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,Crime|Mystery|Drama
1,Posters/972555.jpg,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,1,0,0,Crime|Drama|Thriller
2,Posters/1424062.jpg,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Documentary
3,Posters/1185371.jpg,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Documentary
4,Posters/113369.jpg,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,Drama


In [16]:
# To split into train / validation / test in the ratio 80 / 15 / 5%

df = pd.read_csv("Multi_hot_encoded_data.csv", delimiter=" ")
random_seed = 50
train_df = df.sample(frac=0.70, random_state=random_seed) #Taking 70% of the data
tmp_df = df.drop(train_df.index)
test_df = tmp_df.sample(frac=0.1, random_state=random_seed) #Taking 20% of the remaining (after train is taken)
valid_df = tmp_df.drop(test_df.index)

print("Train_df=",len(train_df))
print("Val_df=",len(valid_df))
print("Test_df=",len(test_df))

np.savetxt("Train.csv", train_df, fmt='%s', delimiter=" ")
np.savetxt("Test.csv", test_df, fmt='%s', delimiter=" ")
np.savetxt("Valid.csv", valid_df, fmt='%s', delimiter=" ")

#Numpy method
#train, validate, test = np.split(df_encoded.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])
#np.split will split at 60% of the length of the shuffled array, 
#then 80% of length (which is an additional 20% of data), thus leaving a remaining 20% of the data.

Train_df= 25842
Val_df= 9967
Test_df= 1108
