# Title/Task

---

### List of tasks

  1. [Preprocessing](#1.-Preprocessing)
    * Drop rows where None or Null exist
    * Label preprocessing (genre)
    * Poster Preprocessing

*__imports__*

In [70]:
# imports ...
import numpy as np
import pandas as pd

import math
import requests
from skimage import io
from matplotlib import pyplot as plt
import matplotlib.image as mpimg

from pathlib import Path

%matplotlib inline

__helper functions__

In [3]:
# Image file extension.
# NOTE(review): not referenced in the visible cells; the poster download cell
# defines its own `file_extension` — confirm whether this one is still needed.
file_type = ".jpg"


# gets the image from a url
def get_image_from_url(img_url, timeout=10):
    """Download the image at ``img_url`` and return it as read by ``io.imread``.

    A HEAD request is sent first so that a missing image is detected before
    the image itself is downloaded and decoded.

    Parameters
    ----------
    img_url : str
        Address of the image.
    timeout : float, optional
        Seconds to wait for the HEAD request. Without a timeout a stalled
        server would block the notebook indefinitely.

    Returns
    -------
    The result of ``io.imread(img_url)`` when the HEAD request answers 200.
    Otherwise an empty list -- make sure to handle this case if you use
    this method.
    """
    try:
        response = requests.head(img_url, timeout=timeout)
    except requests.RequestException:
        # Unreachable host, timeout, malformed url, ...: report "no image"
        # the same way as a non-200 status instead of raising.
        return []
    if response.status_code != 200:
        return []
    return io.imread(img_url)


# downloads every reachable image url, flattens each image to a single row, and returns the rows as a list
def extract_features(features):
    """Return one flattened pixel row per reachable image url.

    Each entry of ``features`` is probed with a HEAD request; only entries
    answering with status 200 are read with ``io.imread`` and flattened.
    Entries that fail the probe are skipped, so the returned list can be
    shorter than ``features``.
    """
    flattened_rows = []
    for url in features:
        if requests.head(url).status_code != 200:
            continue  # url did not answer 200: leave it out
        flattened_rows.append(io.imread(url).flatten())
    return flattened_rows

# # helper for displaying an image
# def print_img(img, color=True):
#     plt.figure(figsize=(10, 6) )
#     if (color):
#         #plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB) ) # need to change this colorspace to not use opencv - tbd
#     else:
#         plt.imshow(img, cmap="gray") # grayscale
#     plt.show()


# more coming

*__dataset directory paths__*

In [4]:

# CSV with one row per movie; read into `dataset_orig_df` with pd.read_csv.
dataset_path = "../dataset/MovieGenre.csv"

# Folder of sample poster images.
# NOTE(review): not referenced in the visible cells — the download cell writes
# to "../dataset/posters/" instead; confirm which folder is intended.
posters_images_path = "../dataset/SampleMoviePosters/"


*__import datasets__*

In [5]:

# Load the raw movie table. This frame is kept unmodified so that later cells
# can compare its row count against the cleaned copy.
dataset_orig_df = pd.read_csv(dataset_path)

# Preview every 3000th row.
dataset_orig_df[::3000]


Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,Poster
0,114709,http://www.imdb.com/title/tt114709,Toy Story (1995),8.3,Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...
3000,24852,http://www.imdb.com/title/tt24852,Babes in Toyland (1934),7.4,Comedy|Family|Fantasy,https://images-na.ssl-images-amazon.com/images...
6000,84302,http://www.imdb.com/title/tt84302,Maratonci trce pocasni krug (1982),9.0,Comedy|Drama,https://images-na.ssl-images-amazon.com/images...
9000,100280,http://www.imdb.com/title/tt100280,Nuns on the Run (1990),5.9,Comedy|Crime,https://images-na.ssl-images-amazon.com/images...
12000,42286,http://www.imdb.com/title/tt42286,Broken Arrow (1950),7.2,Drama|Romance|Western,https://images-na.ssl-images-amazon.com/images...
15000,1391579,http://www.imdb.com/title/tt1391579,The Dark Lurking (2009),3.4,Action|Horror|Sci-Fi,https://images-na.ssl-images-amazon.com/images...
18000,49800,http://www.imdb.com/title/tt49800,Storm Center (1956),7.1,Drama,https://images-na.ssl-images-amazon.com/images...
21000,2111478,http://www.imdb.com/title/tt2111478,Mea Maxima Culpa: Silence in the House of God ...,8.1,Documentary,https://images-na.ssl-images-amazon.com/images...
24000,477449,http://www.imdb.com/title/tt477449,MC5: Kick Out the Jams (1999),7.5,Documentary|Short|Music,https://images-na.ssl-images-amazon.com/images...
27000,57262,http://www.imdb.com/title/tt57262,Love Is a Ball (1963),6.0,Comedy|Romance,https://images-na.ssl-images-amazon.com/images...


In [6]:
dataset_orig_df.shape

(40108, 6)

## 1. Preprocessing

---

### A.  *__drop rows where None or Null exist__*

In [53]:

# Cleaned working copy: dropna() with its defaults removes every row that has
# a missing value in any column and leaves dataset_orig_df untouched.
dataset_edit_df = dataset_orig_df.dropna()

print(dataset_edit_df.shape) # shape before the broken-link rows are dropped further below


(39246, 6)


In [54]:
# check before drop below
#dataset_edit_df[ dataset_edit_df["imdbId"] == 1174047 ]

Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,Poster
17176,1174047,http://www.imdb.com/title/tt1174047,Stella (2008),7.2,Drama,https://images-na.ssl-images-amazon.com/images...


__*drop additional rows whose image links don't work*__

In [55]:
# imdbIds of the movies whose poster links are broken
broken_img_link_imbdID = [59255, 1174047]
#broken_row = dataset_edit_df[ dataset_edit_df["imdbId"] == 59255 ]

b = 1174047

# Index labels of the rows to remove. Label 17176 is imdbId 1174047 (shown by
# the check cell above); 8138 is presumably imdbId 59255 — verify with the
# commented-out lookup above.
broken_row_labels = [8138, 17176]

# Non-inplace drop with errors="ignore": re-running this cell is a no-op
# instead of a KeyError on the already-removed labels, and the frame returned
# by dropna() is not mutated in place.
dataset_edit_df = dataset_edit_df.drop(index=broken_row_labels, errors="ignore")

print(dataset_edit_df.shape)


(39244, 6)


In [57]:
# check after drop above
#dataset_edit_df[ dataset_edit_df["imdbId"] == 1174047 ]

In [58]:

orig_df_size = dataset_orig_df.shape[0] # row count of the raw table
mod_df_size = dataset_edit_df.shape[0] # row count after all drops so far

# NOTE: the difference counts every row removed so far — the rows removed by
# dropna() plus the broken-poster-link rows dropped afterwards — so it is
# slightly larger than the Null/None-only count the message suggests.
print("Number of rows dropped (contains Null/None):", orig_df_size - mod_df_size)


Number of rows dropped (contains Null/None): 864


In [59]:

dataset_edit_df[::3000]


Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,Poster
0,114709,http://www.imdb.com/title/tt114709,Toy Story (1995),8.3,Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...
3028,120363,http://www.imdb.com/title/tt120363,Toy Story 2 (1999),7.9,Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...
6033,83107,http://www.imdb.com/title/tt83107,Some Kind of Hero (1982),5.6,Comedy|Crime|Drama,https://images-na.ssl-images-amazon.com/images...
9037,102984,http://www.imdb.com/title/tt102984,Stone Cold (1991),5.9,Action|Crime|Drama,https://images-na.ssl-images-amazon.com/images...
12040,402906,http://www.imdb.com/title/tt402906,Ceskí_ sen (2004),7.5,Documentary|Comedy,https://images-na.ssl-images-amazon.com/images...
15050,1023441,http://www.imdb.com/title/tt1023441,Coco Chanel & Igor Stravinsky (2009),6.4,Drama|Music|Romance,https://images-na.ssl-images-amazon.com/images...
18105,219333,http://www.imdb.com/title/tt219333,Suspicious River (2000),5.7,Drama,https://images-na.ssl-images-amazon.com/images...
21220,1820488,http://www.imdb.com/title/tt1820488,Rebelle (2012),7.1,Drama|War,https://images-na.ssl-images-amazon.com/images...
24306,41144,http://www.imdb.com/title/tt41144,Bad Boy (1949),6.7,Drama,https://images-na.ssl-images-amazon.com/images...
27386,38239,http://www.imdb.com/title/tt38239,Week-End at the Waldorf (1945),6.7,Comedy|Drama|Musical,https://images-na.ssl-images-amazon.com/images...


---

### B. __*Label preprocessing (genre)*__

__*get label column*__

In [60]:

# Raw genre column: each value is a "|"-separated list of genres,
# e.g. "Animation|Adventure|Comedy".
label = dataset_edit_df["Genre"]
label[::3000]  # preview every 3000th entry


0        Animation|Adventure|Comedy
3028     Animation|Adventure|Comedy
6033             Comedy|Crime|Drama
9037             Action|Crime|Drama
12040            Documentary|Comedy
15050           Drama|Music|Romance
18105                         Drama
21220                     Drama|War
24306                         Drama
27386          Comedy|Drama|Musical
30481                 Action|Comedy
33552               Horror|Thriller
36683                Comedy|Romance
39858                   Crime|Drama
Name: Genre, dtype: object

In [61]:

label.shape


(39244,)

__*split the label to only show one genre*__

In [62]:

# keeps only the first/main genre of a "|"-separated genre string - sub-genres are discarded
def get_one_genre(row):
    """Return the part of ``row`` that precedes the first ``|``.

    A string without any ``|`` is returned unchanged.
    """
    main_genre, _separator, _sub_genres = row.partition("|")
    return main_genre

# Reduce every genre string to its first genre, giving one class label per movie.
label_edit = label.apply(get_one_genre)


In [63]:

label_edit[::3000]


0          Animation
3028       Animation
6033          Comedy
9037          Action
12040    Documentary
15050          Drama
18105          Drama
21220          Drama
24306          Drama
27386         Comedy
30481         Action
33552         Horror
36683         Comedy
39858          Crime
Name: Genre, dtype: object

__*count of each genre*__

In [64]:

label_edit.value_counts()


Drama          9775
Comedy         9654
Action         4722
Documentary    3521
Crime          2546
Horror         1970
Adventure      1674
Animation      1624
Biography      1222
Thriller        397
Short           390
Western         315
Fantasy         272
Mystery         256
Family          243
Romance         203
Sci-Fi          192
Musical         115
War              65
Music            36
History          24
Film-Noir        16
Adult             7
Talk-Show         3
Sport             2
Name: Genre, dtype: int64

__*one-hot-encode labels*__

In [65]:

# one hot encoding: one indicator column per genre, one row per movie.
# dtype is pinned because the default result dtype of get_dummies depends on
# the pandas version (bool since pandas 2.0); np.uint8 keeps the 0/1 matrix
# on every version.
label_ohe = pd.get_dummies(label_edit, dtype=np.uint8)

label_ohe.columns


Index(['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir',
       'History', 'Horror', 'Music', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
       'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western'],
      dtype='object')

In [66]:

label_ohe[::3000] # <---- this is our label for when we do training/testing


Unnamed: 0,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,...,Musical,Mystery,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3028,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6033,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9037,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12040,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
15050,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
18105,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
21220,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
24306,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
27386,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


---

### C. __*Poster Preprocessing*__

*__gather poster links__*

In [67]:

# Poster image URLs of the remaining rows; shares its index with dataset_edit_df.
poster_links = dataset_edit_df['Poster']

poster_links.shape


(39244,)

In [68]:

# Take the first remaining poster url by position. poster_links[0] would be a
# label lookup, which only works as long as the row labelled 0 has not been
# dropped during preprocessing.
sample_poster_url = poster_links.iloc[0]

print("Sample image url:\n", sample_poster_url)


Sample image url:
 https://images-na.ssl-images-amazon.com/images/M/MV5BMDU2ZWJlMjktMTRhMy00ZTA5LWEzNDgtYmNmZTEwZTViZWJkXkEyXkFqcGdeQXVyNDQ2OTk4MzI@._V1_UX182_CR0,0,182,268_AL_.jpg


In [None]:
# Download every poster to disk as "<imdbId>.jpg" inside poster_folder.
poster_folder = "../dataset/posters/"
file_extension = ".jpg"

file_name_ids = dataset_edit_df["imdbId"] # for naming the image file

# Create the target folder up front; otherwise the first write would fail
# with FileNotFoundError on a fresh checkout.
Path(poster_folder).mkdir(parents=True, exist_ok=True)

for name, link in zip(file_name_ids, poster_links):
    file_name = str(name) + file_extension
    check_path = Path(poster_folder + file_name)
    # Skip posters that are already on disk so the cell can be re-run cheaply.
    if check_path.is_file():
        continue
    try:
        # The timeout keeps one stalled server from blocking the whole loop.
        result = requests.get(link, timeout=30)
    except requests.RequestException as err:
        print("Skipping", file_name, "- request failed:", err)
        continue
    # Only store real image payloads; an error page saved as .jpg would later
    # be treated as an existing poster and break image loading.
    if result.status_code != 200:
        print("Skipping", file_name, "- HTTP status", result.status_code)
        continue
    # write_bytes opens, writes and closes the file (no leaked file handle).
    check_path.write_bytes(result.content)

__*save images to disk*__