# **Preprocessing - Movie Dataset**

# Step 1: Environment Setup

## Install packages

In [1]:
!pip install opencv-Python              # OpenCV
!pip install tmdbv3api                  # TMDB API
!pip install -U azureml-fsspec mltable  # Azure ML filesystem

Requirement already up-to-date: azureml-fsspec in /anaconda/envs/azureml_py38/lib/python3.8/site-packages (0.1.0b3)
Requirement already up-to-date: mltable in /anaconda/envs/azureml_py38/lib/python3.8/site-packages (1.0.0)


## Import libraries

In [7]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
import io
import ast
import cv2
import requests
from bs4 import BeautifulSoup

# Azure
from azure.storage.blob import BlobServiceClient
from azureml.fsspec import AzureMachineLearningFileSystem

# TMDB API
from tmdbv3api import TMDb
from tmdbv3api import Movie

# Scikit learn
from sklearn.model_selection import train_test_split

## Set variables

In [8]:
# Azure Blob Container info
storage_account_url = 'https://umbcmlstorage.blob.core.windows.net/'
storage_account_key = 'Cj2gFlo6CTUhwnQIXEuun0i0NX0Yn1uid3AGrNJ62PKZ6rGXpTUEovv+EcN2jX0sQwxzFvpwGbFZ+AStJJdFuQ=='
blob_service_client_instance = BlobServiceClient(account_url = storage_account_url, credential = storage_account_key)

# AzureML workspace details
subscription = '9e456110-c6ac-44a7-81cf-5b26a6935c02'
resource_group = 'umbc-machine-learning'
workspace = 'umbc-ml-workspace'
datastore_name = 'azuremldatastore'
datastore_path = f'azureml://subscriptions/{subscription}/resourcegroups/{resource_group}/workspaces/{workspace}/datastores/{datastore_name}'

# IMDB API info
tmdb = TMDb()
tmdb.api_key = '42805a1d321210f77a2812dbcaaffdf5'

## Set styles

In [9]:
# Define styles
style.use('seaborn-poster')
style.use('ggplot')

# Step 2: Get Metadata

## Read movies metadata

### Read movie metadata into dataframe

In [10]:
# Read movie metadata into dataframe
data = pd.read_csv(f'{datastore_path}/paths/movies_metadata.csv')
data.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


### Filter out the title, overview, genres and movie id
Picking out the title, overview, genres and movie id from the original csv. The ID was picked to find the exact movie poster on TMDB to download.

In [11]:
# Filtered dataset
movie_data = data.filter(['original_title','overview', 'id', 'genres'], axis=1)
print(f' Data count: {len(movie_data)}')

# List of genres
genres = movie_data['genres'].tolist()

 Data count: 45466


## Segregrate genres

### Create genre dictionary

### Remove invalid genres
Some genres seem to occur only once in the entire dataset. Hence, we remove those.

In [12]:
# Invalid genres list
invalid_genres = ['Aniplex', 'BROSTA TV', 'Carousel Productions', 'GoHands',
                  'Mardock Scramble Production Committee', 'Odyssey Media',
                  'Pulser Productions', 'Rogue State', 'Sentai Filmworks',
                  'Telescene Film Group Productions', 'The Cartel', 'Vision View Entertainment', 
                  'TV Movie', 'Foreign']

# Create genre dictionary
genre_per_movie = []
genre_dict = {}

for genre in genres:
  genre = ast.literal_eval(genre)
  genre_list = []

  for val in genre:
    if val['name'] not in invalid_genres:
      genre_list.append(val['name'])

      if val['name'] in genre_dict:
        genre_dict[val['name']] += 1
      else:
        genre_dict[val['name']] = 1

  genre_per_movie.append(genre_list)


# Droping genres of single movie
genre_dict = {key:val for key, val in genre_dict.items() if val != 1}

# Genre count
print(f'Genre count: {len(genre_per_movie)}\n')
genre_dict


Genre count: 45466



{'Animation': 1935,
 'Comedy': 13182,
 'Family': 2770,
 'Adventure': 3496,
 'Fantasy': 2313,
 'Romance': 6735,
 'Drama': 20265,
 'Action': 6596,
 'Crime': 4307,
 'Thriller': 7624,
 'Horror': 4673,
 'History': 1398,
 'Science Fiction': 3049,
 'Mystery': 2467,
 'War': 1323,
 'Music': 1598,
 'Documentary': 3932,
 'Western': 1042}

## Update metadata

### Add genre list to metadata

In [23]:
# Add genre list in dataframe
movie_data['genre_list'] = genre_per_movie
movie_data.head()

Unnamed: 0,original_title,overview,id,genres,genre_list
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",862,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[Animation, Comedy, Family]"
1,Jumanji,When siblings Judy and Peter discover an encha...,8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,A family wedding reignites the ancient feud be...,15602,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[Romance, Comedy]"
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",31357,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[Comedy, Drama, Romance]"
4,Father of the Bride Part II,Just when George Banks has recovered from his ...,11862,"[{'id': 35, 'name': 'Comedy'}]",[Comedy]


### Remove movies with empty genre

In [24]:
# Total movies
print(f'Total movies : {len(movie_data)}')

# Removing any rows where the genres column is empty
movie_data = movie_data[movie_data['genre_list'].map(lambda d: len(d)) > 0]
print(f'Total movies with genres : {len(movie_data)}')

# Check movies with empty genre
total_empty = 0
for genre in genre_per_movie:
  if not len(genre):
    total_empty += 1

print(f'Total movies with empty genre : {total_empty}\n')

Total movies : 45466
Total movies with genres : 42995
Total movies with empty genre : 2471



# Step 3: Download Posters

Function to download and save posters
<br>An error is inserted as the poster path if the movie ID cannot be identified or does not have a poster so that we can drop those records later. 
<br>Following the download of each poster, the corresponding poster paths are added to the dataset.

In [25]:
def get_posters(url, name):

  try:
    baseurl = 'http://image.tmdb.org/t/p/w185'
    link = baseurl + url
    print(link)

    img = requests.get(link).content
    name = name.replace('/', '_')

    # Upload to Azure Blob Storage
    file_name = str(name) + ".jpg"
    tmdb_poster_container = 'imdbdataset'
    tmdb_poster_instance = blob_service_client_instance.get_blob_client(tmdb_poster_container, file_name)
    tmdb_poster_instance.upload_blob(img)  

    return tmdb_poster_instance.url

  except Exception as e:
    print('[ERROR]', str(e))
    return "Error"

In [35]:
poster_paths = []
movie = Movie()

i = 1
for id, name in zip(movie_data['id'].tolist(), movie_data['original_title'].tolist()):
  
  try:
    print(id, name)
    m = movie.details(int(id))
    print('Downloading:', name, "i", i)
    poster_paths.append(get_posters(m.poster_path, name))
    
  except Exception as e:
    print('[ERROR]', str(e))
    poster_paths.append("API Error")
    
  i += 1

print(len(poster_paths))

862 Toy Story
Downloading: Toy Story i 1
http://image.tmdb.org/t/p/w185/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg
8844 Jumanji
Downloading: Jumanji i 2
http://image.tmdb.org/t/p/w185/vgpXmVaVyUL7GGiDeiK1mKEKzcX.jpg
15602 Grumpier Old Men
Downloading: Grumpier Old Men i 3
http://image.tmdb.org/t/p/w185/1FSXpj5e8l4KH6nVFO5SPUeraOt.jpg
31357 Waiting to Exhale
Downloading: Waiting to Exhale i 4
http://image.tmdb.org/t/p/w185/4uw6HKq4vlhrSVp0zkgd4zCy4Pf.jpg
11862 Father of the Bride Part II
Downloading: Father of the Bride Part II i 5
http://image.tmdb.org/t/p/w185/rj4LBtwQ0uGrpBnCELr716Qo3mw.jpg
949 Heat
Downloading: Heat i 6
http://image.tmdb.org/t/p/w185/umSVjVdbVwtx5ryCA2QXL44Durm.jpg
11860 Sabrina
Downloading: Sabrina i 7
http://image.tmdb.org/t/p/w185/z1oNjotUI7D06J4LWQFQzdIuPnf.jpg
45325 Tom and Huck
Downloading: Tom and Huck i 8
http://image.tmdb.org/t/p/w185/vIG8hWOa7DyLMRiurzKwVAnIYoU.jpg
9091 Sudden Death
Downloading: Sudden Death i 9
http://image.tmdb.org/t/p/w185/1pylO6YX5XdOA6QCc5IRxrrf

## Add poster paths to metadata

In [None]:
# Add poster paths to metadata
movie_data['poster_paths'] = poster_paths
movie_data.head()

## Remove movies with invalid poster paths

In [232]:
# Removing all rows where the poster_paths are invalid
print(f'Overall poster paths : {len(movie_data)}')

# Invalid paths
invalid_paths = len(movie_data[(movie_data['poster_paths'] == "API Error") | (movie_data['poster_paths'] == "Error")])
print(f"Invalid paths : {invalid_paths}")

# Valid paths
movie_data = movie_data[movie_data['poster_paths'] != "API Error"]
movie_data = movie_data[movie_data['poster_paths'] != "Error"]
print(f'Valid paths : {len(movie_data)}')

movie_data.head()

Overall poster paths : 42995
Invalid paths : 3007
Valid paths : 39988


Unnamed: 0,original_title,overview,id,genres,genre_list,poster_paths
0,Toy Story,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",862,"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]","[Animation, Comedy, Family]",https://umbcmlstorage.blob.core.windows.net/imdbdataset/Toy%20Story.jpg
1,Jumanji,"When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures.",8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]","[Adventure, Fantasy, Family]",https://umbcmlstorage.blob.core.windows.net/imdbdataset/Jumanji.jpg
2,Grumpier Old Men,"A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the locals who worry she'll scare the fish away. But she's less interested in seafood than she is in cooking up a hot time with Max.",15602,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]","[Romance, Comedy]",https://umbcmlstorage.blob.core.windows.net/imdbdataset/Grumpier%20Old%20Men.jpg
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusive ""good man"" to break a string of less-than-stellar lovers. Friends and confidants Vannah, Bernie, Glo and Robin talk it all out, determined to find a better way to breathe.",31357,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]","[Comedy, Drama, Romance]",https://umbcmlstorage.blob.core.windows.net/imdbdataset/Waiting%20to%20Exhale.jpg
4,Father of the Bride Part II,"Just when George Banks has recovered from his daughter's wedding, he receives the news that she's pregnant ... and that George's wife, Nina, is expecting too. He was planning on selling their home, but that's a plan that -- like George -- will have to change with the arrival of both a grandchild and a kid of his own.",11862,"[{'id': 35, 'name': 'Comedy'}]",[Comedy],https://umbcmlstorage.blob.core.windows.net/imdbdataset/Father%20of%20the%20Bride%20Part%20II.jpg


## Save metadata file to datastore

In [None]:
# Write filtered data into datastore
output = io.StringIO()
output = movie_data.to_csv()

ml_container = 'azureml'
dataset_file = 'dataset_mod.csv'

dataset_mod_instance = blob_service_client_instance.get_blob_client(ml_container, dataset_file)
dataset_mod_instance.upload_blob(output)

## Read metadata from file

In [5]:
# Read filtered data from blob storage
movie_data = pd.read_csv(f'{datastore_path}/paths/dataset_mod.csv')
movie_data = movie_data.iloc[:,1:]
movie_data.head()

Unnamed: 0,original_title,overview,id,genres,genre_list,poster_paths
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",862,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","['Animation', 'Comedy', 'Family']",https://umbcmlstorage.blob.core.windows.net/im...
1,Jumanji,When siblings Judy and Peter discover an encha...,8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","['Adventure', 'Fantasy', 'Family']",https://umbcmlstorage.blob.core.windows.net/im...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...,15602,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","['Romance', 'Comedy']",https://umbcmlstorage.blob.core.windows.net/im...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",31357,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","['Comedy', 'Drama', 'Romance']",https://umbcmlstorage.blob.core.windows.net/im...
4,Father of the Bride Part II,Just when George Banks has recovered from his ...,11862,"[{'id': 35, 'name': 'Comedy'}]",['Comedy'],https://umbcmlstorage.blob.core.windows.net/im...


In [6]:
# Dropping 'Drama' and 'Comedy' records to maintain the overall average
movie_data.drop(movie_data[movie_data['genre_list'].str.contains('Drama')].head(13000).index, inplace = True)
movie_data.drop(movie_data[movie_data['genre_list'].str.contains('Comedy')].head(5000).index, inplace = True)

# Step 4: Split data into Train/Test/Validation sets

In [7]:
# Splitting the data into training, testing and validation sets
train, test = train_test_split(movie_data, test_size=0.2)
train, val = train_test_split(train, test_size=0.1)

## Save split dataset files into blob storage

In [9]:
# Write test, train and validation data into datastore
ml_container = 'azureml'
df_list = [val, test, train]
df_name_list = ['val', 'test', 'train']

for item, item_name in zip(df_list, df_name_list):
    output = io.StringIO()
    output = item.to_csv()
    data_file = item_name+'_data.csv'
    data_instance = blob_service_client_instance.get_blob_client(ml_container, data_file)
    data_instance.upload_blob(output)


# Step 5: Resize images

## Resize poster images

In [10]:
def resize_img(title):

  blob_name = title+'.jpg'
  blob = blob_service_client_instance.get_blob_client('imdbdataset', blob_name)

  try:
    with open(blob_name, "wb") as my_blob:
      blob_data = blob.download_blob()
      blob_data.readinto(my_blob)
      
    # Resize images using OpenCV
    img = cv2.imread(blob_name)
    img = cv2.resize(img, (75, 115))
    img = img.astype(np.float32)/255
    return img

  except Exception as e:
    print(str(e))
    return None

In [11]:
# Resize validation images
val_imgs = []
i = 0

for index, row in val.iterrows():
  if i%100 == 0:
    print("Processing i:", i, str(row['poster_paths']))

  img = resize_img(str(row['original_title']))

  # Resize if poster is present else drop record
  if img is not None:
    val_imgs.append(img)
  else:
    val = val.drop(index)
    print("Dropping row:", index, "Length:", len(val))

  i += 1

assert len(val) == len(val_imgs)
val_np_imgs = np.array(val_imgs)
val_np_imgs.shape

Processing i: 0 https://umbcmlstorage.blob.core.windows.net/imdbdataset/Before%20Stonewall.jpg
Processing i: 100 https://umbcmlstorage.blob.core.windows.net/imdbdataset/Blood%20Freak.jpg
Processing i: 200 https://umbcmlstorage.blob.core.windows.net/imdbdataset/Hummingbird.jpg
Processing i: 300 https://umbcmlstorage.blob.core.windows.net/imdbdataset/The%20Thing%20That%20Couldn%27t%20Die.jpg
[Errno 2] No such file or directory: 'Girl Walk // All Day.jpg'
Dropping row: 18749 Length: 1758
[Errno 2] No such file or directory: 'a/k/a Tommy Chong.jpg'
Dropping row: 11999 Length: 1757
Processing i: 400 https://umbcmlstorage.blob.core.windows.net/imdbdataset/Agyaat.jpg
Processing i: 500 https://umbcmlstorage.blob.core.windows.net/imdbdataset/Don%20McKay.jpg
Processing i: 600 https://umbcmlstorage.blob.core.windows.net/imdbdataset/Crisis%20Hotline%3A%20Veterans%20Press%201.jpg
Processing i: 700 https://umbcmlstorage.blob.core.windows.net/imdbdataset/Barbie%3A%20A%20Fashion%20Fairytale.jpg
Proces

(1754, 115, 75, 3)

In [12]:
# Resize test images
test_imgs = []
i = 0

for index, row in test.iterrows():
  if i%100 == 0:
    print("Processing i:", i, str(row['poster_paths']))

  img = resize_img(str(row['original_title']))

  # Resize if poster is present else drop record
  if img is not None:
    test_imgs.append(img)
  else:
    test = test.drop(index)
    print("Dropping row:", index, "Length:", len(test))

  i += 1


assert len(test) == len(test_imgs)
test_np_imgs = np.array(test_imgs)
test_np_imgs.shape

Processing i: 0 https://umbcmlstorage.blob.core.windows.net/imdbdataset/%D0%94%D0%BD%D0%B5%D0%B2%D0%BD%D0%B8%D0%BA%20%D0%B4%D0%B8%D1%80%D0%B5%D0%BA%D1%82%D0%BE%D1%80%D0%B0%20%D1%88%D0%BA%D0%BE%D0%BB%D1%8B.jpg
Processing i: 100 https://umbcmlstorage.blob.core.windows.net/imdbdataset/They%20Nest.jpg
Processing i: 200 https://umbcmlstorage.blob.core.windows.net/imdbdataset/Kill%20Squad.jpg
Processing i: 300 https://umbcmlstorage.blob.core.windows.net/imdbdataset/The%20Thompsons.jpg
Processing i: 400 https://umbcmlstorage.blob.core.windows.net/imdbdataset/%E0%B8%99%E0%B8%B2%E0%B8%87%E0%B9%84%E0%B8%A1%E0%B9%89.jpg
Processing i: 500 https://umbcmlstorage.blob.core.windows.net/imdbdataset/Rewind.jpg
Processing i: 600 https://umbcmlstorage.blob.core.windows.net/imdbdataset/%E3%82%B0%E3%82%B9%E3%82%B3%E3%83%BC%E3%83%96%E3%83%89%E3%83%AA%E3%81%AE%E4%BC%9D%E8%A8%98.jpg
Processing i: 700 https://umbcmlstorage.blob.core.windows.net/imdbdataset/12%20Days%20of%20Christmas%20Eve.jpg
Processing i: 800 

(4393, 115, 75, 3)

In [13]:
# Resize train images
train_imgs = []
i = 0

for index, row in train.iterrows():
  if i%100 == 0:
    print("Processing i:", i, str(row['poster_paths']))

  img = resize_img(str(row['original_title']))
  
  # Resize if poster is present else drop record
  if img is not None:
    train_imgs.append(img)
  else:
    train = train.drop(index)
    print("Dropping row:", index, "Length:", len(train))
  i += 1

assert len(train) == len(train_imgs)
train_np_imgs = np.array(train_imgs)
train_np_imgs.shape

Processing i: 0 https://umbcmlstorage.blob.core.windows.net/imdbdataset/Zaytoun.jpg
Processing i: 100 https://umbcmlstorage.blob.core.windows.net/imdbdataset/%E9%97%98%E7%A5%9E%E4%BC%9D.jpg
Processing i: 200 https://umbcmlstorage.blob.core.windows.net/imdbdataset/Zoo.jpg
Processing i: 300 https://umbcmlstorage.blob.core.windows.net/imdbdataset/Assassins%20Run.jpg
[Errno 2] No such file or directory: 'HK/変態仮面.jpg'
Dropping row: 29423 Length: 15830
Processing i: 400 https://umbcmlstorage.blob.core.windows.net/imdbdataset/Come%20non%20detto.jpg
Processing i: 500 https://umbcmlstorage.blob.core.windows.net/imdbdataset/%E3%83%95%E3%82%A3%E3%83%BC%E3%83%A1%E3%82%A4%E3%83%AB.jpg
Processing i: 600 https://umbcmlstorage.blob.core.windows.net/imdbdataset/La%20tumba%20de%20los%20muertos%20vivientes.jpg
Processing i: 700 https://umbcmlstorage.blob.core.windows.net/imdbdataset/Joe%20Cocker%20-%20Mad%20Dogs%20%26%20Englishmen.jpg
Processing i: 800 https://umbcmlstorage.blob.core.windows.net/imdbdata

(15796, 115, 75, 3)

## Save updated metadata

In [14]:
# Write test, train and validation data into blob storage
ml_container = 'azureml'
df_list = [val, test, train]
df_name_list = ['val', 'test', 'train']

for item, item_name in zip(df_list, df_name_list):
    output = io.StringIO()
    output = item.to_csv()
    data_file = item_name+'_data.csv'
    data_instance = blob_service_client_instance.get_blob_client(ml_container, data_file)
    data_instance.upload_blob(output, overwrite=True)

## Save numpy metadata

In [16]:
np.save('val_np_imgs.npy', val_np_imgs)
np.save('test_np_imgs.npy', test_np_imgs)
np.save('train_np_imgs.npy', train_np_imgs)