**Import Libraries**

In [None]:
import numpy as np
import pandas as pd
import glob
import urllib
from urllib import request
from matplotlib import pyplot as plt
import requests
import os
import cv2
from collections import Counter
from IPython.display import display, Image, SVG, Math, YouTubeVideo
from tqdm.notebook import tqdm
%run utils.ipynb

**Connect to Drive**

Uncomment the cell below if you are running this notebook on Google Colab.

In [None]:
# from google.colab import drive
# from google.colab.patches import cv2_imshow
# drive.mount('/content/drive/')
# os.chdir(config.PATH)

**Loading the Dataset**

The images themselves are not downloaded yet; at this point the dataset only contains their URLs.

In [None]:
# Load every shard of each table (files matching "<name>.tsv*" under `path`)
# and stack the shards into a single DataFrame per table, stored in `datasets`.
path = './'
documents = ['photos', 'keywords']
datasets = {}

for doc in documents:
    # glob returns matches in arbitrary order; sort them so the concatenated
    # row order (and any positional slicing done on it later) is reproducible.
    files = sorted(glob.glob(path + doc + ".tsv*"))
    if not files:
        # Fail with an actionable message instead of pd.concat's generic
        # "No objects to concatenate" error.
        raise FileNotFoundError(f"No '{doc}.tsv*' files found under '{path}'")

    subsets = []
    for filename in files:
        # Shards are tab-separated with a header row.
        df = pd.read_csv(filename, sep = '\t', header = 0)
        subsets.append(df)

    # ignore_index=True gives the combined frame a fresh 0..n-1 index.
    datasets[doc] = pd.concat(subsets, axis = 0, ignore_index = True)

In [None]:
# Narrow the photos table to its three leading columns, report the resulting
# dimensions, and preview the first rows.
photos_df = datasets['photos'].iloc[:, 0:3]
print(photos_df.shape)
photos_df.head(5)

In [None]:
# Keep the first three columns of the keywords table, report its dimensions,
# and preview only the first rows (matching the photos preview) rather than
# rendering the whole frame.
key_words_df = datasets['keywords'].iloc[:, :3]
print(key_words_df.shape)
key_words_df.head()

In [None]:
# Keep only the keyword rows whose `ai_service_1_confidence` value exceeds 99.
confident_keywords = key_words_df[key_words_df['ai_service_1_confidence'] > 99]
print(confident_keywords.shape)
# End the cell with the frame itself so the notebook renders a rich table
# preview instead of print()'s plain-text dump.
confident_keywords.head()

In [None]:
# Join each high-confidence keyword row to its photo record via `photo_id`
# (pandas' default inner join), then keep just the keyword and the image URL.
merged_photos_links_and_confident_keywords = pd.merge(
    confident_keywords,
    datasets['photos'],
    on = 'photo_id',
)
tagged_links = merged_photos_links_and_confident_keywords[['keyword', 'photo_image_url']]

# The ten most frequent keywords, as (keyword, count) pairs and as a plain list.
most_tagged_words = Counter(tagged_links['keyword']).most_common(10)
key_words = [word for word, _ in most_tagged_words]

# Restrict to rows tagged with one of those keywords.
commons_df = tagged_links.loc[tagged_links['keyword'].isin(key_words)]

# One indicator column per keyword, summed per image URL so that each URL
# ends up as a single row; the URL is restored as a regular column.
dummies_per_url_df = (
    pd.get_dummies(commons_df, columns = ['keyword'], prefix = ['keyword'])
    .groupby(['photo_image_url'])
    .sum()
    .reset_index()
)
dummies_per_url_df.head()

**Saving the arrays to files**

In [None]:
# Download the images in 13 batches of up to 1000 rows each and pickle every
# batch as two files: y<k>.pkl (label columns) and X<k>.pkl (image arrays).
# Rows beyond the first 13,000 of `dummies_per_url_df` are not processed.

# Registers `progress_apply` on pandas objects; once is enough, so it is
# hoisted out of the loop.
tqdm.pandas()

for i in range(13):
    test = dummies_per_url_df.iloc[(1000 * i):(1000 * (i + 1))].copy()
    # `image_url_to_numpy_array_urllib` is not defined in this notebook
    # (presumably it comes from utils.ipynb -- verify).
    test['numpy_array_images'] = test['photo_image_url'].progress_apply(image_url_to_numpy_array_urllib)
    # Drop rows where the helper produced the rejection marker string instead
    # of image data. The check is done per element with isinstance rather than
    # `column != '<marker>'`: comparing array-valued cells to a string yields
    # an array on recent NumPy versions, whose truth value is ambiguous and can
    # make the vectorised comparison raise.
    rejected = test['numpy_array_images'].map(
        lambda value: isinstance(value, str) and value == 'Image ratio is too rectangular'
    ).astype(bool)  # astype keeps an empty batch's mask boolean
    test = test[~rejected]
    # Columns 1..10 follow the URL column; these are the keyword indicator
    # columns when all ten top keywords are present.
    test.iloc[:,1:11].to_pickle(f'./y{i+1}.pkl')
    # The last column is the freshly added `numpy_array_images`.
    test.iloc[:,-1].to_pickle(f'./X{i+1}.pkl')