### Mount Google Drive

In [0]:
# Mount Google Drive inside the Colab VM at /content/drive; later cells read
# CSVs from, and write zip archives to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


### Upload Kaggle API token (kaggle.json)

In [0]:
#@title
# Colab's file access feature
from google.colab import files

#retrieve uploaded file
uploaded = files.upload()

#print results
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 71 bytes


### Download competition files

In [0]:
# Download all files of the competition into the current working directory
# using the Kaggle CLI (requires the kaggle.json credentials installed above).
!kaggle competitions download -c understanding_cloud_organization

Downloading train_images.zip to /content
100% 3.43G/3.44G [01:23<00:00, 57.7MB/s]
100% 3.44G/3.44G [01:23<00:00, 44.3MB/s]
Downloading test_images.zip to /content
100% 2.30G/2.30G [01:20<00:00, 44.3MB/s]
100% 2.30G/2.30G [01:20<00:00, 30.7MB/s]
Downloading train.csv.zip to /content
 76% 41.0M/54.2M [00:02<00:16, 856kB/s]
100% 54.2M/54.2M [00:02<00:00, 20.4MB/s]
Downloading sample_submission.csv to /content
  0% 0.00/321k [00:00<?, ?B/s]
100% 321k/321k [00:00<00:00, 103MB/s]


### Unzip files

In [0]:
# Extract each image archive into its own *_raw folder (-q: quiet), and the
# training CSV into the working directory.
!unzip -q train_images.zip -d train_images_raw/
!unzip -q test_images.zip -d test_images_raw/
!unzip -q train.csv.zip
# List the working directory to confirm the extraction.
!ls

drive		       test_images_raw	train.csv.zip
sample_data	       test_images.zip	train_images_raw
sample_submission.csv  train.csv	train_images.zip


### Dependencies

In [0]:
import os
import cv2
import shutil
import pandas as pd
import multiprocessing as mp

In [0]:
# Data folder on the mounted Drive; the CSV paths below all point directly into it
base_path = '/content/drive/My Drive/Colab Notebooks/[Kaggle] Understanding Clouds from Satellite Images/Data/'
train_path = os.path.join(base_path, 'train.csv')
submission_path = os.path.join(base_path, 'sample_submission.csv')
hold_out_set_path = os.path.join(base_path, 'hold-out.csv')

### Load data

In [0]:
# Read the sample submission and the hold-out split from the Drive data folder
submission = pd.read_csv(submission_path)
hold_out_set = pd.read_csv(hold_out_set_path)

print('Train samples:', hold_out_set.shape[0])
print('Test samples:', submission.shape[0])

# Preprocess data
# The text before the first underscore of 'Image_Label' is used as the image file name
submission['image'] = submission['Image_Label'].map(lambda label: label.split('_')[0])

# One row per distinct test image, tagged with the set it belongs to
test = pd.DataFrame({'image': submission['image'].unique()})
test['set'] = 'test'
hold_out_set['set'] = 'train'

display(hold_out_set.head())

Train samples: 5525
Test samples: 14792


Unnamed: 0,image,Fish_mask,Flower_mask,Gravel_mask,Sugar_mask,Fish,Flower,Gravel,Sugar,set
0,66cda54.jpg,,,,18208 624 19608 624 21008 624 22408 624 23808 ...,0,0,0,1,train
1,61d6640.jpg,,,1349079 387 1350479 387 1351879 387 1353279 38...,373839 334 375239 334 376639 334 378039 334 37...,0,0,1,1,train
2,bb31239.jpg,29 604 1429 604 2829 604 4229 604 5629 604 702...,1692065 510 1693465 510 1694865 510 1696265 51...,,,1,1,0,0,train
3,74d06fc.jpg,,,1435419 454 1436819 454 1438219 454 1439619 45...,,0,0,1,0,train
4,f13cbe0.jpg,330457 1020 331857 1020 333257 1020 334657 102...,390661 1208 392061 1208 393461 1208 394861 120...,1629705 16 1629722 1144 1631105 16 1631122 4 1...,2561203 314 2562603 314 2564003 314 2565403 31...,1,1,1,1,train


### Parameters

In [0]:
# Size, in pixels, that every image is resized to by the pre-processing functions
HEIGHT, WIDTH = 320, 480
# Not referenced by the cells shown in this notebook
CHANNELS = 3

### Auxiliary functions

In [0]:
def preprocess_image(image_id, base_path, save_path, HEIGHT=HEIGHT, WIDTH=WIDTH):
    """Resize one image and save it under the same file name in ``save_path``.

    Args:
        image_id: File name of the image (e.g. ``'66cda54.jpg'``).
        base_path: Directory prefix of the source image; concatenated directly
            with ``image_id``, so it must end with a path separator.
        save_path: Directory prefix of the output image; same convention as
            ``base_path``.
        HEIGHT: Output height in pixels.
        WIDTH: Output width in pixels.

    Raises:
        FileNotFoundError: If the source image is missing or cannot be decoded.
        IOError: If the resized image could not be written.
    """
    source_file = base_path + image_id
    target_file = save_path + image_id

    # cv2.imread does not raise on a missing/corrupt file, it returns None.
    image = cv2.imread(source_file)
    if image is None:
        raise FileNotFoundError('Could not read image: ' + source_file)

    # NOTE(review): cv2.imwrite expects BGR channel order, so converting to RGB
    # here makes the saved file have red and blue swapped relative to the source.
    # Kept as-is because the already generated datasets (and anything trained on
    # them) share this ordering -- confirm before changing.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # cv2.resize takes the target size as (width, height).
    image = cv2.resize(image, (WIDTH, HEIGHT))

    # cv2.imwrite reports failure (e.g. missing output directory) through its
    # return value; surface it instead of silently producing an incomplete set.
    if not cv2.imwrite(target_file, image):
        raise IOError('Could not write image: ' + target_file)
        
def preprocess_data(df, HEIGHT=HEIGHT, WIDTH=WIDTH):
    """Resize every image listed in ``df`` into the folder of its set.

    Rows whose ``'set'`` value is ``'train'`` are read from ``train_base_path``
    and written to ``train_images_dest_path``; rows with ``'test'`` use
    ``test_base_path`` / ``test_images_dest_path``. Rows with any other value
    are skipped. The four path names are notebook-level globals and must be
    defined before this function is called.

    Args:
        df: DataFrame with an ``'image'`` column (file names) and a ``'set'``
            column.
        HEIGHT: Output height in pixels, forwarded to ``preprocess_image``.
        WIDTH: Output width in pixels, forwarded to ``preprocess_image``.
    """
    for image_id, item_set in zip(df['image'], df['set']):
        if item_set == 'train':
            # Forward the requested size; previously the arguments were accepted
            # but ignored, so the module-level defaults were always used.
            preprocess_image(image_id, train_base_path, train_images_dest_path,
                             HEIGHT=HEIGHT, WIDTH=WIDTH)
        elif item_set == 'test':
            preprocess_image(image_id, test_base_path, test_images_dest_path,
                             HEIGHT=HEIGHT, WIDTH=WIDTH)

def pre_process_set(df, preprocess_fn, n_workers=None):
    """Apply ``preprocess_fn`` to consecutive slices of ``df`` in parallel processes.

    The frame is cut into contiguous, near-equal row slices and each slice is
    handed to one call of ``preprocess_fn`` in a ``multiprocessing`` pool.

    Args:
        df: DataFrame to split row-wise.
        preprocess_fn: Picklable callable taking one DataFrame slice. Its
            return value is discarded.
        n_workers: Maximum number of worker processes. Defaults to the number
            of CPUs; it is capped at the number of rows so that no worker
            receives an empty slice.

    Returns:
        None.
    """
    n_rows = df.shape[0]
    if n_rows == 0:
        # Nothing to dispatch; avoid starting a pool for no work.
        return

    if n_workers is None:
        n_workers = mp.cpu_count()
    n_workers = max(1, min(n_workers, n_rows))

    # Ceiling division keeps the slices balanced instead of piling the whole
    # remainder (or, with fewer rows than CPUs, every row) onto the last one.
    chunk_size = -(-n_rows // n_workers)
    chunks = [df.iloc[start:start + chunk_size]
              for start in range(0, n_rows, chunk_size)]

    # The context manager releases the worker processes even if a task raises.
    with mp.Pool(len(chunks)) as pool:
        pool.map(preprocess_fn, chunks)

### Pre-process data

In [0]:
# Raw (unzipped) image folders and the folders that receive the resized copies
train_base_path = 'train_images_raw/'
test_base_path = 'test_images_raw/'
train_images_dest_path = 'train_images/'
test_images_dest_path = 'test_images/'

# Start from empty train and test output directories:
# remove whatever a previous run left behind, then recreate each one
for dest_dir in (train_images_dest_path, test_images_dest_path):
    if os.path.exists(dest_dir):
        shutil.rmtree(dest_dir)
    os.makedirs(dest_dir)

# Pre-process train set
pre_process_set(hold_out_set, preprocess_data)

# Pre-process test set
pre_process_set(test, preprocess_data)

In [0]:
# List the working directory to check that the train_images/ and test_images/
# output folders were created next to the raw ones.
!ls

drive		       test_images	train.csv      train_images_raw
sample_data	       test_images_raw	train.csv.zip  train_images.zip
sample_submission.csv  test_images.zip	train_images


In [0]:
!zip -qr '/content/drive/My Drive/Colab Notebooks/[Kaggle] Understanding Clouds from Satellite Images/Data/train_images320x480.zip' train_images
!zip -qr '/content/drive/My Drive/Colab Notebooks/[Kaggle] Understanding Clouds from Satellite Images/Data/test_images320x480.zip' test_images