[View in Colaboratory](https://colab.research.google.com/github/ebgv/Planet--Understanding-the-Amazon-from-Space/blob/master/binary_downloading_from_kaggle.ipynb)

# Loading the data from Kaggle

In [0]:
# Install the Kaggle command-line client and write its credentials file.
# Replace both "xxx" placeholders with your own Kaggle username and API key
# before running; do not share or commit the notebook with real values filled in.

# Install / upgrade the kaggle package quietly.
!pip install -U -q kaggle
# Create the directory that holds kaggle.json (-p: no error if it already exists).
!mkdir -p ~/.kaggle
# Write the credentials JSON.
!echo '{"username":"xxx","key":"xxx"}' > ~/.kaggle/kaggle.json
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [0]:
# Create the download directory relative to the current working directory
# (-p: no error if it already exists).
!mkdir -p data

In [4]:
# train_v2.csv 

# downloading from kaggle 
% cd /content
!kaggle competitions download -c planet-understanding-the-amazon-from-space -f train_v2.csv -p data
# unzipping
% cd /content/data
!unzip train_v2.csv.zip -d /content/data

/content
Downloading train_v2.csv.zip to data
  0% 0.00/159k [00:00<?, ?B/s]
100% 159k/159k [00:00<00:00, 54.6MB/s]
/content/data
Archive:  train_v2.csv.zip
  inflating: /content/data/train_v2.csv  
   creating: /content/data/__MACOSX/
  inflating: /content/data/__MACOSX/._train_v2.csv  


In [5]:
# train-jpg.tar.7z
# this should take a little time - loading all the images 

# downloading from kaggle 
% cd /content
!kaggle competitions download -c planet-understanding-the-amazon-from-space -f train-jpg.tar.7z -p data
# unzipping 
% cd /content/data
!7z x -so train-jpg.tar.7z | tar xf - -C /content/data

/content
Downloading train-jpg.tar.7z to data
100% 599M/600M [00:05<00:00, 146MB/s]
100% 600M/600M [00:05<00:00, 112MB/s]
/content/data


# Reading the data

In [0]:
import pandas as pd

In [7]:
# Load the image-name / tag table and preview its first rows.
train_label = pd.read_csv(filepath_or_buffer='/content/data/train_v2.csv')
train_label.head(n=5)

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road


In [8]:
# Build a 'weather_tags' column: for every image keep only the words of its
# tag string that are weather categories and join them (no separator).
weather_categories = ['partly_cloudy', 'haze', 'cloudy', 'clear']
weather_tag_list = [
    [word for word in tag.split() if word in weather_categories]
    for tag in train_label['tags']
]
train_label['weather_tags'] = list(map(''.join, weather_tag_list))
train_label.head()

Unnamed: 0,image_name,tags,weather_tags
0,train_0,haze primary,haze
1,train_1,agriculture clear primary water,clear
2,train_2,clear primary,clear
3,train_3,clear primary,clear
4,train_4,agriculture clear habitation primary road,clear


In [10]:
# Keep only the rows whose weather tag is exactly 'clear'; .copy() gives an
# independent frame so columns can be added to it later.
train_clear = train_label.loc[train_label['weather_tags'].eq('clear')].copy()
train_clear.head()

Unnamed: 0,image_name,tags,weather_tags
1,train_1,agriculture clear primary water,clear
2,train_2,clear primary,clear
3,train_3,clear primary,clear
4,train_4,agriculture clear habitation primary road,clear
6,train_6,agriculture clear cultivation primary water,clear


In [11]:
# Label each clear-weather image as 'water' or 'road'.
# Rows tagged with neither (empty string) or with both (the two words joined
# together) do not match a category name and are filtered out.
binary_categories = ['water', 'road']
binary_tag_list = [
    [word for word in tag.split() if word in binary_categories]
    for tag in train_clear['tags']
]
train_clear['binary_tags'] = list(map(''.join, binary_tag_list))
train_clear = train_clear.loc[train_clear['binary_tags'].isin(binary_categories)]
train_clear.head()

Unnamed: 0,image_name,tags,weather_tags,binary_tags
1,train_1,agriculture clear primary water,clear,water
2,train_2,clear primary,clear,
3,train_3,clear primary,clear,
4,train_4,agriculture clear habitation primary road,clear,road
6,train_6,agriculture clear cultivation primary water,clear,water


# Creating folders by class 

In [16]:
# Summary (count / unique / top / freq) of each column of the filtered table.
train_clear.describe()

Unnamed: 0,image_name,tags,weather_tags,binary_tags
count,8421,8421,8421,8421
unique,8421,129,1,2
top,train_30584,clear primary water,clear,road
freq,1,1850,8421,4607


In [17]:
train_clear['tags'].nunique(dropna=False) # number of distinct tag combinations

129

In [18]:
train_clear.shape # (number of images, number of columns)

(8421, 4)

In [20]:
# Row counts per binary class, largest class first.
(
    train_clear
    .groupby(['binary_tags'])
    .count()
    .sort_values('image_name', ascending=False)
    .head(5)
)

Unnamed: 0_level_0,image_name,tags,weather_tags
binary_tags,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
road,4607,4607,4607
water,3814,3814,3814


# Creating train/test folders

In [0]:
import shutil

In [0]:
def copy_files_to_folder(list_files, new_folder, source_folder='/content/data/train-jpg'):
    """Copy the named JPEG images into ``new_folder``, creating it if needed.

    Parameters
    ----------
    list_files : iterable of str
        Image names without extension (e.g. ``'train_1'``); ``.jpg`` is appended.
    new_folder : str
        Destination directory. A relative path is resolved against the current
        working directory. Missing parent directories are created, and an
        existing directory is reused.
    source_folder : str, optional
        Directory holding the original ``<name>.jpg`` files.

    Raises
    ------
    FileNotFoundError
        If one of the source images does not exist.
    """
    import os  # local import so the function does not depend on another cell

    # The destination must exist as a directory before copying: otherwise
    # shutil.copy would treat `new_folder` as a file name and every image
    # would overwrite that single file.
    os.makedirs(new_folder, exist_ok=True)
    for file in list_files:
        shutil.copy(os.path.join(source_folder, '{}.jpg'.format(file)), new_folder)

In [0]:
# Class names used for the per-class folders below (same value as the list
# assigned in the earlier tagging cell).
binary_categories = ['water', 'road']

In [0]:
from random import Random, sample

# Fixed seed so the shuffle, and therefore the train/test split, is identical on every run.
SPLIT_SEED = 42
# Fraction of each class assigned to the training set; the remainder is the test set.
TRAIN_FRACTION = 0.8

split_rng = Random(SPLIT_SEED)

# storing the test-train split of image_names
# groups[class_name] -> {'train_names': [...], 'test_names': [...]}
groups = dict()
for group_name in binary_categories:

    full_names = list(train_clear['image_name'][train_clear['binary_tags'] == group_name])
    # sample() with k == len(...) returns a shuffled copy of the whole list
    shuffled_names = split_rng.sample(full_names, k=len(full_names))
    # splitting image names into test-train at one cut index: both slices use
    # the same index, so every image lands in exactly one of the two lists
    split_index = int(TRAIN_FRACTION * len(shuffled_names))
    sub_groups = dict()
    sub_groups['train_names'] = shuffled_names[:split_index]
    sub_groups['test_names'] = shuffled_names[split_index:]
    groups[group_name] = sub_groups

# Train

In [26]:
% cd /content/data
! mkdir train
% cd train

/content/data
/content/data/train


In [27]:
%cd /content/data/train

for group_name in binary_categories:
  image_names = groups[group_name]['train_names'] # groups are defined above 
  copy_files_to_folder(image_names, group_name)

/content/data/train


# Test

In [28]:
% cd /content/data
! mkdir test
% cd test

/content/data
/content/data/test


In [29]:
%cd /content/data/test

for group_name in binary_categories:
  image_names = groups[group_name]['test_names'] # groups are defined above 
  copy_files_to_folder(image_names, group_name)

/content/data/test


# Creating a small subsample of the data to check the pipeline

In [30]:
% cd /content/data
! mkdir sample 

/content/data


# Train

In [31]:
% cd /content/data/sample
! mkdir train
% cd train

/content/data/sample
/content/data/sample/train


In [32]:
%cd /content/data/sample/train

sample_size = 1600 

for group_name in binary_categories:
  image_names = groups[group_name]['train_names'][:sample_size]
  copy_files_to_folder(image_names, group_name)

/content/data/sample/train


# Test

In [33]:
% cd /content/data/sample
! mkdir test
% cd test

/content/data/sample
/content/data/sample/test


In [34]:
%cd /content/data/sample/test

sample_size = 400 

for group_name in binary_categories:
  image_names = groups[group_name]['test_names'][:sample_size]
  copy_files_to_folder(image_names, group_name)

/content/data/sample/test
