[View in Colaboratory](https://colab.research.google.com/github/davidkorea/google_colaboratory/blob/master/TGS_Salt_Identification_Challenge.ipynb)

In [0]:
#read/write image data
!pip install imageio
#deep learning library
!pip install torch
#access kaggle datasets from colab
!pip install kaggle
#model loading
!pip install ipywidgets

In [0]:
#File input output
import os
#matrix math
import numpy as np
#read/write image data
import imageio
#visualize data
import matplotlib.pyplot as plt
#data preprocessing 
import pandas as pd
#deep learning
import torch
#dataset utilities (provides the Dataset base class for our custom dataset)
from torch.utils import data
#will output the plot right below the cell that produces it
%matplotlib inline

In [19]:
# List every file in the current directory (including hidden ones) with its size.
!ls -lha

total 24K
drwxr-xr-x 1 root root 4.0K Sep 10 14:25 .
drwxr-xr-x 1 root root 4.0K Sep 10 12:49 ..
drwxr-xr-x 4 root root 4.0K Sep  6 17:27 .config
-rw-r--r-- 1 root root   64 Sep 10 14:25 kaggle (1).json
-rw-r--r-- 1 root root   64 Sep 10 12:58 kaggle.json
drwxr-xr-x 2 root root 4.0K Sep  6 17:46 sample_data


In [0]:
# The Kaggle API client expects this file to be in ~/.kaggle,
# so let's copy it there.
# With -p, mkdir also creates missing parent directories, e.g. `mkdir -p A/B`
# creates A first if it does not exist yet.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# This permissions change avoids a warning on Kaggle tool startup.
!chmod 600 ~/.kaggle/kaggle.json

In [26]:
# Confirm that kaggle.json now sits in the Kaggle config directory.
!ls /root/.kaggle/

kaggle.json


In [27]:
# Show the current working directory; the dataset files are downloaded here.
!pwd

/content


In [30]:
# Download every file of the TGS Salt Identification Challenge into the
# current directory (uses the API token placed in ~/.kaggle).
!kaggle competitions download -c tgs-salt-identification-challenge

Downloading depths.csv to /content
  0% 0.00/322k [00:00<?, ?B/s]
100% 322k/322k [00:00<00:00, 44.9MB/s]
Downloading sample_submission.csv to /content
  0% 0.00/264k [00:00<?, ?B/s]
100% 264k/264k [00:00<00:00, 66.6MB/s]
Downloading train.csv to /content
  0% 0.00/922k [00:00<?, ?B/s]
100% 922k/922k [00:00<00:00, 92.0MB/s]
Downloading test.zip to /content
 92% 150M/163M [00:00<00:00, 206MB/s]
100% 163M/163M [00:00<00:00, 217MB/s]
Downloading train.zip to /content
 76% 29.0M/37.9M [00:00<00:00, 67.8MB/s]
100% 37.9M/37.9M [00:00<00:00, 167MB/s] 


In [0]:
!ls
!unzip train.zip

In [35]:
# List the working directory again to confirm the archive was extracted
# (the images/ and masks/ folders should now be present).
!ls

depths.csv  kaggle (1).json  masks	  sample_submission.csv  train.csv
images	    kaggle.json      sample_data  test.zip		 train.zip


In [0]:
#lets create a class to represent this data, to make it easier to access

class TGSSaltDataset(data.Dataset):
    """Dataset of seismic images and their masks, both stored as PNG files.

    Every id in ``file_list`` maps to ``<root_path>/images/<id>.png`` and
    ``<root_path>/masks/<id>.png``.
    """

    def __init__(self, root_path, file_list):
        """Remember the dataset root directory and the list of file ids."""
        self.root_path, self.file_list = root_path, file_list

    def __len__(self):
        """Return how many file ids this dataset holds."""
        return len(self.file_list)

    def __getitem__(self, index):
        """Return ``(image, mask)`` as uint8 arrays for the id at ``index``.

        An index outside ``[0, len(file_list))`` does not raise: a randomly
        chosen valid sample is returned instead.
        """
        if index in range(len(self.file_list)):
            # Image and mask share the same file name in sibling folders.
            png_name = self.file_list[index] + ".png"
            image_path = os.path.join(self.root_path, "images", png_name)
            mask_path = os.path.join(self.root_path, "masks", png_name)
            # Read the image first, then the mask, each into a uint8 array.
            image, mask = (
                np.array(imageio.imread(png_path), dtype=np.uint8)
                for png_path in (image_path, mask_path)
            )
            return image, mask
        # Out-of-range request: serve a random valid sample instead.
        return self[np.random.randint(0, len(self))]

In [0]:
# Load the two competition tables from the working directory.
# depths.csv holds one z value per id.
depths_df = pd.read_csv('depths.csv')
# train.csv holds the rle_mask string per id (empty for some ids).
train_df = pd.read_csv('train.csv')

In [37]:
# Preview the first rows of the depth table.
depths_df.head()

Unnamed: 0,id,z
0,4ac19fb269,306
1,1825fadf99,157
2,f59821d067,305
3,5b435fad9d,503
4,e340e7bfca,783


In [39]:
# Preview the first rows of the training table.
train_df.head()

Unnamed: 0,id,rle_mask
0,575d24d81d,
1,a266a2a9df,5051 5151
2,75efad62c1,9 93 109 94 210 94 310 95 411 95 511 96 612 96...
3,34e51dba6a,48 54 149 54 251 53 353 52 455 51 557 50 659 4...
4,4875705fb0,1111 1 1212 1 1313 1 1414 1 1514 2 1615 2 1716...


In [43]:
# Look up the depth row that belongs to one specific image id.
depths_df.loc[depths_df['id'].eq('a266a2a9df')]

Unnamed: 0,id,z
2332,a266a2a9df,794


In [0]:
# Attach each training row's depth (z) by joining on the image id.
# The key and join type are spelled out so the result cannot silently change
# if the two frames ever gain another column with the same name.
merge_df = train_df.merge(depths_df, on='id', how='inner')

In [42]:
# Display the merged table (id, rle_mask, z).
# NOTE(review): consider merge_df.head() here to keep the rendered output short.
merge_df

Unnamed: 0,id,rle_mask,z
0,575d24d81d,,843
1,a266a2a9df,5051 5151,794
2,75efad62c1,9 93 109 94 210 94 310 95 411 95 511 96 612 96...,468
3,34e51dba6a,48 54 149 54 251 53 353 52 455 51 557 50 659 4...,727
4,4875705fb0,1111 1 1212 1 1313 1 1414 1 1514 2 1615 2 1716...,797
5,782ae9b7e7,1 1815 1819 90 1920 81 2021 73 2122 64 2223 55...,677
6,9842f69f8d,,907
7,aa94cfb806,1 28 102 28 203 29 304 30 405 32 506 33 607 34...,754
8,50d3073821,1 2121 9293 909,810
9,28f865caaa,,147
