Create train and evaluate split of labeled files and write to CSVs

In [27]:
import pandas as pd
import matplotlib.pyplot as plt

In [69]:
# Import data
df = pd.read_csv('csv/all_data.csv', header=None)
df.columns = ['filename', 'label']
df.head()

Unnamed: 0,filename,label
0,gs://reliable-realm-222318-vcm/satellite_imgs/...,ship
1,gs://reliable-realm-222318-vcm/satellite_imgs/...,ship
2,gs://reliable-realm-222318-vcm/satellite_imgs/...,ship
3,gs://reliable-realm-222318-vcm/satellite_imgs/...,ship
4,gs://reliable-realm-222318-vcm/satellite_imgs/...,ship


In [70]:
# Total class dist. 
df['label'].value_counts()

no_ship    3000
ship       1000
Name: label, dtype: int64

In [71]:
# Split DataFrame
ship = df[df['label'] == 'ship']
no_ship = df[df['label'] == 'no_ship']

In [72]:
# shuffle rows
ship = ship.sample(frac=1).reset_index(drop=True)
no_ship = no_ship.sample(frac=1).reset_index(drop=True)

In [94]:
# Calculate validation rows
valid_pct = 0.25
num_ship_valid = int(ship.shape[0] * valid_pct)
num_noship_valid = int(no_ship.shape[0] * valid_pct)

# validation split
valid_df = pd.DataFrame()
valid_df = valid_df.append(ship.iloc[:num_ship_valid])
valid_df = valid_df.append(no_ship.iloc[:num_noship_valid])
valid_df = valid_df.reset_index(drop=True)
valid_df = valid_df.sample(frac=1) # shuffle again!
valid_df['label'].value_counts()

# train split
train_df = pd.DataFrame()
train_df = train_df.append(ship.iloc[num_ship_valid:])
train_df = train_df.append(no_ship.iloc[num_noship_valid:])
train_df = train_df.sample(frac=1)
train_df = train_df.reset_index(drop=True)

# Class counts for each
print('valid class:')
print(valid_df['label'].value_counts())
print()
print('train class:')
print(train_df['label'].value_counts())

valid class:
no_ship    750
ship       250
Name: label, dtype: int64

train class:
no_ship    2250
ship        750
Name: label, dtype: int64


In [95]:
# data frame info
train_df.info()
print()
valid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 2 columns):
filename    3000 non-null object
label       3000 non-null object
dtypes: object(2)
memory usage: 47.0+ KB

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 487 to 365
Data columns (total 2 columns):
filename    1000 non-null object
label       1000 non-null object
dtypes: object(2)
memory usage: 23.4+ KB


In [100]:
# Write back to CSV
train_df.to_csv('csv/train_data.csv', index=False, header=False)  # will overwrite
valid_df.to_csv('csv/valid_data.csv', index=False, header=False)

In [101]:
# !rm csv/train_imgs.csv

In [102]:
!ls -l ./csv

total 2064
-rw-r--r--  1 ericbragas  staff  510394 Nov 16 10:58 all_data.csv
-rw-r--r--  1 ericbragas  staff  382776 Nov 16 11:24 train_data.csv
-rw-r--r--  1 ericbragas  staff  127618 Nov 16 11:24 valid_data.csv


In [103]:
!head ./csv/train_data.csv

gs://reliable-realm-222318-vcm/satellite_imgs/no_ship/0__20170724_181036_1011__-122.3396918637242_37.76771252868267.png,no_ship
gs://reliable-realm-222318-vcm/satellite_imgs/ship/1__20170921_181406_1031__-122.3378493393171_37.73749919914582.png,ship
gs://reliable-realm-222318-vcm/satellite_imgs/no_ship/0__20180712_180429_101b__-118.06740213868301_33.67264908661255.png,no_ship
gs://reliable-realm-222318-vcm/satellite_imgs/no_ship/0__20170821_175214_101d__-118.28264453650422_33.737050666114996.png,no_ship
gs://reliable-realm-222318-vcm/satellite_imgs/no_ship/0__20161218_180846_0e26__-122.38098418888359_37.66432089876209.png,no_ship
gs://reliable-realm-222318-vcm/satellite_imgs/no_ship/0__20170730_181513_0e20__-122.33130488949227_37.74965703015818.png,no_ship
gs://reliable-realm-222318-vcm/satellite_imgs/no_ship/0__20170722_181118_101f__-122.43587377363853_37.75544959006133.png,no_ship
gs://reliable-realm-222318-vcm/satellite_imgs/no_ship/0__20171207_181550_103c__-122.2040404055541_37.829

In [104]:
!head csv/valid_data.csv

gs://reliable-realm-222318-vcm/satellite_imgs/no_ship/0__20180705_213444_0f02__-122.33424070486853_37.72604117552472.png,no_ship
gs://reliable-realm-222318-vcm/satellite_imgs/no_ship/0__20170616_180825_100e__-122.3366245042188_37.760286524892685.png,no_ship
gs://reliable-realm-222318-vcm/satellite_imgs/ship/1__20170702_181120_103a__-122.34307250130168_37.7205205808024.png,ship
gs://reliable-realm-222318-vcm/satellite_imgs/ship/1__20171118_185722_0f2d__-122.33606293980446_37.7583365292858.png,ship
gs://reliable-realm-222318-vcm/satellite_imgs/no_ship/0__20171118_181532_1030__-122.33738353565393_37.7362798061611.png,no_ship
gs://reliable-realm-222318-vcm/satellite_imgs/no_ship/0__20170709_181333_0e0e__-122.16539336184846_37.671050191042916.png,no_ship
gs://reliable-realm-222318-vcm/satellite_imgs/no_ship/0__20170708_180811_100c__-122.01097475696292_37.65679639945837.png,no_ship
gs://reliable-realm-222318-vcm/satellite_imgs/no_ship/0__20170730_181043_103d__-122.34541210935261_37.760227505

In [110]:
%%bash
gsutil ls gs://reliable-realm-222318-vcm/satellite_imgs/csv/

gs://reliable-realm-222318-vcm/satellite_imgs/csv/all_data.csv


In [111]:
%%bash
gsutil rm gs://reliable-realm-222318-vcm/satellite_imgs/csv/train_data.csv
gsutil rm gs://reliable-realm-222318-vcm/satellite_imgs/csv/valid_data.csv

CommandException: No URLs matched: gs://reliable-realm-222318-vcm/satellite_imgs/csv/train_data.csv
CommandException: No URLs matched: gs://reliable-realm-222318-vcm/satellite_imgs/csv/valid_data.csv


In [114]:
%%bash
gsutil cp ./csv/train_data.csv gs://reliable-realm-222318-vcm/satellite_imgs/csv/
gsutil cp ./csv/valid_data.csv gs://reliable-realm-222318-vcm/satellite_imgs/csv/
    
gsutil ls gs://reliable-realm-222318-vcm/satellite_imgs/csv/

gs://reliable-realm-222318-vcm/satellite_imgs/csv/all_data.csv
gs://reliable-realm-222318-vcm/satellite_imgs/csv/train_data.csv
gs://reliable-realm-222318-vcm/satellite_imgs/csv/valid_data.csv


Copying file://./csv/train_data.csv [Content-Type=text/csv]...
/ [1 files][373.8 KiB/373.8 KiB]                                                
Operation completed over 1 objects/373.8 KiB.                                    
Copying file://./csv/valid_data.csv [Content-Type=text/csv]...
/ [1 files][124.6 KiB/124.6 KiB]                                                
Operation completed over 1 objects/124.6 KiB.                                    
