## Using the Pandas Sample Method 

In [1]:
##import libraries 

#dataprep
import pandas as pd 
from sklearn.model_selection import train_test_split
import numpy as np 

##keras stuff 
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras.preprocessing.image import ImageDataGenerator

#saving prediction output 
import pickle

#graphing
import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
image_df = pd.read_csv('./data_multi_label/full_image_df.csv')
image_df = image_df.drop(columns='Unnamed: 0')
image_df.image_saved_as = image_df.image_saved_as.apply(lambda x: x +'.png')

print (image_df.shape)

image_df.head()

(2313, 17)


Unnamed: 0,AKA,AKE,AKH,AKY,ALC,AMN,BNZ,COC,COH,COO,COONH2,KEY,OH,SHH,cid,image_saved_as,ring
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,13156,3_bromo_2_butanone_13156.png,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,62791,3_bromo_2_butanone_62791.png,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,13142,3_bromo_2_butanone_13142.png,0.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,12413224,3_bromo_2_butanone_12413224.png,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,123858546,3_bromo_2_butanone_123858546.png,0.0


In [3]:
train = image_df.sample(frac = 0.80, random_state=1) # sampling with out replacement is the default

print (train.shape)

train.head()

(1850, 17)


Unnamed: 0,AKA,AKE,AKH,AKY,ALC,AMN,BNZ,COC,COH,COO,COONH2,KEY,OH,SHH,cid,image_saved_as,ring
959,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59922317,2_aminopropane_59922317.png,0.0
47,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3822501,4_picolylamine_3822501.png,1.0
1123,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,15114725,3_trimethylsilyl_ethynylbenzaldehyde_15114725.png,1.0
553,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,57616908,lysine_57616908.png,0.0
996,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,87701396,2_aminopropane_87701396.png,0.0


In [4]:
#Set a mask equal to the cid's which are in the training set that we made using pandas.sample 
mask = train['cid']

#for loop did not like the pandas.series, so I converted to a list 
mask = list(mask)


#this for loop checks the full dataframe (image_df) cid's against the ones in train
index_to_drop = []
for index, entry in enumerate(image_df['cid']):
#     print (entry)
    if entry in mask:
        index_to_drop.append(index)


# build the test dataframe by dropping the indicies which are not overlapping 
test = image_df.drop(labels=index_to_drop, axis=0)

print (test.shape)
print ('--'*25)
print (train.shape)
print ('--'*25)
print (image_df.shape)

test.head()

(463, 17)
--------------------------------------------------
(1850, 17)
--------------------------------------------------
(2313, 17)


Unnamed: 0,AKA,AKE,AKH,AKY,ALC,AMN,BNZ,COC,COH,COO,COONH2,KEY,OH,SHH,cid,image_saved_as,ring
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,123858546,3_bromo_2_butanone_123858546.png,0.0
7,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,14692245,3_bromo_2_butanone_14692245.png,0.0
15,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12238842,4_picolylamine_12238842.png,1.0
20,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12425236,4_picolylamine_12425236.png,1.0
25,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,87523068,4_picolylamine_87523068.png,1.0


## Creating the ImageDataGenerator

### Must define the labels and the image location first 

In [5]:
# We are going to look at a subset of features first 
features = ['AKA', 'AKE', 'ring', 'BNZ', 'image_saved_as']

#apply those feature constraints to the dataframes 
train_features = train[features]
test_features = test[features]

train_features.head()

Unnamed: 0,AKA,AKE,ring,BNZ,image_saved_as
959,1.0,0.0,0.0,0.0,2_aminopropane_59922317.png
47,1.0,0.0,1.0,1.0,4_picolylamine_3822501.png
1123,1.0,0.0,1.0,1.0,3_trimethylsilyl_ethynylbenzaldehyde_15114725.png
553,1.0,0.0,0.0,0.0,lysine_57616908.png
996,1.0,0.0,0.0,0.0,2_aminopropane_87701396.png


In [6]:
#Conver the float into into ints
train_features[["AKA", "AKE", "ring", "BNZ"]] = train_features[["AKA", "AKE", "ring", "BNZ"]].astype(int)
test_features[["AKA", "AKE", "ring", "BNZ"]] = test_features[["AKA", "AKE", "ring", "BNZ"]].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [7]:
train_features.head()

Unnamed: 0,AKA,AKE,ring,BNZ,image_saved_as
959,1,0,0,0,2_aminopropane_59922317.png
47,1,0,1,1,4_picolylamine_3822501.png
1123,1,0,1,1,3_trimethylsilyl_ethynylbenzaldehyde_15114725.png
553,1,0,0,0,lysine_57616908.png
996,1,0,0,0,2_aminopropane_87701396.png


In [8]:
test_features.head()

Unnamed: 0,AKA,AKE,ring,BNZ,image_saved_as
4,1,0,0,0,3_bromo_2_butanone_123858546.png
7,1,0,0,0,3_bromo_2_butanone_14692245.png
15,1,0,1,1,4_picolylamine_12238842.png
20,1,0,1,1,4_picolylamine_12425236.png
25,1,0,1,1,4_picolylamine_87523068.png


In [9]:
#setting up the label column 
train_features['label'] = train_features[train_features.columns[:-1]].apply(lambda x: ','.join(x.astype('str')), axis =1)

test_features['label'] = test_features[test_features.columns[:-1]].apply(lambda x: ','.join(x.astype('str')), axis =1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [10]:
train_features.head()

Unnamed: 0,AKA,AKE,ring,BNZ,image_saved_as,label
959,1,0,0,0,2_aminopropane_59922317.png,1000
47,1,0,1,1,4_picolylamine_3822501.png,1011
1123,1,0,1,1,3_trimethylsilyl_ethynylbenzaldehyde_15114725.png,1011
553,1,0,0,0,lysine_57616908.png,1000
996,1,0,0,0,2_aminopropane_87701396.png,1000


In [15]:
test_features.head()

Unnamed: 0,AKA,AKE,ring,BNZ,image_saved_as,label
4,1,0,0,0,3_bromo_2_butanone_123858546.png,1000
7,1,0,0,0,3_bromo_2_butanone_14692245.png,1000
15,1,0,1,1,4_picolylamine_12238842.png,1011
20,1,0,1,1,4_picolylamine_12425236.png,1011
25,1,0,1,1,4_picolylamine_87523068.png,1011


In [12]:
train_features.label[0]

'1,0,0,0'

KeyError: 0