In [None]:
import pandas as pd
import numpy as np
import sklearn
import tensorflow as tf
import re
from os import listdir
from os.path import isfile, join

from rainforest_functions import vectorize_categories
from rainforest_functions import softmax_mine


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Single StandardScaler instance shared by the whole notebook. Later cells call
# fit_transform on it, so it always holds the statistics of whichever data set
# was fitted most recently.
scaler = StandardScaler()

In [None]:
def weight_variable(shape):
    """Return a tf.Variable of the given shape, initialised from a truncated
    normal distribution with standard deviation 0.1."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
    """Return a tf.Variable of the given shape with every entry set to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))
def conv2d(x, W):
    """Convolve x with the filter bank W using stride 1 in every dimension and
    'SAME' padding."""
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')

def max_pool_2x2(x):
    """Max-pool x over 2x2 windows with stride 2 and 'SAME' padding."""
    return tf.nn.max_pool(
        x,
        ksize=[1, 2, 2, 1],
        strides=[1, 2, 2, 1],
        padding='SAME',
    )

In [None]:
# Directory prefix for the pixel CSV files read below (pixels_G.csv,
# pixels_test.csv and the all_rotated/ sub-directory).
# NOTE(review): this is an absolute path on one machine; it has to be edited
# before the notebook can run anywhere else.
mypath = '/Users/AnaSolaguren-Beascoa/Pictures/train-jpg/'

In [None]:
# Load the training data as it is before the rotated images are added.

# Image names and their space-separated tag strings.
image_name_df = pd.read_csv('train_v2.csv')

# Every distinct tag that occurs in the 'tags' column.
categories = (
    image_name_df['tags']
    .str.split(' ', expand=True)
    .stack()
    .unique()
)

# Green-channel pixels. The column whose header is the string form of the last
# column position is renamed to 'image_name' so it can serve as the merge key.
pixel_df = pd.read_csv(mypath+'pixels_G.csv')
pixel_df = pixel_df.rename(
    columns={str(pixel_df.shape[1]-1): 'image_name'}
)

# Attach the tag string to each pixel row.
result = pd.merge(pixel_df, image_name_df, on='image_name')
del pixel_df

# Tag strings of the training rows (consumed by vectorize_categories below).
y_onefile = result['tags']
# Pixel columns of the training rows.
X_G = result.drop(['tags','image_name'],axis=1)
del result

# Turn the tag strings into one column per category.
y_G = vectorize_categories(categories, y_onefile)
y_G.columns = categories
del y_onefile


print('END')


In [None]:
# Load the test data.

# Test pixels. The column whose header is the string form of the last column
# position is renamed to 'image_name'.
pixel_df_test = pd.read_csv(mypath+'pixels_test.csv')
pixel_df_test = pixel_df_test.rename(
    columns={str(pixel_df_test.shape[1]-1): 'image_name'}
)

# File names of the test images, in row order.
test_image_name = pixel_df_test['image_name']

# Standardised test pixels.
# NOTE(review): the scaler is fitted on the test pixels themselves here, so the
# test set is not scaled with the statistics of the training data.
X_G_test = pd.DataFrame(
    scaler.fit_transform(pixel_df_test.drop(['image_name'],axis=1))
)
del pixel_df_test


print('END')


In [None]:
# Train one binary CNN per category and use it to label the test images.
#
# For every '*G.csv' file in all_rotated/ the rows of that file are appended to
# the original training data, a two-class (True/False) network is trained for
# the category named in the file name, and the resulting predictions for
# X_G_test are stored in the matching column of y_final_df.

#Read the rotated files which symmetrise our data
rotated_dir = mypath+'all_rotated/'
rotated_files = [f for f in listdir(rotated_dir) if isfile(join(rotated_dir, f))]
num_files = len(rotated_files)

# Fraction of the rows held back for evaluating the model. It is only needed
# while testing how good the model is; 0 trains on everything, which is the
# setting for calculating the final result.
test_size = 0

# Hyper-parameters shared by every per-category model. The learning rate is
# constant: the optimizer receives it as a plain Python float, so reassigning
# the variable during training (as an earlier version did) never reached it.
learning_rate = 0.000003
batchsize = 100

#Final DataFrame with the predicted categories for the test data
y_final_df = pd.DataFrame(columns=list(categories))

#list of all the categories by order of appearance
category_sess = []

# Session of the most recently trained model. It is closed before the next
# model is built; the last one stays open after the loop.
sess = None

for rotated_file in rotated_files:

    #only the rotated pictures for green color are used
    if not rotated_file.endswith('G.csv'):
        continue

    # Work out which category the file belongs to. Category names can contain
    # one another (e.g. 'cloudy' inside 'partly_cloudy'), so the longest name
    # found in the file name wins rather than the first one.
    matching = [cat for cat in categories if cat in rotated_file]
    if not matching:
        print('no category found in ' + rotated_file + ', file skipped')
        continue
    current_category = max(matching, key=len)
    cat_type = current_category  # older name, kept in case later cells use it

    pixel_df = pd.read_csv(rotated_dir + rotated_file)
    X_onefile = pixel_df.drop(categories.tolist(), axis=1)
    y_vectorised_one = pixel_df[categories.tolist()]
    del pixel_df

    print(rotated_file + 'is read')
    print('category is ' + current_category)

    # Original training rows followed by the rotated rows of this file.
    X_data = pd.concat([X_G, X_onefile], ignore_index=True)
    y_pre = pd.concat(
        [y_G[current_category], y_vectorised_one[current_category]],
        ignore_index=True,
    )

    #two columns for the output, 'True' contains the true value and 'False'
    #its complement
    y_true = y_pre.astype(int)
    y_data = pd.DataFrame(
        {'True': y_true, 'False': 1 - y_true}, columns=['True', 'False']
    )

    # Divide the data into test and training; this is done for each category.
    if test_size:
        X_train, X_test, y_train, y_test = train_test_split(
            X_data, y_data, test_size=test_size
        )
    else:
        # train_test_split would only shuffle here (and newer scikit-learn
        # versions reject test_size=0), so shuffle directly and keep an empty
        # hold-out set.
        shuffled = np.random.permutation(len(X_data))
        X_train, y_train = X_data.iloc[shuffled], y_data.iloc[shuffled]
        X_test, y_test = X_data.iloc[:0], y_data.iloc[:0]

    #scale the data: fit on the training rows only and reuse those statistics
    #for the hold-out rows (nothing to transform when the hold-out is empty)
    X_train = pd.DataFrame(scaler.fit_transform(X_train))
    if len(X_test):
        X_test = pd.DataFrame(scaler.transform(X_test))

    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    category_sess.append(current_category)

    #Define our neural network

    # Start from an empty graph so the nodes of the previous category's model
    # do not pile up; the previous session must be closed before the reset.
    if sess is not None:
        sess.close()
    tf.reset_default_graph()

    lenx = X_data.shape[1]
    leny = y_data.shape[1]

    # Each row is reshaped into a square single-channel image.
    side = int(round(np.sqrt(lenx)))
    if side * side != lenx:
        raise ValueError(
            'expected a square number of pixel columns, got %d' % lenx
        )
    # Two 2x2 poolings with SAME padding give ceil(side / 4) per dimension.
    pooled_side = (side + 3) // 4
    flat_size = pooled_side * pooled_side * 64

    x = tf.placeholder(tf.float32, shape=[None, lenx])
    y_ = tf.placeholder(tf.float32, shape=[None, leny])

    x_image = tf.reshape(x, [-1, side, side, 1])

    #layer 1
    W_conv1 = weight_variable([5, 5, 1, 32])
    b_conv1 = bias_variable([32])

    h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
    h_pool1 = max_pool_2x2(h_conv1)

    #layer 2
    W_conv2 = weight_variable([20, 20, 32, 64])
    b_conv2 = bias_variable([64])

    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
    h_pool2 = max_pool_2x2(h_conv2)

    #layer 3
    W_fc1 = weight_variable([flat_size, 1024])
    b_fc1 = bias_variable([1024])

    h_pool2_flat = tf.reshape(h_pool2, [-1, flat_size])
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

    #layer 4
    keep_prob = tf.placeholder(tf.float32)
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

    #layer 5
    W_fc2 = weight_variable([1024, leny])
    b_fc2 = bias_variable([leny])

    y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2

    #output
    y = y_conv
    # Class probabilities; built once here so that prediction does not add a
    # new node to the graph for every test row.
    y_prob = tf.nn.softmax(y_conv)

    #TRAIN OUR NEURAL NETWORK

    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv)
    )
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(cross_entropy)

    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # One pass over the training rows in full batches; a trailing partial
    # batch is not used.
    steps = int(X_train.shape[0]/batchsize)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    print('Now the NN is being trained')

    for k in range(steps):
        i = k * batchsize
        batch_xs = X_train.iloc[i:i+batchsize, :].values.astype(np.float32)
        batch_ys = y_train.iloc[i:i+batchsize].values.astype(np.float32)
        if k % 10 == 0:
            train_accuracy = sess.run(
                accuracy,
                feed_dict={x: batch_xs, y_: batch_ys, keep_prob: 1.0},
            )
            print('step %d, training accuracy %g' % (k, train_accuracy))

        sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys, keep_prob: 0.5})

    #CALCULATE THE RESULT FOR THE TEST DATA

    # Every test row is predicted, including the last one. Column 0 of the
    # output corresponds to 'True'; a row is labelled 1 when that probability
    # is at least 0.5, which matches the argmax used for the accuracy above.
    # NOTE(review): X_G_test was standardised with statistics fitted on the
    # test set itself when it was loaded, not with the training statistics.
    y_final = []
    for start in range(0, len(X_G_test), batchsize):
        batch_test = X_G_test.iloc[start:start+batchsize].values.astype(np.float32)
        probabilities = sess.run(y_prob, feed_dict={x: batch_test, keep_prob: 1.0})
        y_final.extend((probabilities[:, 0] >= 0.5).astype(int).tolist())

    y_final_df[current_category] = y_final

    #next file
   