In [4]:
import pandas as pd
import numpy as np
import glob
import matplotlib
import matplotlib.pyplot as plt

In [5]:
# Dataset folder names to process; each folder is searched for the CSV images of that dataset
folders = [
    "test",
]

In [6]:
# Load every CSV image of every dataset folder.
# Result: sets_list[d][i] is the DataFrame holding image i of dataset d.
file_counter = 0
sets_list = []
for dataset in folders:
    # glob returns paths in arbitrary, platform-dependent order. Sorting them
    # (lexicographically) pins the image order, so that row i of the exported
    # CSV files refers to the same source file on every run and every machine.
    csv_paths = sorted(glob.glob(str(dataset) + '/' + '*.csv'))

    # One DataFrame per image of this dataset
    matrix_list = []
    for file in csv_paths:
        file_counter += 1
        # Read without a header so that columns are labelled 0..n-1, then
        # discard the first row, which holds the labels. The remaining rows
        # keep their original index, which therefore starts at 1.
        data = pd.read_csv(file, header = None)
        matrix_list.append(data.drop(labels = 0, axis = 0))
    sets_list.append(matrix_list)

# Total number of CSV files read across all datasets
print(file_counter)

270


In [7]:
# A value counts as a peak when it is strictly greater than this percentile of its own image
# (in this case, a peak is > 85th percentile)
percentile = 85


def count_peaks(image, percentile):
    """Count the values of one image that exceed the image's own percentile.

    Parameters
    ----------
    image : pandas.DataFrame
        One image; every entry must be convertible to float.
    percentile : float
        Percentile (0-100) used as the peak threshold. NaNs are ignored when
        the threshold is computed.

    Returns
    -------
    int
        Number of entries strictly greater than the threshold. NaN entries
        are never counted.
    """
    # Work on the raw NumPy values. The images keep a row index that starts
    # at 1 (the label row was dropped on load), so DataFrame arithmetic
    # against a freshly built, 0-indexed frame would align on labels and
    # silently leave the last image row out of the count.
    values = image.astype(float).to_numpy()
    threshold = np.nanpercentile(values, percentile)
    return int(np.count_nonzero(values > threshold))


# all_peaks[d][i] is the number of peaks in image i of dataset d
all_peaks = []
for dataset in sets_list:
    num_peaks = [count_peaks(image, percentile) for image in dataset]
    all_peaks.append(num_peaks)

In [13]:
# Concatenate the peak counts of every dataset, in dataset order, into a single column.
# This works for any number of datasets instead of indexing each one by hand.
csv_data = [count for dataset_peaks in all_peaks for count in dataset_peaks]
df = pd.DataFrame(csv_data, columns = ["num_peaks"])
# Number the images from 1 instead of 0 in the exported file
df.index = np.arange(1, len(df) + 1)
df.to_csv("test_num_peaks.csv")

In [10]:
#Lists the statistics (max, min, mean, standard deviation) collected over the gradients of each image in each dataset
stats = [np.max, np.min, np.mean, np.std]


def image_gradient_stats(image, ops):
    """Summarise the column-to-column gradients of one image with every operation in ``ops``.

    For each column except the last, the column and the next column are
    stacked into a 2 x n_rows array, ``np.gradient`` is taken over both axes,
    and the absolute gradients are reduced to one number by each operation.
    The last column has no next column and contributes 0. The per-column
    numbers are then reduced once more with the same operation.

    NOTE(review): because a 0 is recorded for the last column and all other
    entries are absolute values, the ``np.min`` result is 0 for any image
    without NaNs, and the mean/std include that extra 0. Kept as is so that
    existing outputs do not change - confirm this is intended.

    Parameters
    ----------
    image : pandas.DataFrame
        One image whose columns are labelled 0..n_cols-1 and whose entries
        are convertible to float.
    ops : sequence of callables
        Reductions such as ``np.max``; each must accept an array or a list.

    Returns
    -------
    list
        One value per operation, in the order of ``ops``.
    """
    # Cast once per image instead of once per column and per statistic
    image_np = image.astype(float)
    last_column = image_np.shape[1] - 1

    # per_op_cols[k] collects the per-column values of operation k
    per_op_cols = [[] for _ in ops]
    for column in image_np:
        if column == last_column:
            # No next column to compare with: record a zero gradient
            for col in per_op_cols:
                col.append(0)
        else:
            # The gradient does not depend on the statistic, so compute it
            # once per column pair and reuse it for every operation
            grads = np.abs(np.gradient([image_np[column], image_np[column + 1]]))
            for col, op in zip(per_op_cols, ops):
                col.append(op(grads))
    return [op(col) for op, col in zip(ops, per_op_cols)]


# op_gradients[k][d][i] is statistic k of image i in dataset d
op_gradients = [[] for _ in stats]
#Iterates through every dataset
for dataset in sets_list:
    dataset_stats = [image_gradient_stats(image, stats) for image in dataset]
    #Regroups the per-image results so that each operation gets one list per dataset
    for op_index, gradient in enumerate(op_gradients):
        gradient.append([image_stats[op_index] for image_stats in dataset_stats])

In [12]:
def flatten_datasets(per_dataset):
    """Concatenate the per-image values of every dataset, in dataset order, into one flat list."""
    return [value for dataset_values in per_dataset for value in dataset_values]


# op_gradients holds one entry per statistic, in the order max, min, mean, std;
# each entry holds one list of per-image values per dataset. Flattening covers
# any number of datasets instead of indexing each one by hand.
csv_max = flatten_datasets(op_gradients[0])
csv_min = flatten_datasets(op_gradients[1])
csv_mean = flatten_datasets(op_gradients[2])
csv_std = flatten_datasets(op_gradients[3])

df_max = pd.DataFrame(csv_max, columns = ["gradient_max"])
df_min = pd.DataFrame(csv_min, columns = ["gradient_min"])
df_mean = pd.DataFrame(csv_mean, columns = ["gradient_mean"])
df_std = pd.DataFrame(csv_std, columns = ["gradient_stdev"])

# One row per image, one column per statistic
gradient_df = pd.concat([df_max, df_min, df_mean, df_std], axis = 1)
# Number the images from 1 instead of 0 in the exported file
gradient_df.index = np.arange(1, len(gradient_df) + 1)

gradient_df.to_csv("test_gradient.csv")