In [1]:
import pandas as pd
import numpy as np
import glob
import matplotlib
import matplotlib.pyplot as plt
import statistics

# Reads in Datasets from CSVs

In [2]:
#Names of the dataset folders to load: the three training sets first, then the three validation sets
folders = (
    ["high_train", "med_train", "low_train"]
    + ["high_validate", "med_validate", "low_validate"]
)

In [3]:
#Builds sets_list: one entry per name in `folders` (same order), each entry being a list
#of dataframes with one dataframe per csv file found in that dataset's folder
sets_list = []
#Iterates through dataset names to read in csvs from each dataset folder
for dataset in folders:
    #Creates an empty list to hold each image from this dataset
    matrix_list = []
    #glob returns matches in a filesystem-dependent order, so the paths are sorted to keep
    #the image order (and every per-image list computed from it) reproducible between runs
    csv_paths = sorted(glob.glob(str(dataset) + '/' + '*.csv'))
    for file in csv_paths:
        #read_csv already returns a dataframe; header = None keeps integer column labels (0, 1, 2, ...)
        data = pd.read_csv(file, header = None)
        #Drops the first row (index label 0), which contains labels rather than image values.
        #The index is not reset, so the remaining rows keep their labels and the index starts at 1.
        df = data.drop(labels = 0, axis = 0)
        #Adds the image dataframe to the list of images
        matrix_list.append(df)
    #Adds the list of images from this dataset to the list containing all datasets
    sets_list.append(matrix_list)

#Sets list is a list of lists of dataframes
#Sets list contains a list for each dataset which contains dataframes (each dataframe is one image)

# Calculates Basic Statistical Properties for Data

In [4]:
#Creates empty lists to contain stat values for all datasets
#Layout: all_<stat>[dataset][image] holds a pandas Series with one value per column of that image
all_mins = []
all_maxs = []
all_ranges = []
all_means = []
#Iterates through each dataset
for dataset in sets_list:
    mins = []
    maxs = []
    ranges = []
    means = []
    #Iterates through each image from each dataset
    for image in dataset:
        #Casts to float first, matching the peak and gradient cells, so the statistics are
        #numeric. NOTE(review): assumes every remaining cell of the image parses as a float.
        image_float = image.astype(float)
        #Column-wise minimum and maximum, computed once and reused for the range
        col_min = image_float.min()
        col_max = image_float.max()
        #Minimums go into mins and maximums into maxs (these were previously swapped)
        mins.append(col_min)
        maxs.append(col_max)
        ranges.append(abs(col_max - col_min))
        means.append(image_float.mean())
    #Adds each dataset's values into the overall lists so every dataset is consolidated
    all_mins.append(mins)
    all_maxs.append(maxs)
    all_ranges.append(ranges)
    all_means.append(means)

# Calculates Number of Peaks in Each Image Using a Percentile

In [5]:
#Sets a percentile value to dictate what is defined as a peak (in this case, a peak is > 85 percentile)
percentile = 85
#Layout: all_peaks[dataset][image] is the number of values in that image above its own percentile value
all_peaks = []
for dataset in sets_list:
    num_peaks = []
    for image in dataset:
        #Casts data from each image as a float to deal with type errors
        image_np = image.astype(float)
        #Calculates the percentile threshold for the image, ignoring NaN entries
        percentile_val = np.nanpercentile(image_np, percentile)

        #Counts the values strictly greater than the threshold by comparing the raw array to
        #the scalar. This is purely positional: subtracting a freshly built dataframe instead
        #aligns on index labels, and because the image index does not start at 0 the last row
        #of every image was turned into NaN and silently left out of the count.
        #NaN entries (and a NaN threshold) compare as False, so they never count as peaks.
        counter = int((image_np.to_numpy() > percentile_val).sum())

        num_peaks.append(counter)
    all_peaks.append(num_peaks)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


# Calculates Gradient Behavior in Each Image

In [6]:
#Creates a list so that we can collect max, min, mean gradients for each dataset (as well as stdev of gradients)
stats = [np.max, np.min, np.mean, np.std]
#Layout: op_gradients[operation][dataset][image] is a single summary value for that image,
#with operations ordered as in `stats`
op_gradients = []
#Iterates through every operation we want to perform on the gradients
for op in stats:
    gradient = []
    #Iterates through every dataset
    for dataset in sets_list:
        image_grads = []
        #Iterates through each image contained within each of the datasets
        for image in dataset:
            #Converts the whole image to a float array once to avoid type errors
            pixels = image.astype(float).to_numpy()
            n_cols = pixels.shape[1]
            col = []
            #Iterates through each column of pixel values by position
            for idx in range(n_cols):
                #The last column has no neighbour to its right, so its gradient is written as
                #zero (note that this placeholder takes part in the per-image statistic below)
                if idx == n_cols - 1:
                    col.append(0)
                else:
                    #Takes gradients of the current and next columns stacked together (over both axes)
                    #and reduces them with the current statistical operation
                    col.append(op(np.gradient([pixels[:, idx], pixels[:, idx + 1]])))
            #Reduces the per-column values to one value for this image
            image_grads.append(op(col))
        #Adds this dataset's per-image values (previously the last raw image was appended
        #here by mistake, discarding every computed gradient statistic)
        gradient.append(image_grads)
    #Adds all the dataset values to a list that contains one entry for each operation that was performed
    op_gradients.append(gradient)