# [KIRIN-728](https://jira.ssc.lmco.com:7443/browse/KIRIN-728): Distribution Plotting of Features

---

Create distribution plots of the features. Based on the results from ~~[KIRIN-716](https://jira.ssc.lmco.com:7443/browse/KIRIN-716)~~, there seem to be some files whose features behave oddly. We may also want to split the plots between CoA and no-CoA data.

## Extract Data

In [1]:
import glob
# Collect every HDF5 feature file. glob returns paths in a file-system-dependent
# order, so sort them to make the positional batches (and their "start-end"
# labels) reproducible between runs.
data_directory = sorted(glob.glob("/domino/datasets/unified-hdf-feature-data/*.hdf5"))

## Organize data
From the `/unified-hdf-feature-data` directory, split the files into batches of `batch_size` files each, using at most `file_limit` files in total.

In [2]:
# import pandas as pd

# Change these.................................................
batch_size = 5000 # files per batch; a batch size of 1000 took too long and was unreliable on Domino
file_limit = 10000 # only files in data_directory[:file_limit] are considered

print("Number of Files Loaded:")
batchs = []

# Start offsets of every full batch_size-wide window that fits inside file_limit.
# A trailing partial window (file_limit not a multiple of batch_size) is not loaded.
window_starts = range(0, file_limit - batch_size + 1, batch_size)

for batch_number, start in enumerate(window_starts, start=1):
    current_position = start + batch_size
    batch_files = data_directory[start:current_position] # ex. data_directory[0:5000]

    # Fewer files on disk than file_limit: stop instead of recording empty batches,
    # which would make pd.concat fail when the batches are loaded.
    if not batch_files:
        break

    # Keep the batch1, batch2, ... module-level names available for any cell that
    # refers to them directly; `batchs` is the list the following cells iterate over.
    globals()[f"batch{batch_number}"] = batch_files
    batchs.append(batch_files) # ex. batchs[batch1, batch2]

    print(current_position, end=" ")

Number of Files Loaded:
5000 10000 

## Clean/Prepare Data
1. Combine the files of each batch in `batchs` into one DataFrame
2. Add a "Files" column holding the file-index range of the batch (e.g. `0-5000`)
3. Add all batch DataFrames to a list for plotting
4. Combine them into a single DataFrame

In [None]:
import pandas as pd

print("Batchs Loaded:")
start = 0
all_batchs = []
for batch in batchs:
    # Label for this batch: the positional range of its files, ex. "0-5000"
    file_range = f"{start}-{start + batch_size}"
    start += batch_size

    # pd.concat raises ValueError on an empty sequence, so skip batches without files
    if not batch:
        continue

    # combine all files of the batch into 1 DF
    batch_df = pd.concat(map(pd.read_hdf, batch))

    # add "Files" as the first column so plots can be grouped by batch
    batch_df.insert(0, "Files", file_range, True)

    # keep the per-batch DFs in a list for plotting
    all_batchs.append(batch_df)

    print(file_range, end=" ")

if not all_batchs:
    raise ValueError("No batches were loaded; check data_directory, batch_size and file_limit")

# combine into a single DF with a fresh 0..n-1 index
df = pd.concat(all_batchs, ignore_index=True)

Batchs Loaded:


## Plotting

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Plot Settings
plt.rcParams["figure.figsize"]=12,8
sns.set(style="whitegrid")
path="/mnt/kirin_ml/unified_nn_training/feature_plot_graphs"

# savefig does not create missing directories, so make sure the target exists
os.makedirs(path, exist_ok=True)

print("Features loaded:")
columns = list(df)
# Skip the first and last columns: "Files" is inserted at position 0;
# NOTE(review): this assumes "coa" is the last column of the HDF data - confirm.
for column in columns[1:-1]: # exclude [Files] & [coa]
    # Use a fresh figure per feature. Drawing every kdeplot on the implicit current
    # axes would stack each feature on top of the previous ones in the saved image.
    fig, ax = plt.subplots()
    sns.kdeplot(data=df, x=column, hue="Files", bw_adjust=.9, cut=0, palette="tab10", linewidth=3, fill=True, alpha=.1, legend=True, ax=ax)

    # save plot
    name = f'{column}.png'
    destination = os.path.join(path, name)
    fig.savefig(destination)

    # release the figure so memory does not grow with the number of features
    plt.close(fig)

    print(f"{column} ", end="")

### TODO: Make separate DF and distribution plots for no-COA (coa=0)
- Remove coa=0 rows from column then rerun!

In [None]:
# Identify Files with Unique COAs
# [print(file, pd.read_hdf(file)["coa"].unique()) for file in files[0:11]]