# Model Inference Comparison & Validation Script

## Sample argument input

**`python mc&v.py --inference_score_dir --canonical_json --model_index_json --compare_video --compare_label`**

In [None]:
import csv, json, glob, os, pandas as pd, objdict, ast
from pandas.io.json import json_normalize

# Movie to validate; used further down to filter the canonical validation
# file on its 'movie' field.
compare_video = 'Deadpool1'
# Label of interest for the comparison (not referenced by any other cell
# shown in this notebook).
compare_label = 'gunshot'
# Master comparison table. It starts with only a VideoId column; the indexing
# loop adds one row per video and one column per model/label field.
compareDF = pd.DataFrame(columns=['VideoId'])
# Starts at -1 because the indexing loop increments it before use, so the
# first file read becomes Model_0.
modelCount = -1

This model comparison script takes a directory of JSON files, each of which has been normalized to the format below:

## Inference Score Confidence Json
``` json
{
  "VideoId": "some-identifier.wav",
  "Label_Data": [
    {
      "label_0": "427",
      "labelConf_0": "0.4131"
    },
    {
      "label_1": "213",
      "labelConf_1": "0.3121"
    },
    {
      "label_2": "0",
      "labelConf_2": "0.2421"
    }
  ]
}
```

In [None]:
# Build the master comparison table from a directory of normalized inference
# JSON files. Each file is treated as the output of one model and is assigned
# the next Model_<n> index. Every key found in a row's 'Label_Data' becomes a
# "Model<n>_<key>" column, and each distinct VideoId ends up with one row.
#
# sorted() keeps the file -> Model_<n> assignment reproducible between runs;
# glob.glob() returns paths in arbitrary, filesystem-dependent order.
for jsonfile in sorted(glob.glob('NormalizedInferenceScores/*json')):
    jsonDF = pd.read_json(jsonfile)
    modelCount += 1
    print(str(jsonfile) +" has been indexed at: Model_"+ str(modelCount) )
    # Iterate through the rows and reformat based on key/value pairs
    for index, row in jsonDF.iterrows():
        jsonRow = row['Label_Data']

        # Test for an exact VideoId match. A str.contains() check is a regex
        # substring search: '.' acts as a wildcard and an id that merely
        # contains this one also matches, which sent new videos down the
        # update path where the exact-match row lookup then found nothing.
        isKnownVideo = compareDF['VideoId'] == row['VideoId']

        if isKnownVideo.any():
            # VideoId already has a row: write this model's values into it.
            if index == 0:
                print("Audio Inference file contained preexisting VideoIds, Updating compare dataframe with values from Model_" +str(modelCount))
            indexWrite = compareDF[isKnownVideo].index.values.astype(int)[0]
            # For all label and confidence pairs, add as a new column
            for field in jsonRow:
                iterativeLabel = "Model"+str(modelCount)+"_"+field
                if index == 0:
                    # First row of the file: create the column up front,
                    # filled with 0.0 for the rows that already exist.
                    compareDF = compareDF.reindex(columns=[*compareDF.columns.tolist(), iterativeLabel], fill_value=0.0)
                compareDF.loc[indexWrite, iterativeLabel] = jsonRow[field]
        else:
            # VideoId not seen before: assemble a one-row frame and append it.
            if index == 0:
                print("New Audio Inference file detected with new videos, Adding new VideoIDs from Model_" +str(modelCount))
            convertedRow = pd.DataFrame(columns=['VideoId'])
            for field in jsonRow:
                # Create columns for the new model
                iterativeLabel = "Model"+str(modelCount)+"_"+field
                if index == 0:
                    compareDF = compareDF.reindex(columns=[*compareDF.columns.tolist(), iterativeLabel], fill_value=0.0)
                convertedRow.loc[0, iterativeLabel] = jsonRow[field]
            convertedRow.loc[0, 'VideoId'] = row['VideoId']
            # DataFrame.append() was removed in pandas 2.0; pd.concat() with
            # the same ignore_index/sort options is the supported equivalent.
            compareDF = pd.concat([compareDF, convertedRow], ignore_index=True, sort=True)

# Strip a trailing ".wav" so ids line up with the canonical VideoIds. The dot
# is escaped so only a literal ".wav" suffix is removed, and regex=True is
# explicit because pandas 2.0 changed the default to a literal replacement.
compareDF['VideoId'] = compareDF['VideoId'].str.replace(r'\.wav$', '', regex=True)
compareDF = compareDF.set_index(['VideoId'])
    

In [None]:
# Display the assembled comparison table (indexed by VideoId).
compareDF

In [None]:
# Spot-check: show only the row(s) whose VideoId index equals this frame id.
compareDF.loc[compareDF.index.isin(['deadpool1_01-00-20.000'])]

In [None]:
# Write the comparison table to compareDF.csv (relative path, so it lands in
# the current working directory); the VideoId index is written as a column.
compareDF.to_csv('compareDF.csv')

## Canonical Video Validation Json

Next, we use a canonical validation JSON file that stores an array of string labels for each audio frame. In this example a frame is a 10-second interval, and each VideoId identifies one frame.

``` json
[
  {
    "movie": "Deadpool1",
    "data": [
      {
        "Label_Array": ["gunshot", "speech"],
        "VideoId": "deadpool1_00-07-50.000"
      },
      {
        "Label_Array": ["gunshot"],
        "VideoId": "deadpool1_00-13-00.000"
      },
      {
        "Label_Array": ["gunshot"],
        "VideoId": "deadpool1_00-11-00.000"
      }
    ]
  },
  {
    "movie": "Deadpool2",
    "data": [
      {
        "Label_Array": ["gunshot"],
        "VideoId": "deadpool2_00-02-00.000"
      },
      {
        "Label_Array": ["gunshot"],
        "VideoId": "deadpool2_00-03-20.000"
      },
      {
        "Label_Array": ["gunshot"],
        "VideoId": "deadpool2_00-04-20.000"
      },
      {
        "Label_Array": ["gunshot"],
        "VideoId": "deadpool2_00-33-10.000"
      }
    ]
  }
]

```

In [None]:
# Sample canonical validation file, in the format shown above.
# NOTE: jsonDf (lowercase f) is a different variable from the jsonDF used in
# the inference-score loop; the names differ only by case.
jsonDf = pd.read_json("canonicalTest1.json")

In [None]:
# Display the loaded canonical validation data.
jsonDf

In [None]:
# Keep only the canonical entries for the movie under comparison.
# NOTE(review): str.contains() treats compare_video as a regular expression
# and matches substrings, so 'Deadpool1' would also select a title such as
# 'Deadpool10' -- confirm whether an exact match is intended.
movieDf = jsonDf[jsonDf['movie'].str.contains(compare_video)]

In [None]:
# Display the canonical entries selected for compare_video.
movieDf

In [None]:
# Flatten the selected movie's nested 'data' records into one row per record,
# carrying the remaining movie-level columns (e.g. 'movie') along on each row.
if movieDf.empty:
    # pd.concat() cannot combine zero frames; fail with an actionable message.
    raise ValueError("No canonical entries found for compare_video=" + repr(compare_video))

# Read the 'data' column without removing it from movieDf, so this cell can be
# re-run without first rebuilding movieDf.
canonicalDf = (pd.concat({i: json_normalize(records) for i, records in movieDf['data'].items()})
         # Drop the per-record position level, keeping the source row label
         # so the join below can line up with movieDf.
         .reset_index(level=1, drop=True)
         .join(movieDf.drop(columns='data'))
         .reset_index(drop=True))
canonicalDf

## Model Index Json

Finally, we use a model index JSON that stores each model's index labels, based on a knowledge graph, for scoring the accuracy of the model.
``` json
[
  {
    "model": "AudioSet",
    "model_index": [
      {
        "model_label_str": "gunshot",
        "model_label_vals": ["427", "428", "429", "430"]
      },
      {
        "model_label_str": "explosion",
        "model_label_vals": ["426"]
      }
    ]
  },
  {
    "model": "KerasGunshot_CustomModel1",
    "model_index": [
      {
        "model_label_str": "gunshot",
        "model_label_vals": ["gunshot"]
      },
      {
        "model_label_str": "other",
        "model_label_vals": ["other"]
      },
      {
        "model_label_str": "pred_label",
        "model_label_vals": ["pred_label"]
      }
    ]
  }
]

```

In [None]:
# Load the model index JSON (format shown above) and display it.
modelIndexDf = pd.read_json("model_index.json")
modelIndexDf

In [None]:
# Walk the model index rows, binding each row's 'model_index' entry to
# normalizeDF2 in turn; once the loop finishes, only the last row's entry
# remains bound.
for index, row in modelIndexDf.iterrows():
    normalizeDF2 = row['model_index']

In [None]:
# Flatten each model's nested 'model_index' list into one row per label
# mapping, carrying the remaining model-level columns (e.g. 'model') along.
# The 'model_index' column is read without being removed from modelIndexDf,
# so this cell (and the one before it) can be re-run without reloading the
# model index file.
modelMapDf = (pd.concat({i: json_normalize(entries) for i, entries in modelIndexDf['model_index'].items()})
         # Drop the per-entry position level, keeping the source row label
         # so the join below can line up with modelIndexDf.
         .reset_index(level=1, drop=True)
         .join(modelIndexDf.drop(columns='model_index'))
         .reset_index(drop=True))
modelMapDf

## Future features:

- Move model index to directory base
- Persisted Dataframes
- Validation Cells for Gunshot (Inputs for Validation)
- Migrate to a script