# Installations

In [None]:
# Install the third-party packages used by this notebook (shell commands run via `!`).
# NOTE(review): no versions are pinned, so each fresh runtime installs whatever is current.
!pip install bokeh
!pip install ipywidgets
!pip install matplotlib
!pip install ggplot
!pip install google-cloud-storage



# General Imports

## Run once to load the relevant packages for data manipulation

In [None]:
# Google - Clab + Storage
from google.colab import drive
from google.colab import files
from google.cloud import storage
# Dynamic widgets and UI
from ipywidgets import interact, interactive, fixed, interact_manual, FileUpload
import ipywidgets as widgets
from IPython.display import display
# Native Python packages 
import os 
import io
import time
import json
# Data manipulation and calculation packages
import pandas as pd
import numpy as np
### Visualization
# Matplotlib
import matplotlib.pyplot as plt
# Bokeh
import bokeh.plotting
from bokeh.plotting import figure, show
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource, CDSView, GroupFilter
from bokeh.io import output_notebook
# Call once to configure Bokeh to display plots inline in the notebook.
output_notebook()

# this will allow the notebook to reload/refresh automatically within the runtime
%reload_ext autoreload
%autoreload 2

# General Functions helpers


In [None]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

In [None]:
# Based on - https://gist.github.com/sainathadapa/eb3303975196d15c73bac5b92d8a210f
def anti_join(x, y, left_on, right_on):
    """Return the rows of ``x`` whose key does not appear in ``y``.

    Args:
        x: DataFrame to filter.
        y: DataFrame holding the keys to exclude.
        left_on: Column name, or list of column names, forming the key in ``x``.
        right_on: Column name, or list of column names, forming the key in ``y``;
            must have the same length as ``left_on``.

    Returns:
        A DataFrame with exactly the columns of ``x`` (same names, same order),
        containing only the rows of ``x`` that have no matching key in ``y``.

    Raises:
        ValueError: if no key columns are given or the two key lists differ in length.
    """
    left_keys = [left_on] if isinstance(left_on, str) else list(left_on)
    right_keys = [right_on] if isinstance(right_on, str) else list(right_on)
    if not left_keys or len(left_keys) != len(right_keys):
        raise ValueError(
            "anti_join needs the same, non-zero number of key columns on both sides "
            "(got {} and {})".format(len(left_keys), len(right_keys))
        )

    # Only the key columns of y matter. Renaming them to private names keeps y's
    # columns from leaking into the result or forcing _x/_y suffixes onto x's columns.
    tmp_keys = ["__anti_join_key_{}__".format(i) for i in range(len(right_keys))]
    y_keys = y[right_keys].drop_duplicates()
    y_keys.columns = tmp_keys

    merged = pd.merge(left=x, right=y_keys, how='left', indicator=True,
                      left_on=left_keys, right_on=tmp_keys)
    unmatched = merged.loc[merged['_merge'] == 'left_only', :]
    return unmatched.drop(columns=tmp_keys + ['_merge'])


def anti_join_all_cols(x, y):
    """Return the rows of ``x`` that do not appear in ``y``, comparing every column.

    Both frames must have the same set of column names (order may differ).
    """
    assert set(x.columns.values) == set(y.columns.values), \
        "x and y must have the same columns"
    all_columns = x.columns.tolist()
    # The same column list is the key on both sides.
    return anti_join(x, y, left_on=all_columns, right_on=all_columns)

# Plotting functions

In [None]:
def create_plots(df, groupping_col_name, y_name, x_name, grid_features):
  """Build line plots of the mean of `y_name` over `x_name`, per group and feature-value pair.

  For every group of `groupping_col_name`, every unordered pair of distinct
  features in `grid_features`, and every combination of the values those two
  features take inside the group, one Bokeh figure is produced.

  Args:
    df: DataFrame holding all referenced columns.
    groupping_col_name: column whose values define the groups.
    y_name: column averaged for the y axis.
    x_name: column used for the x axis (the mean is taken per x value).
    grid_features: sequence of feature column names; it is not modified.

  Returns:
    A flat list of Bokeh figures (empty when fewer than two features are given).
  """
  print("Groupping by - ",groupping_col_name)
  print("y value by - ", y_name)
  print("x value by - ",x_name)
  print("feature matrix - ",grid_features)

  features = list(grid_features)
  plots = []

  for _, g in df.groupby(by=[groupping_col_name]):
    if len(g) == 0:
      continue
    group_id = groupping_col_name+"="+str(g[groupping_col_name].iloc[0])
    # Pair each feature only with the features listed after it. The list being
    # iterated is never mutated, so no pair is skipped.
    for position, feature_name in enumerate(features):
      later_features = features[position + 1:]
      feature_values = g[feature_name].unique()
      for f_val in feature_values:
        for other_feature_name in later_features:
          other_feature_values = g[other_feature_name].unique()
          for of_val in other_feature_values:
            title = group_id + "_" + feature_name + "=" + str(f_val) + "/" + other_feature_name + "=" + str(of_val)
            selector = (g[feature_name] == f_val) & (g[other_feature_name] == of_val)
            gg = g[selector]
            y = gg.groupby(by=[x_name])[y_name].mean().reset_index()
            raw_data_source = ColumnDataSource(y)
            # `width`/`height` replace the `plot_width`/`plot_height` keywords,
            # which current Bokeh releases no longer accept.
            p = figure(
              width=400, height=400,
              title=title,
              x_axis_label=x_name,
              y_axis_label=y_name
            )
            p.line(x=x_name, y=y_name, source=raw_data_source)
            plots.append(p)

  return plots

In [None]:
# General Plot of mean all data
def plot_general_avg(df, y_name, x_name):
  """Plot the mean of `y_name` for every value of `x_name` over the whole frame.

  Args:
    df: DataFrame holding both columns.
    y_name: column averaged for the y axis.
    x_name: column used for the x axis.

  Returns:
    A single Bokeh figure with one line.
  """
  groupd_avg = df.groupby(by=[x_name])[y_name].mean().reset_index()
  # `width`/`height` replace the `plot_width`/`plot_height` keywords, which current
  # Bokeh releases no longer accept. Axis labels match the other plot helpers.
  p = figure(
    width=400, height=400,
    title="Avarage {}".format(y_name),
    x_axis_label=x_name,
    y_axis_label=y_name
  )
  p.line(x=x_name, y=y_name, source=ColumnDataSource(groupd_avg))
  return p

In [None]:
def plot_general_avg_grid(df, y_name, x_name, grid_features):
  """Build line plots of the mean of `y_name` over `x_name` for every feature-value pair.

  For every unordered pair of distinct features in `grid_features`, and every
  combination of the values those two features take in `df`, one Bokeh figure
  is produced from the matching rows.

  Args:
    df: DataFrame holding all referenced columns.
    y_name: column averaged for the y axis.
    x_name: column used for the x axis (the mean is taken per x value).
    grid_features: sequence of feature column names; it is not modified.

  Returns:
    A flat list of Bokeh figures (empty when fewer than two features are given).
  """
  plots = []
  features = list(grid_features)
  # Pair each feature only with the features listed after it. The list being
  # iterated is never mutated, so no pair is skipped.
  for position, feature_name in enumerate(features):
    later_features = features[position + 1:]
    feature_values = df[feature_name].unique()
    for f_val in feature_values:
      for other_feature_name in later_features:
        other_feature_values = df[other_feature_name].unique()
        for of_val in other_feature_values:
          title = feature_name + "=" + str(f_val) + "/" + other_feature_name + "=" + str(of_val)
          selector = (df[feature_name] == f_val) & (df[other_feature_name] == of_val)
          gg = df[selector]
          y = gg.groupby(by=[x_name])[y_name].mean().reset_index()
          raw_data_source = ColumnDataSource(y)
          # `width`/`height` replace the `plot_width`/`plot_height` keywords,
          # which current Bokeh releases no longer accept.
          p = figure(
            width=400, height=400,
            title=title,
            x_axis_label=x_name,
            y_axis_label=y_name
          )
          p.line(x=x_name, y=y_name, source=raw_data_source)
          plots.append(p)
  return plots

# Please enter your name, possible experiment name as well

In [None]:
# Free-text labels for this session; the trailing `#@param` markers expose them as Colab form fields.
researcher_name = "Gal Nitsan" #@param {type:"string"}
experiment_name = "old adults data" #@param {type:"string"}

# File Uploads
## Please note we currently support only CSV files.
##    If you're using Excel or other file formats, please convert to CSV
###  Excel to CSV - https://knowledgebase.constantcontact.com/articles/KnowledgeBase/6409-saving-an-excel-file-as-a-csv-file?lang=en_US
### EDF to CSV - https://emotiv.gitbook.io/emotivpro/convert_edf_to_csv

# 1) Choose Primary file to upload - Only one file!

## If you already uploaded the file, you can select it here


In [None]:
def choose_filename(filename):
  """Identity callback for `interact`: returns the selected file name unchanged."""
  return filename
# Dropdown whose options are the entries of the current working directory.
primary_file_name_dropdown = widgets.Dropdown(
    options=os.listdir(),
    description='File name:',
)

# Render the dropdown; the chosen entry is read later from `primary_file_name_dropdown.value`.
interact(choose_filename, filename=primary_file_name_dropdown)

interactive(children=(Dropdown(description='File name:', options=('.config', 'old_adults_2_2_20.xls.csv', 'cor…

<function __main__.choose_filename>

In [None]:
# Reset the registry of loaded frames: `dfs` maps a name to its DataFrame and
# `data_frame_names` keeps the names in load order (index 0 is the primary frame).
dfs = {}
data_frame_names = []
# Read the file picked in the dropdown and register it under the fixed name 'primary'.
file_name = primary_file_name_dropdown.value
data_frame_names.append('primary')
dfs['primary'] = pd.read_csv(file_name)

## If you have not uploaded the file already, please upload it
## If you used the 2 boxes above make sure to SKIP these 2 boxes, or else you will override your data

In [None]:
# Open the Colab upload dialog; the result is iterated below as (file name, bytes) pairs.
uploaded = files.upload()
# Exactly one primary file is supported.
assert len(uploaded.items()) == 1

Saving old_adults_2_2_20.xls.csv to old_adults_2_2_20.xls.csv


In [None]:
# Reset the registry of loaded frames: `dfs` maps a name to its DataFrame and
# `data_frame_names` keeps the names in load order (index 0 is the primary frame).
dfs = {}
data_frame_names = []
for file_name, byte_file in uploaded.items():
  # The uploaded content is decoded as UTF-8 text and parsed as CSV under the name 'primary'.
  data_frame_names.append('primary')
  dfs['primary'] = pd.read_csv(io.StringIO(byte_file.decode("utf-8")))

# 2) Choose Other files to upload, files we wish to merge to the primary file, like demographic data, metadata on the trial and others

## If you already uploaded the files, you can select them here

In [None]:
def choose_filename(filename):
  """Identity callback for `interact`: returns the current selection unchanged."""
  return filename
# Multi-select list whose options are the entries of the current working directory.
secondary_files_name_dropdown = widgets.SelectMultiple(
    options=os.listdir(),
    description='File name:',
)

# Render the selector; the chosen entries are read later from `secondary_files_name_dropdown.value`.
interact(choose_filename, filename=secondary_files_name_dropdown)

interactive(children=(SelectMultiple(description='File name:', options=('.config', 'old_adults_2_2_20.xls.csv'…

<function __main__.choose_filename>

In [None]:
# Load every file picked in the selector and register it for the merge step.
scondary_file_names = list(secondary_files_name_dropdown.value)
for file_name in scondary_file_names:
  # Register each name only once, so re-running this cell does not make the
  # merge step join the same file repeatedly.
  if file_name not in data_frame_names:
    data_frame_names.append(file_name)
  dfs[file_name] = pd.read_csv(file_name)

## If you have not uploaded the file already, please upload it
## If you used the 2 boxes above please skip these 2

In [None]:
merged_uploaded = files.upload()

Saving corrected_span_groups.csv to corrected_span_groups.csv


In [None]:
# Parse every uploaded file as UTF-8 CSV and register it for the merge step.
for file_name, byte_file in merged_uploaded.items():
  # Register each name only once, so re-running this cell does not make the
  # merge step join the same file repeatedly.
  if file_name not in data_frame_names:
    data_frame_names.append(file_name)
  dfs[file_name] = pd.read_csv(io.StringIO(byte_file.decode("utf-8")))

## Data Summarization

In [None]:
# Print a banner followed by the column/dtype summary of every loaded frame.
for key, df in dfs.items():
  print("###########################################################")
  print("The {} Data set, contains the follwoing data and data types".format(key))
  print("###########################################################")
  # `info()` prints its report directly; nothing is returned.
  df.info()

###########################################################
The primary Data set, contains the follwoing data and data types
###########################################################
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344666 entries, 0 to 344665
Data columns (total 34 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   RECORDING_SESSION_LABEL          344666 non-null  int64  
 1   TRIAL_LABEL                      344666 non-null  object 
 2   TRIAL_INDEX                      344666 non-null  int64  
 3   BIN_INDEX                        344666 non-null  int64  
 4   BIN_DURATION                     344666 non-null  int64  
 5   BIN_START_TIME                   344666 non-null  int64  
 6   BIN_SAMPLE_COUNT                 344666 non-null  int64  
 7   IA_1_ID                          344666 non-null  object 
 8   IA_2_ID                          344666 non-null  object 
 9   IA_3_I

# Merging Extra data from external files - (for example memory span, age, etc)

### Please note that if you wish to look at the raw data before adding more from external sources, all you need to do is just skip the following box and move to the plotting part

### Please note: it is recommended, although not mandatory, to read about JOIN and ANTI JOIN operations in SQL and Python Pandas to fully understand what we are doing here. We will try to make the process as simple as possible


## 1) For each file we wish to merge, select the columns you wish to merge by with the primary file, these columns will be used as unique key and the recommendation will be the User Id/Session Label/Subject and Trial_Id/Trial Label.

In [None]:
def choose_colums(colums):
  """Identity helper: returns the given selection unchanged."""
  return colums

# Widget that will be shown at the bottom, after running the box
external_merge_columns_dropdown = []
# One row per secondary frame (index 0 of `data_frame_names` is the primary frame).
for i in range(1,len(data_frame_names)):
  # Left selector: columns of the secondary frame.
  w1 = widgets.SelectMultiple(
      options=dfs[data_frame_names[i]].columns,
      description=data_frame_names[i]
  )
  # Right selector: columns of the primary frame.
  w2 = widgets.SelectMultiple(
      options=dfs[data_frame_names[0]].columns,
      description=data_frame_names[0]
  )
  external_merge_columns_dropdown.append(widgets.HBox([w1,w2]))
  
# Stack the rows vertically; the bare name on the last line displays the widget.
columns_map = widgets.VBox(external_merge_columns_dropdown)
columns_map

VBox(children=(HBox(children=(SelectMultiple(description='corrected_span_groups.csv', options=('Participant #'…

## 2) Merging is done automatically

In [None]:
# Start from a copy of the primary frame so the entry stored in `dfs` is left untouched.
merged_df = dfs[data_frame_names[0]].copy()
for i in range(1,len(data_frame_names)):
  # Row i-1 of the mapping widget belongs to the i-th (secondary) frame.
  child = columns_map.children[i-1]
  # children[1] is the primary-frame selector, children[0] the secondary-frame selector.
  left = list(child.children[1].value)
  right = list(child.children[0].value)
  # Inner join: primary rows without a matching key in the secondary frame are dropped.
  merged_df = merged_df.merge(dfs[data_frame_names[i]], left_on=left, right_on=right, how='inner')


# 3) Summary of merged data

### General info


In [None]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 344666 entries, 0 to 344665
Data columns (total 37 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   RECORDING_SESSION_LABEL          344666 non-null  int64  
 1   TRIAL_LABEL                      344666 non-null  object 
 2   TRIAL_INDEX                      344666 non-null  int64  
 3   BIN_INDEX                        344666 non-null  int64  
 4   BIN_DURATION                     344666 non-null  int64  
 5   BIN_START_TIME                   344666 non-null  int64  
 6   BIN_SAMPLE_COUNT                 344666 non-null  int64  
 7   IA_1_ID                          344666 non-null  object 
 8   IA_2_ID                          344666 non-null  object 
 9   IA_3_ID                          344666 non-null  object 
 10  IA_4_ID                          344666 non-null  object 
 11  IA_0_ID                          344666 non-null  object 
 12  AV


### Categorical Data

In [None]:
merged_df.describe(include=['object'])

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.77 µs


### Numerical Data

In [None]:
merged_df.describe()

Unnamed: 0,RECORDING_SESSION_LABEL,TRIAL_INDEX,BIN_INDEX,BIN_DURATION,BIN_START_TIME,BIN_SAMPLE_COUNT,AVERAGE_IA_1_SAMPLE_COUNT_%,AVERAGE_IA_2_SAMPLE_COUNT_%,AVERAGE_IA_3_SAMPLE_COUNT_%,AVERAGE_IA_4_SAMPLE_COUNT_%,AVERAGE_IA_0_SAMPLE_COUNT_%,AVERAGE_EXCLUDED_SAMPLE_COUNT_%,AVERAGE_BLINK_SAMPLE_COUNT_%,VARIABLE,load,noise_level,order,type,withha_onset,withoutha_onset,Participant #,Farward WAIS scoring,lowspan_1_highspan_2
count,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0
mean,133.917703,34.486895,86.905839,20.0,1738.116786,9.999646,0.343277,0.059222,0.066202,0.054963,0.280444,0.000191,0.198479,3.940737,2.501549,4.0,390.897878,15.392473,456.276975,518.216111,133.917703,9.448449,1.517333
std,18.228435,19.627711,50.478002,0.0,1009.560048,0.046771,0.471867,0.23303,0.245601,0.224845,0.441432,0.01354,0.393358,1.027321,1.500001,0.0,77.55644,3.123467,488.419193,553.860426,18.228435,1.714253,0.4997
min,102.0,1.0,0.0,20.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,4.0,273.0,9.0,0.0,0.0,102.0,5.0,1.0
25%,120.0,17.0,43.0,20.0,860.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,4.0,327.0,13.0,0.0,0.0,120.0,8.0,1.0
50%,134.0,34.0,87.0,20.0,1740.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,4.0,378.0,17.0,0.0,0.0,134.0,10.0,2.0
75%,149.0,51.0,131.0,20.0,2620.0,10.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0,4.0,4.0,455.0,18.0,953.0,1075.0,149.0,11.0,2.0
max,162.0,68.0,174.0,20.0,3480.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,4.0,4.0,544.0,18.0,1142.0,1271.0,162.0,13.0,2.0


# Dropping unwanted columns
## Here we suggest removing unneeded columns from our primary merged file; this can improve both performance and understanding of the data


## 1) Choose unwanted columns

In [None]:
def choose_colums(colums):
  """Identity callback for `interact`: returns the current selection unchanged."""
  return colums
# Widget that will be shown at the bottom, after running the box
colums_dropdown = widgets.SelectMultiple(
    options=merged_df.columns,
    description='Select colums to drop:',
)

# NOTE(review): this checkbox is displayed, but no visible cell reads its value.
unwanted_checkbox = widgets.Checkbox(
    value=True,
    description='Drop by unwanted'
)
interact(choose_colums, colums=colums_dropdown)
unwanted_checkbox

## 2) Dropping the columns

In [None]:
colums_to_drop = list(colums_dropdown.value)
# `errors='ignore'` keeps this cell re-runnable: columns already removed by a
# previous run are skipped instead of raising KeyError.
merged_df = merged_df.drop(columns=colums_to_drop, errors='ignore')

# Convert every object (string) column to the pandas `category` dtype.
for col in merged_df.columns:
  if merged_df[col].dtype == object:
    merged_df[col] = merged_df[col].astype('category')


## 3) Printing Summary

In [None]:
merged_df.describe(include=['category'])

Unnamed: 0,TRIAL_LABEL,IA_1_ID,IA_2_ID,IA_3_ID,IA_4_ID,IA_0_ID,condition,critical,digit_span_list
count,344666,344666,344666,344666,344666,344666,344666,344666,344666
unique,68,1,1,1,1,1,4,2,102
top,Trial: 9,1: Target,2: Filler_1,3: Distractor,4: Filler_2,0: NULL,0,n,"[7.wav, 1.wav, 1.wav, 1.wav, 1.wav, 1.wav, 1.w..."
freq,5075,344666,344666,344666,344666,344666,180950,182700,15212


#### Numerical Data

In [None]:
merged_df.describe()

Unnamed: 0,RECORDING_SESSION_LABEL,TRIAL_INDEX,BIN_INDEX,BIN_START_TIME,AVERAGE_IA_1_SAMPLE_COUNT_%,AVERAGE_IA_2_SAMPLE_COUNT_%,AVERAGE_IA_3_SAMPLE_COUNT_%,AVERAGE_IA_4_SAMPLE_COUNT_%,AVERAGE_IA_0_SAMPLE_COUNT_%,load,order,type,Farward WAIS scoring,lowspan_1_highspan_2
count,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0
mean,133.917703,34.486895,86.905839,1738.116786,0.343277,0.059222,0.066202,0.054963,0.280444,2.501549,390.897878,15.392473,9.448449,1.517333
std,18.228435,19.627711,50.478002,1009.560048,0.471867,0.23303,0.245601,0.224845,0.441432,1.500001,77.55644,3.123467,1.714253,0.4997
min,102.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,273.0,9.0,5.0,1.0
25%,120.0,17.0,43.0,860.0,0.0,0.0,0.0,0.0,0.0,1.0,327.0,13.0,8.0,1.0
50%,134.0,34.0,87.0,1740.0,0.0,0.0,0.0,0.0,0.0,4.0,378.0,17.0,10.0,2.0
75%,149.0,51.0,131.0,2620.0,1.0,0.0,0.0,0.0,1.0,4.0,455.0,18.0,11.0,2.0
max,162.0,68.0,174.0,3480.0,1.0,1.0,1.0,1.0,1.0,4.0,544.0,18.0,13.0,2.0


## 4) Want to convert some numeric columns to categorical? Do it now


### 4.1) Choose columns to convert to categorical, for example subject_id, trial_id, memory span and others should be categorical

In [None]:
def choose_colums(colums):
  """Identity callback for `interact`: returns the current selection unchanged."""
  return colums
# Widget that will be shown at the bottom, after running the box.
# The label now states what this selector does; it previously repeated the
# "drop" label of the column-removal step.
categorical_colums_dropdown = widgets.SelectMultiple(
    options=merged_df.columns,
    description='Convert to categorical:',
)

interact(choose_colums, colums=categorical_colums_dropdown)

interactive(children=(SelectMultiple(description='Select colums to drop:', options=('RECORDING_SESSION_LABEL',…

<function __main__.choose_colums>

In [None]:
# Cast every column picked in the selector to the pandas `category` dtype.
colums_to_convert_to_categorical = list(categorical_colums_dropdown.value)
for col in colums_to_convert_to_categorical:
  merged_df[col] = merged_df[col].astype('category')

In [None]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 344666 entries, 0 to 344665
Data columns (total 23 columns):
 #   Column                       Non-Null Count   Dtype   
---  ------                       --------------   -----   
 0   RECORDING_SESSION_LABEL      344666 non-null  int64   
 1   TRIAL_LABEL                  344666 non-null  category
 2   TRIAL_INDEX                  344666 non-null  int64   
 3   BIN_INDEX                    344666 non-null  int64   
 4   BIN_START_TIME               344666 non-null  int64   
 5   IA_1_ID                      344666 non-null  category
 6   IA_2_ID                      344666 non-null  category
 7   IA_3_ID                      344666 non-null  category
 8   IA_4_ID                      344666 non-null  category
 9   IA_0_ID                      344666 non-null  category
 10  AVERAGE_IA_1_SAMPLE_COUNT_%  344666 non-null  float64 
 11  AVERAGE_IA_2_SAMPLE_COUNT_%  344666 non-null  float64 
 12  AVERAGE_IA_3_SAMPLE_COUNT_%  344666 non-null

In [None]:
merged_df.describe(include=['category'])

Unnamed: 0,TRIAL_LABEL,IA_1_ID,IA_2_ID,IA_3_ID,IA_4_ID,IA_0_ID,condition,critical,digit_span_list,lowspan_1_highspan_2
count,344666,344666,344666,344666,344666,344666,344666,344666,344666,344666
unique,68,1,1,1,1,1,4,2,102,2
top,Trial: 9,1: Target,2: Filler_1,3: Distractor,4: Filler_2,0: NULL,0,n,"[7.wav, 1.wav, 1.wav, 1.wav, 1.wav, 1.wav, 1.w...",2
freq,5075,344666,344666,344666,344666,344666,180950,182700,15212,178307


In [None]:
merged_df.describe()

Unnamed: 0,RECORDING_SESSION_LABEL,TRIAL_INDEX,BIN_INDEX,BIN_START_TIME,AVERAGE_IA_1_SAMPLE_COUNT_%,AVERAGE_IA_2_SAMPLE_COUNT_%,AVERAGE_IA_3_SAMPLE_COUNT_%,AVERAGE_IA_4_SAMPLE_COUNT_%,AVERAGE_IA_0_SAMPLE_COUNT_%,load,order,type,Farward WAIS scoring
count,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0,344666.0
mean,133.917703,34.486895,86.905839,1738.116786,0.343277,0.059222,0.066202,0.054963,0.280444,2.501549,390.897878,15.392473,9.448449
std,18.228435,19.627711,50.478002,1009.560048,0.471867,0.23303,0.245601,0.224845,0.441432,1.500001,77.55644,3.123467,1.714253
min,102.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,273.0,9.0,5.0
25%,120.0,17.0,43.0,860.0,0.0,0.0,0.0,0.0,0.0,1.0,327.0,13.0,8.0
50%,134.0,34.0,87.0,1740.0,0.0,0.0,0.0,0.0,0.0,4.0,378.0,17.0,10.0
75%,149.0,51.0,131.0,2620.0,1.0,0.0,0.0,0.0,1.0,4.0,455.0,18.0,11.0
max,162.0,68.0,174.0,3480.0,1.0,1.0,1.0,1.0,1.0,4.0,544.0,18.0,13.0


# Plotting the raw data we have so far

## In the next few boxes we are going through a process of plotting the trials a certain Subject was going through

### 1) Histogram
#### 1.1) Select the parameter whose histogram you want to see

In [None]:
def choose_colums(colums):
  """Identity callback for `interact`: returns the current selection unchanged."""
  return colums
# Dropdown listing every column of the merged frame.
hist_parameter_selection = widgets.Dropdown(
    options=merged_df.columns,
    description='Histogram Parameter:',
)

# Render the dropdown; the chosen column is read later from `hist_parameter_selection.value`.
interact(choose_colums, colums=hist_parameter_selection)

interactive(children=(Dropdown(description='Histogram Parameter:', options=('RECORDING_SESSION_LABEL', 'TRIAL_…

<function __main__.choose_colums>

#### 1.2) Plot!

In [None]:
# Histogram of the column picked in the dropdown above.
hist_column_name = hist_parameter_selection.value
hist_col = merged_df[hist_column_name].dropna()

is_numeric = (pd.api.types.is_numeric_dtype(hist_col)
              and not pd.api.types.is_bool_dtype(hist_col))
if is_numeric:
  # Numeric column: bin the values and draw one rectangle per bin.
  counts, edges = np.histogram(hist_col, bins=50)
  p = figure(
      title="Histogram of {}".format(hist_column_name),
      x_axis_label=hist_column_name,
      y_axis_label="count"
  )
  p.quad(top=counts, bottom=0, left=edges[:-1], right=edges[1:])
else:
  # Non-numeric column: one bar per distinct value, labelled with its string form.
  value_counts = hist_col.astype(str).value_counts()
  labels = list(value_counts.index)
  p = figure(
      x_range=labels,
      title="Histogram of {}".format(hist_column_name),
      x_axis_label=hist_column_name,
      y_axis_label="count"
  )
  p.vbar(x=labels, top=list(value_counts.values), width=0.9)
show(p)

IndentationError: ignored

## 3) Basic Scatter

## 4) Per Subject Plotting

### 4.1) Select the name of the column you wish to group by, usually the `subject` or `trial` columns

In [None]:
def col_name(column_name):
  """Identity callback for `interact`: returns the selected column name unchanged."""
  return column_name

# Dropdown listing every column of the merged frame; the choice is used as the grouping column.
groupped_column_dropdown = widgets.Dropdown(
    options=merged_df.columns,
    description='Subject:',
)

interact(col_name, column_name=groupped_column_dropdown)

interactive(children=(Dropdown(description='Subject:', options=('RECORDING_SESSION_LABEL', 'TRIAL_LABEL', 'TRI…

<function __main__.col_name>

### 4.2) Select the columns you wish to use as your `y` and `x` axes.
#### For `y`, usually the column that follow the eye gaze on the `Target`, but could also be for the `Competitor` or one of the `fillers`
#### For `x`, usually the time column

In [None]:
def choose_axis(axis):
  """Identity callback for `interact`: returns the selected column name unchanged."""
  return axis

# Two independent dropdowns over the merged frame's columns, one per axis.
y_column_dropdown = widgets.Dropdown(
    options=merged_df.columns,
    description='Select `Y` axis column name:',
)
x_column_dropdown = widgets.Dropdown(
    options=merged_df.columns,
    description='Select `X` axis column name:',
)

# Render both; the choices are read later from each dropdown's `.value`.
interact(choose_axis, axis=y_column_dropdown)
interact(choose_axis, axis=x_column_dropdown)


interactive(children=(Dropdown(description='Select `Y` axis column name:', options=('RECORDING_SESSION_LABEL',…

interactive(children=(Dropdown(description='Select `X` axis column name:', options=('RECORDING_SESSION_LABEL',…

<function __main__.choose_axis>

### 4.3) Select the different features (e.g memory span, critical, load, noise level) you want to create the grid by

In [None]:
def choose_colums(colums):
  """Identity callback for `interact`: returns the current selection unchanged."""
  return colums
# Widget that will be shown at the bottom, after running the box.
# The label now states what this selector does; it previously repeated the
# "drop" label of the column-removal step.
features_column_dropdown = widgets.SelectMultiple(
    options=merged_df.columns,
    description='Select grid features:',
)

interact(choose_colums, colums=features_column_dropdown)

interactive(children=(SelectMultiple(description='Select colums to drop:', options=('RECORDING_SESSION_LABEL',…

<function __main__.choose_colums>

### 4.4) Plot!!

#### 4.4.1) Per Selection matrix

In [None]:
# Read the current widget selections into plain variables reused by the later plotting cells.
groupping_col_name = groupped_column_dropdown.value
y_name = y_column_dropdown.value
x_name = x_column_dropdown.value
grid_features = list(features_column_dropdown.value)

plots = create_plots(merged_df, groupping_col_name, y_name, x_name, grid_features)
# Lay the figures out in rows of `len(grid_features)` plots each.
show(gridplot(chunks(plots, len(grid_features))))



Groupping by -  RECORDING_SESSION_LABEL
y value by -  AVERAGE_IA_2_SAMPLE_COUNT_%
x value by -  BIN_START_TIME
feature matrix -  ['condition', 'critical', 'load']


#### 4.4.2) General Average Matrix

In [None]:
# Same feature-pair grid as above, computed over the whole frame instead of per group.
plots = plot_general_avg_grid(merged_df, y_name, x_name, grid_features)
# Lay the figures out in rows of `len(grid_features)` plots each.
show(gridplot(chunks(plots, len(grid_features))))

#### 4.4.3) Total Average

In [None]:
show(plot_general_avg(merged_df, y_name, x_name))

# Filtering and Data Cleaning


## 1) Filter By Column Values
#### You can repeat the following steps in order to clean different values from different columns


In [None]:
merged_df = merged_df.dropna()

### 1.1) Choose Column to filter by

In [None]:
def col_name(column_name):
  """Identity callback for `interact`: returns the selected column name unchanged."""
  return column_name

# Dropdown listing every column of the merged frame; the choice is the column to filter on.
filter_column_dropdown = widgets.Dropdown(
    options=merged_df.columns,
    description='Col Name:',
)

interact(col_name, column_name=filter_column_dropdown)

interactive(children=(Dropdown(description='Col Name:', options=('RECORDING_SESSION_LABEL', 'TRIAL_LABEL', 'TR…

<function __main__.col_name>

### 1.2) Choose value to remove from the data

In [None]:
# NOTE(review): `col_name` is also the name of the callback function defined in the
# column-selection cell; this assignment rebinds that name to the selected column label.
col_name = filter_column_dropdown.value
# Distinct values of the chosen column become the dropdown options.
values = list(merged_df[col_name].unique())
def value_name(value):
  """Identity callback for `interact`: returns the selected value unchanged."""
  return value

value_name_dropdown = widgets.Dropdown(
    options=values,
    description='Value Name:',
)

interact(value_name, value=value_name_dropdown)

interactive(children=(Dropdown(description='Value Name:', options=('n', 'y'), value='n'), Output()), _dom_clas…

<function __main__.value_name>

### 1.3) Filter that value out!

In [None]:
# Keep only the rows whose `col_name` entry differs from the selected value.
value = value_name_dropdown.value
selector = (merged_df[col_name] != value)
# `merged_df` is overwritten, so the removed rows are no longer available afterwards.
merged_df = merged_df[selector]

### 1.4) Printing Summary and the same graphs as before

In [None]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 161966 entries, 700 to 344665
Data columns (total 23 columns):
 #   Column                       Non-Null Count   Dtype   
---  ------                       --------------   -----   
 0   RECORDING_SESSION_LABEL      161966 non-null  int64   
 1   TRIAL_LABEL                  161966 non-null  category
 2   TRIAL_INDEX                  161966 non-null  int64   
 3   BIN_INDEX                    161966 non-null  int64   
 4   BIN_START_TIME               161966 non-null  int64   
 5   IA_1_ID                      161966 non-null  category
 6   IA_2_ID                      161966 non-null  category
 7   IA_3_ID                      161966 non-null  category
 8   IA_4_ID                      161966 non-null  category
 9   IA_0_ID                      161966 non-null  category
 10  AVERAGE_IA_1_SAMPLE_COUNT_%  161966 non-null  float64 
 11  AVERAGE_IA_2_SAMPLE_COUNT_%  161966 non-null  float64 
 12  AVERAGE_IA_3_SAMPLE_COUNT_%  161966 non-nu

In [None]:
merged_df.describe(include=['category'])

Unnamed: 0,RECORDING_SESSION_LABEL,TRIAL_INDEX,condition,critical,load,noise_or_quiet,lowspan_1_highspan_2_y
count,70613,70613,70613,70613,70613.0,70613,70613
unique,26,49,2,1,1.0,1,2
top,104,22,r,y,1.0,SNR,2
freq,2800,2275,35553,70613,70613.0,70613,36337


In [None]:
merged_df.describe()

Unnamed: 0,BIN_START_TIME,AVERAGE_IA_1_SAMPLE_COUNT_%,AVERAGE_IA_2_SAMPLE_COUNT_%,AVERAGE_IA_3_SAMPLE_COUNT_%,AVERAGE_IA_4_SAMPLE_COUNT_%,AVERAGE_IA_0_SAMPLE_COUNT_%,noise_level,order,type,withha_onset,withoutha_onset
count,70613.0,70613.0,70613.0,70613.0,70613.0,70613.0,70613.0,70613.0,70613.0,70613.0,70613.0
mean,1734.426239,0.37363,0.052694,0.065311,0.049405,0.283,4.0,386.458584,12.370328,971.357753,1106.154986
std,1007.977804,0.480585,0.220407,0.24361,0.213538,0.442217,0.0,64.963883,2.282702,82.906979,86.921963
min,0.0,0.0,0.0,0.0,0.0,0.0,4.0,310.0,9.0,818.0,925.0
25%,860.0,0.0,0.0,0.0,0.0,0.0,4.0,337.0,11.0,897.0,1031.0
50%,1740.0,0.0,0.0,0.0,0.0,0.0,4.0,360.0,13.0,953.0,1121.0
75%,2600.0,1.0,0.0,0.0,0.0,1.0,4.0,457.0,15.0,1065.0,1168.0
max,3480.0,1.0,1.0,1.0,1.0,1.0,4.0,509.0,16.0,1131.0,1271.0


In [None]:
# Re-draw the per-group grid on the filtered data. `merged_df` is the frame to
# plot; `group_col` is not defined at notebook level.
plots = create_plots(merged_df, groupping_col_name, y_name, x_name, grid_features)
show(gridplot(chunks(plots, len(grid_features))))

Groupping by -  TRIAL_INDEX
y value by -  AVERAGE_IA_1_SAMPLE_COUNT_%
x value by -  BIN_START_TIME
feature matrix -  ['condition', 'load', 'lowspan_1_highspan_2']


In [None]:
# Re-draw the overall feature-pair grid on the filtered data. `merged_df` is the
# frame to plot; `group_col` is not defined at notebook level.
plots = plot_general_avg_grid(merged_df, y_name, x_name, grid_features)
show(gridplot(chunks(plots, len(grid_features))))

In [None]:
# Re-draw the overall average on the filtered data. `merged_df` is the frame to
# plot; `group_col` is not defined at notebook level.
show(plot_general_avg(merged_df, y_name, x_name))

## 2) Filter according to external files.

### Here we will use the idea behind an anti-join to drop certain rows according to files you have prepared in advance

## 2.1) If you already uploaded the file, you can select it here

In [None]:
def choose_filename(filename):
  """Identity callback for `interact`: returns the current selection unchanged."""
  return filename
# Multi-select list whose options are the entries of the current working directory.
anti_join_files_name_dropdown = widgets.SelectMultiple(
    options=os.listdir(),
    description='File name:',
)

# Render the selector; the chosen entries are read later from `anti_join_files_name_dropdown.value`.
interact(choose_filename, filename=anti_join_files_name_dropdown)

interactive(children=(SelectMultiple(description='File name:', options=('.config', 'corrected_span_groups.csv'…

<function __main__.choose_filename>

In [None]:
# Reset the registry of cleaning frames: `cleaning_dfs` maps a file name to its
# DataFrame and `cleaning_file_names` keeps the names in load order.
cleaning_dfs = {}
cleaning_file_names = []
anti_join_file_names = list(anti_join_files_name_dropdown.value)
for file_name in anti_join_file_names:
  cleaning_file_names.append(file_name)
  cleaning_dfs[file_name] = pd.read_csv(file_name)


## 2.2) If you have not uploaded the file already, please upload it
## If you used the 2 boxes above please skip these 2

In [None]:
anti_merge_uploaded = files.upload()

Saving bad_experiments.csv to bad_experiments.csv


In [None]:
# Reset the registry of cleaning frames (this discards anything loaded by the
# file-selection cells above), then parse every uploaded file as UTF-8 CSV.
cleaning_dfs = {}
cleaning_file_names = []
for file_name, byte_file in anti_merge_uploaded.items():
  cleaning_file_names.append(file_name)
  cleaning_dfs[file_name] = pd.read_csv(io.StringIO(byte_file.decode("utf-8")))

## 2.3) Choose the unique column names you wish to anti-join.
## These columns should form a unique key, so that the notebook can remove from the data we have cleaned and merged so far every row that matches a row in the file

In [None]:
def choose_colums(colums):
  """Identity helper: returns the given selection unchanged."""
  return colums

# Widget that will be shown at the bottom, after running the box
external_clean_columns_dropdown = []
# One row per cleaning file, in the order of `cleaning_file_names`.
for i in range(0,len(cleaning_file_names)):
  # Left selector: columns of the cleaning file.
  w1 = widgets.SelectMultiple(
      options=cleaning_dfs[cleaning_file_names[i]].columns,
      description=cleaning_file_names[i]
  )
  # Right selector: columns of the merged frame.
  w2 = widgets.SelectMultiple(
      options=merged_df.columns,
      description='primary data frame'
  )
  external_clean_columns_dropdown.append(widgets.HBox([w1,w2]))
  
# NOTE(review): this rebinds `columns_map`, the name the merge step used for its own widget.
columns_map = widgets.VBox(external_clean_columns_dropdown)
columns_map

VBox(children=(HBox(children=(SelectMultiple(description='bad_experiments.csv', options=('RECORDING_SESSION_LA…

In [None]:
# Remove from `merged_df` every row whose key appears in one of the cleaning files.
for i in range(len(cleaning_file_names)):
  # The widget rows were built in the same 0-based order as `cleaning_file_names`,
  # so row i belongs to file i. The earlier `i-1` paired file 0 with the last row.
  child = columns_map.children[i]
  # children[1] is the merged-frame selector, children[0] the cleaning-file selector.
  left = list(child.children[1].value)
  right = list(child.children[0].value)
  merged_df = anti_join(merged_df, cleaning_dfs[cleaning_file_names[i]], left_on=left, right_on=right)

### 2.4) Printing Summary and the same graphs as before

In [None]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5600 entries, 128261 to 251239
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   RECORDING_SESSION_LABEL      5600 non-null   int64   
 1   TRIAL_INDEX_x                5600 non-null   category
 2   BIN_START_TIME               5600 non-null   int64   
 3   AVERAGE_IA_1_SAMPLE_COUNT_%  5600 non-null   float64 
 4   AVERAGE_IA_2_SAMPLE_COUNT_%  5600 non-null   float64 
 5   AVERAGE_IA_3_SAMPLE_COUNT_%  5600 non-null   float64 
 6   AVERAGE_IA_4_SAMPLE_COUNT_%  5600 non-null   float64 
 7   AVERAGE_IA_0_SAMPLE_COUNT_%  5600 non-null   float64 
 8   condition                    5600 non-null   category
 9   critical                     5600 non-null   category
 10  load                         5600 non-null   category
 11  noise_level                  5600 non-null   float64 
 12  noise_or_quiet               5600 non-null   category
 

In [None]:
merged_df.describe(include=['category'])

Unnamed: 0,TRIAL_INDEX_x,condition,critical,load,noise_or_quiet,lowspan_1_highspan_2_y
count,5600,5600,5600,5600.0,5600,5600
unique,32,2,1,1.0,1,2
top,68,r,y,1.0,SNR,2
freq,175,2800,5600,5600.0,5600,2800


In [None]:
merged_df.describe()

Unnamed: 0,RECORDING_SESSION_LABEL,BIN_START_TIME,AVERAGE_IA_1_SAMPLE_COUNT_%,AVERAGE_IA_2_SAMPLE_COUNT_%,AVERAGE_IA_3_SAMPLE_COUNT_%,AVERAGE_IA_4_SAMPLE_COUNT_%,AVERAGE_IA_0_SAMPLE_COUNT_%,noise_level,order,type,withha_onset,withoutha_onset,TRIAL_INDEX_y
count,5600.0,5600.0,5600.0,5600.0,5600.0,5600.0,5600.0,5600.0,5600.0,5600.0,5600.0,5600.0,0.0
mean,138.5,1740.0,0.510057,0.047627,0.047445,0.026887,0.210484,4.0,477.59375,13.0,955.75,1094.3125,
std,10.500938,1010.436697,0.497052,0.210634,0.209596,0.158731,0.400215,0.0,19.677026,2.236268,102.201652,99.131073,
min,128.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,446.0,10.0,818.0,925.0,
25%,128.0,860.0,0.0,0.0,0.0,0.0,0.0,4.0,460.25,11.5,872.5,1008.75,
50%,138.5,1740.0,1.0,0.0,0.0,0.0,0.0,4.0,477.5,13.0,928.0,1105.5,
75%,149.0,2620.0,1.0,0.0,0.0,0.0,0.0,4.0,495.5,14.5,1042.5,1158.25,
max,149.0,3480.0,1.0,1.0,1.0,1.0,1.0,4.0,509.0,16.0,1131.0,1271.0,


In [None]:
# Re-draw the per-group grid after the external-file cleaning step.
plots = create_plots(merged_df, groupping_col_name, y_name, x_name, grid_features)
# Lay the figures out in rows of `len(grid_features)` plots each.
show(gridplot(chunks(plots, len(grid_features))))

In [None]:
# Re-draw the overall feature-pair grid after the external-file cleaning step.
# `plot_general_avg_grid` takes (df, y_name, x_name, grid_features); the extra
# `group_col` argument was undefined and shifted every parameter.
plots = plot_general_avg_grid(merged_df, y_name, x_name, grid_features)
show(gridplot(chunks(plots, len(grid_features))))

In [None]:
# Re-draw the overall average after the external-file cleaning step.
# `plot_general_avg` takes (df, y_name, x_name); the extra `group_col` argument
# was undefined and made the call fail.
show(plot_general_avg(merged_df, y_name, x_name))

# Saving files
## At any stage you can choose to save the current state of the manipulated data

In [None]:
from datetime import datetime
def save_locally_and_update(df, original_dfs, cleaning_dfs, name_prefix="bck", bucket_name="outliers"):
  """Save ``df`` as a timestamped CSV in the working directory and back it up to a GCS bucket.

  The file name is assembled from the notebook-level globals ``researcher_name``,
  ``experiment_name`` and ``data_frame_names[0]``. Each one falls back to an
  ``anonymous_*`` placeholder when the global is not available.

  SECURITY: no credentials are embedded in this function. ``storage.Client()``
  resolves credentials from the environment (Application Default Credentials,
  e.g. a ``GOOGLE_APPLICATION_CREDENTIALS`` path set outside the notebook or an
  authenticated Colab session). The service-account key that was previously
  hard-coded here was stored in plain text and must be treated as compromised:
  revoke it in the cloud console and issue a new one out of band.

  Args:
    df: DataFrame to persist; written with ``index=False``.
    original_dfs: Currently unused. Reserved for uploading the original input frames.
    cleaning_dfs: Currently unused. Reserved for uploading the cleaning frames.
    name_prefix: Tag placed in the file name just before the timestamp.
    bucket_name: Name of the destination bucket for the backup copy.

  Returns:
    Path of the CSV file written in the current working directory. The path is
    returned even when the bucket upload is skipped.
  """
  # The naming globals are optional; only "not defined" (and, for the list lookup,
  # "empty / not indexable") is treated as a reason to fall back to the placeholder.
  try:
    researcher = researcher_name
  except NameError:
    researcher = "anonymous_researcher"
  try:
    experiment = experiment_name
  except NameError:
    experiment = "anonymous_experiment"
  try:
    primary_file_name = data_frame_names[0]
  except (NameError, IndexError, KeyError, TypeError):
    primary_file_name = "anonymous_primary_file_name"

  date_str = datetime.now().strftime("%m-%d-%Y-%H:%M:%S")
  name = os.path.join(
    os.getcwd(),
    "{}_{}_{}_{}_{}.csv".format(researcher, experiment, primary_file_name, name_prefix, date_str),
  )
  prefix = "{}/{}".format(researcher, experiment)
  name_for_bucket = "{}/{}_{}_{}.csv".format(prefix, primary_file_name, name_prefix, date_str)

  # The local copy is written first so it exists regardless of what happens to the upload.
  df.to_csv(name, index=False)

  # Creating the client is the step that fails when no credentials are configured.
  # In that case keep the local file usable and tell the user why the backup was skipped.
  try:
    storage_client = storage.Client()
  except Exception as err:
    print(
      "WARNING: skipping upload to bucket '{}': could not create a storage client ({}: {}). "
      "Configure Google Cloud credentials outside the notebook to enable the backup.".format(
        bucket_name, type(err).__name__, err
      )
    )
    return name

  # Errors raised by the upload itself are not caught here and propagate to the caller.
  bucket = storage_client.bucket(bucket_name)
  bucket.blob(name_for_bucket).upload_from_filename(name)

  # TODO: also upload original_dfs and cleaning_dfs, each under its own blob name
  # below `prefix` (one blob per frame).

  return name

## 1) Download to your own computer

In [None]:
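# Optional: uncomment to reload a previously saved backup from the working directory
# instead of using the in-memory merged_df (adjust the file name to the one you saved).
# f = os.path.join(str(os.getcwd()), "anonymous_researcher_anonymous_experiment_primary_bck_07-11-2020-12:32:26.csv")
# merged_df = pd.read_csv(f)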

In [None]:
# Persist merged_df through save_locally_and_update (which returns the path of the CSV it
# wrote), then hand that path to google.colab's `files.download` to fetch it in the browser.
saved_file_name = save_locally_and_update(merged_df, dfs, cleaning_dfs)
files.download(saved_file_name)

/content/data.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>