<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setup" data-toc-modified-id="Setup-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setup</a></span><ul class="toc-item"><li><span><a href="#Import-libraries" data-toc-modified-id="Import-libraries-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Import libraries</a></span><ul class="toc-item"><li><span><a href="#Data-processing" data-toc-modified-id="Data-processing-1.1.1"><span class="toc-item-num">1.1.1&nbsp;&nbsp;</span>Data processing</a></span></li><li><span><a href="#Visualization" data-toc-modified-id="Visualization-1.1.2"><span class="toc-item-num">1.1.2&nbsp;&nbsp;</span>Visualization</a></span></li></ul></li><li><span><a href="#Parameter-settings" data-toc-modified-id="Parameter-settings-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Parameter settings</a></span></li><li><span><a href="#Import-Data" data-toc-modified-id="Import-Data-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Import Data</a></span></li></ul></li><li><span><a href="#Helper-functions" data-toc-modified-id="Helper-functions-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Helper functions</a></span><ul class="toc-item"><li><span><a href="#Identify-feature-type" data-toc-modified-id="Identify-feature-type-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Identify feature type</a></span></li><li><span><a href="#Data-structures" data-toc-modified-id="Data-structures-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Data structures</a></span></li></ul></li><li><span><a href="#Main-Functions" data-toc-modified-id="Main-Functions-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Main Functions</a></span><ul class="toc-item"><li><span><a href="#Visualization" data-toc-modified-id="Visualization-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Visualization</a></span><ul class="toc-item"><li><span><a href="#Discrete-Discrete-Confusion-Matrices" data-toc-modified-id="Discrete-Discrete-Confusion-Matrices-3.1.1"><span class="toc-item-num">3.1.1&nbsp;&nbsp;</span>Discrete-Discrete Confusion Matrices</a></span></li><li><span><a href="#Discrete-Continuous-Violin-Plots" data-toc-modified-id="Discrete-Continuous-Violin-Plots-3.1.2"><span class="toc-item-num">3.1.2&nbsp;&nbsp;</span>Discrete-Continuous Violin Plots</a></span></li></ul></li></ul></li></ul></div>

# Setup
## Import libraries
### Data processing

In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
import math
import random
from sklearn.feature_selection import f_regression, mutual_info_regression
from sklearn.metrics import normalized_mutual_info_score
from scipy.stats import multivariate_normal, pearsonr
import scipy.integrate as integrate
from sklearn.neighbors import KernelDensity
from pathlib import Path
import networkx as nx
import json

### Visualization

In [14]:
import matplotlib.pyplot as plt

#Seaborn
import seaborn as sns
sns.set_style("whitegrid")

#Plotly
import plotly
import plotly.tools as tls
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.colors import n_colors

# If you're using this code locally:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# If you're copying this into a jupyter notebook, add:
init_notebook_mode(connected=True)

## Parameter settings

In [3]:
chart = True # boolean for whether to display images while running computation
debug = True # boolean for whether to print updates to the console while running
output = True # boolean for whether to output json and pngs to files
charter = 'Plotly' # also accepts 'Seaborn'
resolution = 150 # int for resolution of output plots
discrete_threshold = 5 # number of responses below which numeric responses are considered discrete
compare_all = True # boolean; if comparing two lists of the same length, fill in list1 and list2 accordingly
#list1, list2 = [],[]
sample_n = 1000 # Work with all data (None), or just a sample?
cd = 'demo_widsdatathon2020/'

In [4]:
if output:
    Path(cd+'output/').mkdir(parents=True, exist_ok=True)
    Path(cd+'output/charts').mkdir(parents=True, exist_ok=True)
    Path(cd+'output/json').mkdir(parents=True, exist_ok=True)

## Import Data

In [5]:
if sample_n: df = pd.read_csv(cd+'data.csv').sample(sample_n)
else: df = pd.read_csv(cd+'data.csv')
ignore = []
for i in list(df.columns):
    try: np.ma.fix_invalid(df[i])
    except: pass
    if ('_id' in i):
        ignore.append(i)
df = df.drop(columns=ignore)
df = df.replace(np.nan, None)
df = df.replace('nan', None)
if debug: print(f'Loaded data from {cd} with {df.shape[0]} observations and {df.shape[1]} features')

Loaded data from demo_widsdatathon2020/ with 1000 observations and 182 features


# Helper functions
## Identify feature type

In [6]:
# Get a list of all response types
response_list = pd.DataFrame(columns=['responses','types'], index=list(df.columns))
response_list['responses']=[list(df[col].value_counts().index) for col in df.columns]

# Delete columns from the dataframe that only have one response
response_list['only_one_r'] = [(len(r)<2) for r in response_list['responses']]
only_one_r = list(response_list[response_list['only_one_r']==True].index)
df = df.drop(columns=only_one_r)
response_list = response_list.drop(index=only_one_r)

In [7]:
def get_types(U):
  types = {'floats':0,'strings':0,'nulls': 0}
  for i in response_list['responses'][U]:
    try:
      val = float(i)
      if math.isnan(val)==False:
        #print("Value",i," is a float")
        types['floats']+=1
      else:
        #print("Value",i," is null")
        types['nulls']+=1
    except ValueError:
      try:
        val = str(i)
        #print("Value",i,"is a string")
        types['strings']+=1
      except:
        print('Error: Unexpected value',i,'for feature',U)
  if ((types['floats']>0) & (types['strings']>0)):
    print('Column',U,'contains floats AND strings')
  return types

In [8]:
response_list['types']=[get_types(col) for col in df.columns]
response_list['string']=[t['strings']>0 for t in response_list['types']]
response_list['float']=[t['floats']>0 for t in response_list['types']]

# Classify features as discrete (fewer than {discrete_threshold} responses, or contains strings) or continuous (more than 15)
response_list['class']=['d' if ((len(r) < discrete_threshold) or (t['strings']>0)) else 'c' for r,t in zip(response_list['responses'],response_list['types'])]

# Store these groups in a list
discrete = list(response_list[response_list['class']=='d'].index)
continuous = list(response_list[response_list['class']=='c'].index)

if debug: print(f'Counted {len(discrete)} discrete features and {len(continuous)} features')

Counted 24 discrete features and 157 features


In [9]:
# Format the data as a string or a float
for i in list(response_list.index):
    V = []
    if (response_list['string'][i]==True) or (response_list['class'][i]=='d'):
        V=[str(v) for v in df[i]]
        df[i]=V
    elif response_list['float'][i]==True:
        V = df[i]
        V=[float(v) for v in df[i]]
        df[i]=V
    else: print('Error formatting column ',i)

## Data structures

In [10]:
def sparsify(series):
  ''' For discrete values: takes a column name and returns a sparse matrix (0 or 1) with a column for each unique response '''
  m=pd.DataFrame(columns=list(series.unique()))
  for i in list(series.unique()):
    m[i]=[int(x==i) for x in series]
  return m

def compute_bandwidth(X):
  ''' Takes a column name and computes suggested gaussian bandwidth with the formula: 1.06*var(n^-0.2) '''
  var = np.var(df[X])
  n = len(df[X].notnull())
  b = 1.06*var*(n**(-0.2))
  return b

# Main Functions
## Visualization
### Discrete-Discrete Confusion Matrices

In [11]:
def DD_viz(df):
    
  ''' Takes a filtered dataframe of two discrete feature columns and generates a heatmap '''

  U=list(df.columns)[0]
  V=list(df.columns)[1]

  i_range = list(df[U].unique())
  j_range = list(df[V].unique())
  s = pd.DataFrame(columns=i_range,index=j_range)
  for i in i_range:
    for j in j_range:
      s[i][j]=df[(df[U]==i) & (df[V]==j)].filter([U,V],axis=1).shape[0]
      mutual_support=s.sum().sum()
  s = s.astype(int)

  if charter=='Plotly':
    fig = ff.create_annotated_heatmap(
        s.values,
        x=[str(i) for i in i_range],
        y=[str(j) for j in j_range],
        colorscale = 'Blues'
    )
    fig.update_layout(
        xaxis_title = U.replace('_',' ').title(),
        yaxis_title = V.replace('_',' ').title(),
        plot_bgcolor="rgba(0, 0, 0, 0)", 
        paper_bgcolor="rgba(0, 0, 0, 0)",
    )
    fig.show()
    fig.update_xaxes(tickcolor='white',tickfont=dict(color='white'))
    fig.update_yaxes(tickcolor='white',tickfont=dict(color='white'))
    fig.update_layout(font=dict(color="white"))
        
    if output:
        with open(str(cd+'output/charts/'+U+'_'+V+'.json'), 'w') as outfile:
            json.dump(fig.to_json(), outfile)
        fig.write_image(str(cd+'output/charts/'+U+'_'+V+'.png'), scale=resolution//72)

        
  else:
    plt.clf()
    plt.figure(dpi=resolution)
    sns.heatmap(s, annot=True, cmap="Blues", cbar=False, linewidths=1)
    plt.xlabel(U.replace('_',' ').title())
    plt.ylabel(V.replace('_',' ').title()) 
    if output:
        plt.savefig(str(cd+'output/charts/'+U+'_'+V+'.png'), dpi=resolution)
    
  if output:
    with open(str(cd+'output/json/'+U+'_'+V+'.json'), 'w') as outfile:
        json.dump(df.to_json(), outfile)
    
  return 

In [12]:
if debug:
    try:
        DD_viz(df.filter(random.sample(discrete,2)).dropna(how='any'))
        print(f'Created an example discrete-discrete plot, generated using {charter}')
    except: pass

Created an example discrete-discrete plot, generated using Plotly


### Discrete-Continuous Violin Plots

In [36]:
def DC_viz(df):

  ''' Takes a subset dataframe of one continuous and one discrete feature and generates a Violin Plot '''

  U=list(df.columns)[0]
  V=list(df.columns)[1]
  
  if (U in continuous):
    D = V
    C = U
  else:
    D = U
    C = V
    
  if charter=='Plotly':
    s = sparsify(df[D])
    df = df.append(s)
    fig = go.Figure()
    print(s)
    #for i in df.nunique():
        #print(i)
        #fig.add_trace(go.Violin(x=s[i]))

    #fig.update_traces(orientation='h', side='positive', width=3, points=False)
    #fig.update_layout(xaxis_showgrid=False, xaxis_zeroline=False)
    #fig.show()
    
  else:
      sns.violinplot(df[D], df[C])
      if (len(df[D]) < 500): sns.swarmplot(x=df[D], y=df[C], edgecolor="white", linewidth=1) # Only show a swarm plot if there are fewer than 500 data points
      plt.xlabel(D)
      plt.ylabel(C)
    
      if output:
        plt.savefig(str(cd+'output/charts/'+U+'_'+V+'.png'), dpi=resolution)
  
  if output: df.to_json(str(cd+'output/json/'+U+'_'+V+'.json'))

  return

In [37]:
DC_viz(df.filter([random.choice(discrete),random.choice(continuous)]))

     Accident & Emergency  Operating Room / Recovery  Floor  Other Hospital  \
0                       1                          0      0               0   
1                       1                          0      0               0   
2                       1                          0      0               0   
3                       0                          1      0               0   
4                       1                          0      0               0   
5                       1                          0      0               0   
6                       1                          0      0               0   
7                       1                          0      0               0   
8                       1                          0      0               0   
9                       0                          1      0               0   
10                      1                          0      0               0   
11                      1                          0


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





ValueError: DataFrame index must be unique for orient='columns'.

In [21]:
random.choice(discrete)

'icu_admit_source'

In [23]:
df['icu_admit_source'].nunique()

5