In [5]:
import numpy as np
import pandas as pd

import univariate

In [12]:
# Display numba's default thread-count setting for this machine
# (the recorded output below shows the value on the machine that ran this).
import numba
numba.config.NUMBA_DEFAULT_NUM_THREADS

8

In [6]:
# Example #1: Simple Controlled
# Columns x0 and x1 are each constructed as the sum of two random columns:
# x0 = x2 + x3 and x1 = x4 + x5. The dependent variable, y, is x0 + x1.
# The feature evaluation should therefore reveal a strong association
# between the dependent variable and columns x0 through x5.

n_obs = 50000
n_cols = 200

# Uniform random matrix, drawn directly in its final (n_obs, n_cols) shape
x = np.random.random_sample((n_obs, n_cols))

# sum23 overwrites column 0, sum45 overwrites column 1
x[:, 0] = np.add(x[:, 2], x[:, 3])
x[:, 1] = np.add(x[:, 4], x[:, 5])

# Dependent variable (sum2345: the total of columns 2 through 5)
y = np.add(x[:, 0], x[:, 1])

# Number of bins for the predictors and for the dependent variable
n_bins_x = n_bins_y = 3

# Value passed as n_reps to the screening call
n_reps = 100

In [7]:
# First timing will include time to compile the numba code.
# The identical call is repeated in the next cell so that the second
# measurement reflects the run time without that one-off compilation.
# cscv_folds=None here; the result is later tabulated with three columns
# (MI, Solo p-value, Unbiased p-value).
%time info_matrix = univariate.screen_univariate(x, y, method='discrete', measure='mi', \
                                                 n_bins_x=n_bins_x, n_bins_y=n_bins_y, \
                                                 n_reps=n_reps, cscv_folds=None, target='cpu')

Wall time: 1.35 s


In [None]:
# Second timing should be actual time to complete
# (same call as the previous cell; no output is recorded for this cell
# because it has not been executed in the saved notebook).
%time info_matrix = univariate.screen_univariate(x, y, method='discrete', measure='mi', \
                                                 n_bins_x=n_bins_x, n_bins_y=n_bins_y, \
                                                 n_reps=n_reps, cscv_folds=None, target='cpu')

In [11]:
# Row labels: the two constructed sum columns first, then one 'rand<i>' label
# for every remaining column index from 2 up to n_cols - 1.
var_names = ['sum23', 'sum45'] + ['rand{}'.format(i) for i in range(2, n_cols)]

# Wrap the screening result in a labelled table, one row per column of x
info_matrix_df = pd.DataFrame(
    data=info_matrix,
    index=var_names,
    columns=['MI', 'Solo p-value', 'Unbiased p-value'],
)
print(info_matrix_df)

               MI  Solo p-value  Unbiased p-value
sum23    0.210773      0.009901          0.009901
sum45    0.205890      0.009901          0.009901
rand2    0.098321      0.009901          0.009901
rand3    0.096504      0.009901          0.009901
rand4    0.095109      0.009901          0.009901
rand5    0.095797      0.009901          0.009901
rand6    0.000006      0.940594          0.990099
rand7    0.000006      0.970297          0.990099
rand8    0.000056      0.257426          0.990099
rand9    0.000057      0.128713          0.990099
rand10   0.000041      0.366337          0.990099
rand11   0.000022      0.702970          0.990099
rand12   0.000026      0.633663          0.990099
rand13   0.000004      0.980198          0.990099
rand14   0.000071      0.128713          0.990099
rand15   0.000031      0.554455          0.990099
rand16   0.000064      0.168317          0.990099
rand17   0.000013      0.881188          0.990099
rand18   0.000016      0.722772          0.990099


In [13]:
# Example #2
# Friedman2 synthetic data set
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_friedman2.html
from sklearn import datasets

# Screening settings: five bins for both the predictors and the target
n_bins_x = n_bins_y = 5
n_reps = 100

# 100,000 generated samples with noise=0.5
x, y = datasets.make_friedman2(n_samples=100000, noise=0.5)

In [15]:
# Time the discrete mutual-information screening of the Friedman2 data.
# cscv_folds is not passed here, so the function's own default applies.
%time info_matrix = univariate.screen_univariate(x=x, y=y, \
                                                 method='discrete', measure='mi', \
                                                 n_bins_x=n_bins_x, n_bins_y=n_bins_y, \
                                                 n_reps=n_reps, target='cpu')

Wall time: 253 ms


In [20]:
# Show the first four result rows.
# NOTE(review): this cell's execution count (In [20]) is later than the
# Example #3 screening cell (In [19]), and the recorded output is identical
# to the one shown for Example #3, so it probably reflects Example #3 rather
# than Example #2 -- re-run the notebook in order to confirm.
info_matrix[:4]

array([[1.6840208e-02, 9.9009899e-03, 9.9009899e-03],
       [1.7804233e-02, 9.9009899e-03, 9.9009899e-03],
       [5.0712138e-02, 9.9009899e-03, 9.9009899e-03],
       [4.2264004e-05, 9.6039605e-01, 9.9009901e-01]], dtype=float32)

In [17]:
# Example #3
# Friedman3 synthetic data set
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_friedman3.html

# Screening settings: five bins for both the predictors and the target
n_bins_x = n_bins_y = 5
n_reps = 100

# 100,000 generated samples with noise=0.5 (replaces the previous x and y)
x, y = datasets.make_friedman3(n_samples=100000, noise=0.5)

In [19]:
# Time the discrete mutual-information screening of the Friedman3 data.
# cscv_folds is not passed here, so the function's own default applies.
%time info_matrix = univariate.screen_univariate(x=x, y=y, \
                                                 method='discrete', measure='mi', \
                                                 n_bins_x=n_bins_x, n_bins_y=n_bins_y, \
                                                 n_reps=n_reps, target='cpu')

Wall time: 271 ms


In [21]:
# Show the first four result rows of the most recent screening run
info_matrix[:4]

array([[1.6840208e-02, 9.9009899e-03, 9.9009899e-03],
       [1.7804233e-02, 9.9009899e-03, 9.9009899e-03],
       [5.0712138e-02, 9.9009899e-03, 9.9009899e-03],
       [4.2264004e-05, 9.6039605e-01, 9.9009901e-01]], dtype=float32)

In [22]:
# Example #4
# make_hastie synthetic data set (classification)
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_hastie_10_2.html

# Screening settings: five bins for the predictors, two for the class target
n_bins_x, n_bins_y = 5, 2
n_reps = 100

# 1,000,000 generated samples (replaces the previous x and y)
x, y = datasets.make_hastie_10_2(n_samples=1000000)

In [23]:
# Time the discrete mutual-information screening of the make_hastie_10_2 data.
# cscv_folds is not passed here, so the function's own default applies.
%time info_matrix = univariate.screen_univariate(x=x, y=y, \
                                                 method='discrete', measure='mi', \
                                                 n_bins_x=n_bins_x, n_bins_y=n_bins_y, \
                                                 n_reps=n_reps, target='cpu')

Wall time: 6.44 s


In [24]:
# Example #5
# Real-world data set (CrowdAnalytix Australian Open Tennis shot classification)
# https://crowdanalytix.com/contests/from-ao-to-ai--predicting-how-points-end-in-tennis

mens_train = pd.read_csv("data/mens_train_file.csv")

# Target: the outcome label translated to an integer class code
d = {"W": 0, "UE": 1, "FE": 2}
mens_target = mens_train['outcome'].map(d)

# Drop the target together with the id, gender and train columns
mens_train = mens_train.drop(['outcome', 'id', 'gender', 'train'], axis=1)

# Both hit-point columns use the same letter -> integer encoding
hit_d = {"B": 0, "F": 1, "U": 2, "V": 3}
mens_train['hitpoint'] = mens_train['hitpoint'].map(hit_d)
mens_train['previous.hitpoint'] = mens_train['previous.hitpoint'].map(hit_d)

# Screening settings: ten bins for the predictors, three for the 3-class target
n_bins_x, n_bins_y = 10, 3
n_reps = 100

In [25]:
# Sanity check: (rows, columns) of the predictor table after the column drops
mens_train.shape

(5000, 24)

In [26]:
%time info_matrix = univariate.screen_univariate(x=mens_train.values.astype(float), \
                                                 y=mens_target.values, \
                                                 method='discrete', measure='mi', \
                                                 n_bins_x=n_bins_x, n_bins_y=n_bins_y, \
                                                 n_reps=100, cscv_folds=8, target='cpu')

Wall time: 2.63 s


In [27]:
# Tabulate the screening result with one row per predictor column, then list
# the rows from highest to lowest MI.
info_matrix_df = pd.DataFrame(info_matrix,
                              columns=['MI', 'Solo', 'Unbiased', 'P(<=median)'],
                              index=mens_train.columns)
# ascending is documented as a bool; use False rather than the integer 0
print(info_matrix_df.sort_values('MI', ascending=False))

                                          MI      Solo  Unbiased  P(<=median)
net.clearance                       0.290100  0.009901  0.009901     0.000000
previous.time.to.net                0.162295  0.009901  0.009901     0.000000
player.impact.depth                 0.132864  0.009901  0.009901     0.000000
previous.speed                      0.114182  0.009901  0.009901     0.000000
speed                               0.090911  0.009901  0.009901     0.000000
outside.sideline                    0.085856  0.009901  0.009901     0.000000
depth                               0.084499  0.009901  0.009901     0.000000
outside.baseline                    0.077132  0.009901  0.009901     0.000000
previous.distance.from.sideline     0.075054  0.009901  0.009901     0.000000
player.depth                        0.062107  0.009901  0.009901     0.028571
opponent.depth                      0.060286  0.009901  0.009901     0.028571
previous.net.clearance              0.058767  0.009901  0.009901

In [None]:
# Example #6
# Random data set for timing purposes - no associations
# 500k x 500 vars x 100 reps
n_obs = 500000
n_cols = 500

# Uniform random predictors, drawn directly in their final (n_obs, n_cols) shape
x = np.random.random_sample((n_obs, n_cols))

# Random target taking the integer values 1, 2 or 3, stored as float64
y = np.random.randint(1, 4, size=n_obs).astype(np.float64)

# Screening settings: five bins for the predictors, three for the target
n_bins_x, n_bins_y = 5, 3
n_reps = 100

In [None]:
# Display numba's default thread-count setting before the large timing runs
# (this cell has no recorded output in the saved notebook).
import numba
numba.config.NUMBA_DEFAULT_NUM_THREADS

In [None]:
%time info_matrix = screen_univariate(x, y, method='discrete', measure='mi', \
                                      n_bins_x=n_bins_x, n_bins_y=n_bins_y, \
                                      n_reps=n_reps, target='cpu')

In [None]:
# Test on larger instance
# 100k x 1k vars x 100 reps
%time info_matrix = screen_univariate(x, y, method='discrete', measure='ur', \
                                      n_bins_x=n_bins_x, n_bins_y=n_bins_y, \
                                      n_reps=n_reps, cscv_folds=8, target='cpu')