In [None]:
# Demonstration notebook of the MCPTFeatureEvaluator class

In [1]:
import numpy as np
import pandas as pd

In [3]:
from mcpt.mcpt import MCPTFeatureEvaluator

In [9]:
import numba
# Report the installed numba version and its default number of threads
print("numba version: {}".format(numba.__version__))
print("Number of threads: {}".format(numba.config.NUMBA_DEFAULT_NUM_THREADS))

numba version: 0.39.0dev0+9.gc4357d1
Number of threads: 8


In [10]:
# Example:
# CrowdAnalytix Australian Open tennis data set
# 5000 observations, 24 variables
# Combination of ints, floats, categoricals, and booleans
#
# Example: 'y' is a column in the pandas DataFrame being passed in
#

mens_train = pd.read_csv("../data/mens_train_file.csv")
mcptfe = MCPTFeatureEvaluator()
%time mcptfe.fit(mens_train, y='outcome', cols_to_discrete='serve', cscv_folds=None)



CPU times: user 886 ms, sys: 67.4 ms, total: 954 ms
Wall time: 331 ms


MCPTFeatureEvaluator(convert_datetime=False, copy=True, impute=False,
           selector_recipe='univariate_unbiased', verbose=False)

In [4]:
# Example:
# CrowdAnalytix Australian Open tennis data set
# 5000 observations, 24 variables
# Combination of ints, floats, categoricals, and booleans
#
# Example: 'y' is a separate pandas Series
# Also when the 'verbose' setting is set to True, additional runtime statistics are presented
#
mens_train = pd.read_csv("../data/mens_train_file.csv")
y_train = mens_train['outcome'].copy()
mens_train = mens_train.drop('outcome', axis=1)

mcptfe = MCPTFeatureEvaluator(verbose=True)
%time mcptfe.fit(mens_train, y=y_train, cols_to_discrete='serve', cscv_folds=None)

Type selector time: 0.00659 sec
Preprocess time: 0.0169 sec
Column type: discrete
   Discrete masters cut time: 0.00117 sec
   Information calculation time: 0.00332 sec
Column type: numeric
   Numeric masters cut time: 0.0044 sec
   Information calculation time: 0.0111 sec
CPU times: user 118 ms, sys: 5.34 ms, total: 123 ms
Wall time: 67.1 ms




MCPTFeatureEvaluator(convert_datetime=False, copy=True, impute=False,
           selector_recipe='univariate_unbiased', verbose=True)

In [5]:
# The MCPT run results are stored on the fitted evaluator in its
# 'information' attribute; evaluating it as the last expression displays it.
mcptfe.information

Unnamed: 0,Variable,MI,Solo p-value,Unbiased p-value
0,net.clearance,0.205472,0.009901,0.009901
1,previous.time.to.net,0.14412,0.009901,0.009901
2,player.impact.depth,0.119891,0.009901,0.009901
3,previous.speed,0.108454,0.009901,0.009901
4,outside.sideline,0.085856,0.009901,0.009901
5,speed,0.082325,0.009901,0.009901
6,outside.baseline,0.077132,0.009901,0.009901
7,previous.distance.from.sideline,0.069688,0.009901,0.009901
8,depth,0.066454,0.009901,0.009901
9,player.depth,0.051928,0.009901,0.009901


In [14]:
# Up to now, the 'discrete' method has been used to calculate the information
# measure. An alternative information measure calculation method is presented
# within scikit-learn. It involves using several nearest neighbors based
# calculations. While this may provide a slightly less-biased mutual information
# estimate (than manual binning), it is much more time consuming. It also
# produces slightly different results.
#
# Demonstrating 'knn'-based information calculation

mens_train = pd.read_csv("../data/mens_train_file.csv")
y_train = mens_train['outcome'].copy()
mens_train = mens_train.drop('outcome', axis=1)

mcptfe = MCPTFeatureEvaluator(selector_recipe='univariate_unbiased')
%time mcptfe.fit(mens_train, y=y_train, \
                 method='knn', measure='mi', \
                 cols_to_discrete=['serve'], \
                 n_reps=100, cscv_folds=None)



CPU times: user 3min 12s, sys: 173 ms, total: 3min 12s
Wall time: 33.9 s


MCPTFeatureEvaluator(convert_datetime=False, copy=True, impute=False,
           selector_recipe='univariate_unbiased', verbose=False)

In [None]:
# The KNN-based feature evaluation method took over 30 seconds, while
# the manual discretization method took less than 100 ms!
# 
# Please use the KNN-based feature evaluation method with caution!

In [15]:
# In addition, the information values from the 'knn' method differ slightly
# from those produced by the 'discrete' method shown earlier
mcptfe.information

Unnamed: 0,Variable,MI,Solo p-value,Unbiased p-value
0,net.clearance,0.332106,0.009901,0.009901
1,player.impact.depth,0.179418,0.009901,0.009901
2,previous.time.to.net,0.167824,0.009901,0.009901
3,previous.speed,0.11357,0.009901,0.009901
4,player.depth,0.110116,0.009901,0.009901
5,depth,0.105527,0.009901,0.009901
6,opponent.depth,0.101176,0.009901,0.009901
7,speed,0.086899,0.009901,0.009901
8,outside.sideline,0.085856,0.009901,0.009901
9,outside.baseline,0.077132,0.009901,0.009901


In [None]:
######################################
## Some synthetic data set examples ##
######################################

In [16]:
###################################
## Example #1: Simple Controlled ##
###################################
# Two independent variables, x0 and x1, are each built as the sum of a pair
# of random variables: x0 = rand2 + rand3 and x1 = rand4 + rand5.
# They are named 'sum23' and 'sum45' respectively.
#
# The dependent variable, y, is the sum of 'sum23' and 'sum45'.
#
# Therefore, the feature evaluation should reveal a strong association between
# the dependent variable and variables 'sum23', 'sum45', and 'rand2' through 'rand5'.
#
# The remaining random columns ('rand6' onward) act as noise.

n_obs = 50000
n_cols = 200

# Seed NumPy's global generator so the synthetic data is identical on every
# run (including after Restart Kernel -> Run All).
np.random.seed(0)

x = np.random.random_sample(n_obs * n_cols)
x = np.reshape(x, (n_obs, n_cols))

# sum23
x[:,0] = x[:,2] + x[:,3]
# sum45
x[:,1] = x[:,4] + x[:,5]

# Dependent variable (sum of rand2 through rand5)
y = x[:,0] + x[:,1]

n_bins_x = 3
n_bins_y = 3

n_reps = 100

# Wrap the array in a pandas DataFrame with named columns
col_names = ['sum23', 'sum45'] + ['rand' + str(i) for i in range(2, n_cols)]
X = pd.DataFrame(x, columns=col_names)

In [17]:
mcptfe = MCPTFeatureEvaluator(selector_recipe='univariate_unbiased')
%time mcptfe.fit(X, y=y, \
                 n_bins_x=n_bins_x, \
                 n_bins_y=n_bins_y, \
                 n_reps=100, cscv_folds=None)

CPU times: user 8.93 s, sys: 436 ms, total: 9.37 s
Wall time: 2.22 s


MCPTFeatureEvaluator(convert_datetime=False, copy=True, impute=False,
           selector_recipe='univariate_unbiased', verbose=False)

In [18]:
# Explore the information measures for the first ten rows of the results
# table computed from 'X'
mcptfe.information.head(10)

Unnamed: 0,Variable,MI,Solo p-value,Unbiased p-value
0,sum23,0.208363,0.009901,0.009901
1,sum45,0.204688,0.009901,0.009901
2,rand2,0.097008,0.009901,0.009901
3,rand3,0.096852,0.009901,0.009901
4,rand5,0.096036,0.009901,0.009901
5,rand4,0.095228,0.009901,0.009901
6,rand169,0.000146,0.019802,0.643564
7,rand110,0.000121,0.009901,0.940594
8,rand185,0.000112,0.049505,0.990099
9,rand196,0.000102,0.029703,0.990099


In [19]:
# scikit-learn's SelectorMixin requires a 'get_support' function, which
# provides a boolean mask of the variables to be selected. The mask follows
# the original column ordering and reflects the designated
# 'selector_recipe' (univariate_unbiased).
support_mask = mcptfe.get_support()
selected_columns = X.columns[support_mask]
is_selected = mcptfe.information['Variable'].isin(selected_columns)
mcptfe.information.loc[is_selected]

Unnamed: 0,Variable,MI,Solo p-value,Unbiased p-value
0,sum23,0.208363,0.009901,0.009901
1,sum45,0.204688,0.009901,0.009901
2,rand2,0.097008,0.009901,0.009901
3,rand3,0.096852,0.009901,0.009901
4,rand5,0.096036,0.009901,0.009901
5,rand4,0.095228,0.009901,0.009901


In [20]:
# Execute MCPTFeatureEvaluator on the same synthetic data set
# but with a different selector_recipe (univariate_cscv)
mcptfe = MCPTFeatureEvaluator(selector_recipe='univariate_cscv')
%time mcptfe.fit(X, y=y, \
                 n_bins_x=n_bins_x, \
                 n_bins_y=n_bins_y, \
                 n_reps=100, cscv_folds=8)

CPU times: user 1min 31s, sys: 2.79 s, total: 1min 34s
Wall time: 19.1 s


MCPTFeatureEvaluator(convert_datetime=False, copy=True, impute=False,
           selector_recipe='univariate_cscv', verbose=False)

In [21]:
# Since the P(<=median) value by itself primarily helps determine the
# top and bottom halves of variables in terms of information
# measure strength, it is best to combine it with the unbiased p-value
# measure if feature selection will be performed
mcptfe.information.head(10)

Unnamed: 0,Variable,MI,Solo p-value,Unbiased p-value,P(<=median)
0,sum23,0.208363,0.009901,0.009901,0.0
1,sum45,0.204688,0.009901,0.009901,0.0
2,rand2,0.097008,0.009901,0.009901,0.0
3,rand3,0.096852,0.009901,0.009901,0.0
4,rand5,0.096036,0.009901,0.009901,0.0
5,rand4,0.095228,0.009901,0.009901,0.0
6,rand169,0.000146,0.009901,0.683168,0.014286
7,rand110,0.000121,0.009901,0.960396,0.142857
8,rand185,0.000112,0.019802,0.990099,0.085714
9,rand196,0.000102,0.079208,0.990099,0.085714


In [22]:
##############################################
## Example #2: Friedman2 synthetic data set ##
##############################################
#
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_friedman2.html
#
# y(X) = (X[:,0] ** 2 + (X[:,1] * X[:,2] - 1 / (X[:,1] * X[:,3])) ** 2) ** 0.5 + noise * N(0,1)

from sklearn import datasets

# A fixed random_state makes the generated data identical on every run
X, y = datasets.make_friedman2(n_samples=100000, noise=0.5, random_state=0)
n_bins_x = 5
n_bins_y = 5

n_reps = 100

# Name the four generated feature columns 'rand0' .. 'rand3'
col_names = ['rand' + str(i) for i in range(4)]
X = pd.DataFrame(X, columns=col_names)

In [23]:
mcptfe = MCPTFeatureEvaluator(selector_recipe='univariate_unbiased')
%time mcptfe.fit(X, y=y, \
                 method='discrete', measure='mi', \
                 n_bins_x=n_bins_x, \
                 n_bins_y=n_bins_y, \
                 n_reps=100, cscv_folds=None)

CPU times: user 442 ms, sys: 42.8 ms, total: 485 ms
Wall time: 322 ms


MCPTFeatureEvaluator(convert_datetime=False, copy=True, impute=False,
           selector_recipe='univariate_unbiased', verbose=False)

In [24]:
# Display the MCPT results table for the Friedman2 fit
mcptfe.information

Unnamed: 0,Variable,MI,Solo p-value,Unbiased p-value
0,rand2,0.390108,0.009901,0.009901
1,rand1,0.264195,0.009901,0.009901
2,rand0,0.001705,0.009901,0.009901
3,rand3,0.000102,0.19802,0.544554


In [25]:
# get_support() returns a boolean mask in the original column order of X,
# while the 'information' table is displayed ordered by MI. Applying the mask
# positionally to the table would therefore pick the wrong rows whenever the
# two orderings disagree, so map the mask to column names first.
selected_columns = X.columns[mcptfe.get_support()]
mcptfe.information.loc[mcptfe.information['Variable'].isin(selected_columns)]

Unnamed: 0,Variable,MI,Solo p-value,Unbiased p-value
0,rand2,0.390108,0.009901,0.009901
1,rand1,0.264195,0.009901,0.009901
2,rand0,0.001705,0.009901,0.009901


In [26]:
##############################################
## Example #3: Friedman3 synthetic data set ##
##############################################
# 
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_friedman3.html
#
# y(X) = arctan((X[:,1] * X[:,2] - 1 / (X[:,1] * X[:,3])) / X[:,0]) + noise * N(0, 1).

# A fixed random_state makes the generated data identical on every run
X, y = datasets.make_friedman3(n_samples=100000, noise=0.5, random_state=0)
n_bins_x = 5
n_bins_y = 5

n_reps = 100

# Name the four generated feature columns 'rand0' .. 'rand3'
col_names = ['rand' + str(i) for i in range(4)]
X = pd.DataFrame(X, columns=col_names)

In [27]:
mcptfe = MCPTFeatureEvaluator(selector_recipe='univariate_unbiased')
%time mcptfe.fit(X, y=y, \
                 method='discrete', measure='mi', \
                 n_bins_x=n_bins_x, \
                 n_bins_y=n_bins_y, \
                 n_reps=100, cscv_folds=None)

CPU times: user 444 ms, sys: 17.9 ms, total: 462 ms
Wall time: 293 ms


MCPTFeatureEvaluator(convert_datetime=False, copy=True, impute=False,
           selector_recipe='univariate_unbiased', verbose=False)

In [28]:
# Display the MCPT results table for the Friedman3 fit
mcptfe.information

Unnamed: 0,Variable,MI,Solo p-value,Unbiased p-value
0,rand2,0.05221,0.009901,0.009901
1,rand1,0.018514,0.009901,0.009901
2,rand0,0.016579,0.009901,0.009901
3,rand3,8.2e-05,0.376238,0.851485


In [29]:
#################################################################
## Example #4: make_hastie synthetic data set (classification) ##
#################################################################
#
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_hastie_10_2.html
#
# y[i] = 1 if np.sum(X[i] ** 2) > 9.34 else -1

# A fixed random_state makes the generated data identical on every run
x, y = datasets.make_hastie_10_2(n_samples=1000000, random_state=0)
n_bins_x = 5
n_bins_y = 2

n_reps = 100

# Name the ten generated feature columns 'rand0' .. 'rand9'
col_names = ['rand' + str(i) for i in range(10)]
X = pd.DataFrame(x, columns=col_names)

In [30]:
mcptfe = MCPTFeatureEvaluator(selector_recipe='univariate_unbiased')
%time mcptfe.fit(X, y=y, \
                 method='discrete', measure='mi', \
                 n_bins_x=n_bins_x, \
                 n_bins_y=n_bins_y, \
                 n_reps=100, cscv_folds=None)

CPU times: user 16.5 s, sys: 888 ms, total: 17.4 s
Wall time: 7.58 s


MCPTFeatureEvaluator(convert_datetime=False, copy=True, impute=False,
           selector_recipe='univariate_unbiased', verbose=False)

In [31]:
# Display the MCPT results table for the make_hastie_10_2 fit
mcptfe.information

Unnamed: 0,Variable,MI,Solo p-value,Unbiased p-value
0,rand2,0.017051,0.009901,0.009901
1,rand6,0.016991,0.009901,0.009901
2,rand4,0.016909,0.009901,0.009901
3,rand8,0.016857,0.009901,0.009901
4,rand7,0.01678,0.009901,0.009901
5,rand1,0.016724,0.009901,0.009901
6,rand9,0.016724,0.009901,0.009901
7,rand0,0.016671,0.009901,0.009901
8,rand3,0.016631,0.009901,0.009901
9,rand5,0.016567,0.009901,0.009901


In [32]:
# get_support() returns a boolean mask in the original column order of X,
# while the 'information' table is displayed ordered by MI. Applying the mask
# positionally to the table would therefore pick the wrong rows whenever the
# two orderings disagree, so map the mask to column names first.
selected_columns = X.columns[mcptfe.get_support()]
mcptfe.information.loc[mcptfe.information['Variable'].isin(selected_columns)]

Unnamed: 0,Variable,MI,Solo p-value,Unbiased p-value
0,rand2,0.017051,0.009901,0.009901
1,rand6,0.016991,0.009901,0.009901
2,rand4,0.016909,0.009901,0.009901
3,rand8,0.016857,0.009901,0.009901
4,rand7,0.01678,0.009901,0.009901
5,rand1,0.016724,0.009901,0.009901
6,rand9,0.016724,0.009901,0.009901
7,rand0,0.016671,0.009901,0.009901
8,rand3,0.016631,0.009901,0.009901
9,rand5,0.016567,0.009901,0.009901


In [33]:
#######################################################################
## Example #5: Random data set for timing purposes - no associations ##
#######################################################################
# 
# 500,000 observations
# 500 variables
# 100 mcpt reps

n_obs = 500000
n_cols = 500

# Seed NumPy's global generator so the random data is identical on every run
np.random.seed(0)

X = np.random.random_sample(n_obs * n_cols)
X = np.reshape(X, (n_obs, n_cols))

# Target drawn independently of X: integers from 1 to 3, stored as float64
y = np.random.randint(1,4,n_obs).astype(np.float64)

n_bins_x = 5
n_bins_y = 3

n_reps = 100

col_names = ['rand' + str(i) for i in range(n_cols)]
X = pd.DataFrame(X, columns=col_names)

In [34]:
mcptfe = MCPTFeatureEvaluator(selector_recipe='univariate_unbiased')
%time mcptfe.fit(X, y=y, \
                 method='discrete', measure='mi', \
                 n_bins_x=n_bins_x, \
                 n_bins_y=n_bins_y, \
                 n_reps=100, cscv_folds=None)

CPU times: user 5min 42s, sys: 31 s, total: 6min 13s
Wall time: 1min 37s


MCPTFeatureEvaluator(convert_datetime=False, copy=True, impute=False,
           selector_recipe='univariate_unbiased', verbose=False)

In [None]:
# In addition to the synthetic data sets and the tennis data set example
# in this notebook, there are some additional data sets available in the 
# PMLB package (Penn Machine Learning Benchmark)
# 
# Unfortunately, these data sets are fairly small, but they do
# offer a variety of data types

In [35]:
#####################################
## PMLB Example #1: Customer churn ##
#####################################
import pmlb

# Fetch the 'churn' data set as a single frame; the 'target' column is
# separated from the features in the next cell.
churn_df = pmlb.fetch_data('churn', return_X_y=False)

In [36]:
# Separate the 'target' column from the churn features
y = churn_df.loc[:, 'target'].copy()
churn_df = churn_df.drop(labels='target', axis='columns')

In [39]:
mcptfe = MCPTFeatureEvaluator(impute=True)
%time mcptfe.fit(churn_df, y=y, \
                 cols_to_discrete=['state','area code'], \
                 cscv_folds=None)

CPU times: user 104 ms, sys: 8.02 ms, total: 112 ms
Wall time: 66.9 ms


MCPTFeatureEvaluator(convert_datetime=False, copy=True, impute=True,
           selector_recipe='univariate_unbiased', verbose=False)

In [40]:
# Display the MCPT results table for the churn fit
mcptfe.information

Unnamed: 0,Variable,MI,Solo p-value,Unbiased p-value
0,number customer service calls,0.033466,0.009901,0.009901
1,total day minutes,0.031431,0.009901,0.009901
2,total day charge,0.031431,0.009901,0.009901
3,international plan,0.025213,0.009901,0.009901
4,state,0.009827,0.009901,0.009901
5,number vmail messages,0.007921,0.009901,0.019802
6,voice mail plan,0.006816,0.009901,0.108911
7,total eve charge,0.004505,0.009901,0.683168
8,total eve minutes,0.004505,0.009901,0.683168
9,total intl calls,0.003405,0.009901,0.970297


In [41]:
mcptfe = MCPTFeatureEvaluator(selector_recipe='univariate_cscv')
%time mcptfe.fit(churn_df, y=y, \
                 cols_to_discrete=['state','area code'], \
                 cscv_folds=8)

CPU times: user 731 ms, sys: 73 ms, total: 804 ms
Wall time: 298 ms


MCPTFeatureEvaluator(convert_datetime=False, copy=True, impute=False,
           selector_recipe='univariate_cscv', verbose=False)

In [42]:
# Display the MCPT results table for the churn fit with 'univariate_cscv'
mcptfe.information

Unnamed: 0,Variable,MI,Solo p-value,Unbiased p-value,P(<=median)
0,number customer service calls,0.033466,0.009901,0.009901,0.0
1,total day minutes,0.031431,0.009901,0.009901,0.0
2,total day charge,0.031431,0.009901,0.009901,0.0
3,international plan,0.025213,0.009901,0.009901,0.0
4,state,0.009827,0.009901,0.009901,0.0
5,number vmail messages,0.007921,0.009901,0.009901,0.0
6,voice mail plan,0.006816,0.009901,0.059406,0.0
7,total eve charge,0.004505,0.009901,0.673267,0.214286
8,total eve minutes,0.004505,0.009901,0.673267,0.214286
9,total intl calls,0.003405,0.009901,0.920792,0.471429


In [43]:
# Translate the get_support() mask into column names, then show only the
# rows of the results table that belong to the selected variables.
support_mask = mcptfe.get_support()
selected_columns = churn_df.columns[support_mask]
is_selected = mcptfe.information['Variable'].isin(selected_columns)
mcptfe.information.loc[is_selected]

Unnamed: 0,Variable,MI,Solo p-value,Unbiased p-value,P(<=median)
0,number customer service calls,0.033466,0.009901,0.009901,0.0
1,total day minutes,0.031431,0.009901,0.009901,0.0
2,total day charge,0.031431,0.009901,0.009901,0.0
3,international plan,0.025213,0.009901,0.009901,0.0
4,state,0.009827,0.009901,0.009901,0.0
5,number vmail messages,0.007921,0.009901,0.009901,0.0


In [46]:
##############################
## PMLB Example #2: Vehicle ##
##############################
vehicle_df = pmlb.fetch_data('vehicle', return_X_y=False)
y = vehicle_df['target'].copy()
vehicle_df = vehicle_df.drop('target', axis=1)

mcptfe = MCPTFeatureEvaluator(selector_recipe='univariate_cscv')
%time mcptfe.fit(vehicle_df, y=y, \
                 cscv_folds=8)

CPU times: user 144 ms, sys: 36.1 ms, total: 180 ms
Wall time: 78.7 ms


MCPTFeatureEvaluator(convert_datetime=False, copy=True, impute=False,
           selector_recipe='univariate_cscv', verbose=False)

In [48]:
# Display the MCPT results table for the vehicle fit with 'univariate_cscv'
mcptfe.information

Unnamed: 0,Variable,MI,Solo p-value,Unbiased p-value,P(<=median)
0,MINORVARIANCE,0.280788,0.009901,0.009901,0.0
1,SCATTER RATIO,0.278156,0.009901,0.009901,0.0
2,PR AXISRECTANGULAR,0.268996,0.009901,0.009901,0.0
3,ELONGATEDNESS,0.26412,0.009901,0.009901,0.0
4,MAJORVARIANCE,0.233151,0.009901,0.009901,0.0
5,MAX LENGTH ASPECT RATIO,0.176463,0.009901,0.009901,0.0
6,DISTANCE CIRCULARITY,0.155515,0.009901,0.009901,0.114286
7,RADIUS RATIO,0.145299,0.009901,0.009901,0.385714
8,COMPACTNESS,0.128962,0.009901,0.009901,0.5
9,CIRCULARITY,0.125524,0.009901,0.009901,0.585714


In [49]:
mcptfe = MCPTFeatureEvaluator(selector_recipe='univariate_unbiased')
%time mcptfe.fit(vehicle_df, y=y, \
                 method = 'knn', \
                 n_reps = 100, \
                 cscv_folds=None)

CPU times: user 57.5 s, sys: 719 ms, total: 58.3 s
Wall time: 18.6 s


MCPTFeatureEvaluator(convert_datetime=False, copy=True, impute=False,
           selector_recipe='univariate_unbiased', verbose=False)

In [50]:
# Display the MCPT results table for the 'knn'-based vehicle fit
mcptfe.information

Unnamed: 0,Variable,MI,Solo p-value,Unbiased p-value
0,MINORVARIANCE,0.91536,0.009901,0.009901
1,RADIUS RATIO,0.343377,0.405941,0.990099
2,GYRATIONRADIUS,0.313067,0.009901,0.990099
3,SCATTER RATIO,0.143976,0.049505,0.990099
4,MAJORVARIANCE,0.052982,0.316832,0.990099
5,COMPACTNESS,0.0,0.009901,0.990099
6,MAJORKURTOSIS,0.0,0.009901,0.990099
7,MINORKURTOSIS,0.0,0.009901,0.990099
8,MINORSKEWNESS,0.0,0.009901,0.990099
9,MAJORSKEWNESS,0.0,0.009901,0.990099
