In [1]:
# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
# 4. magic to enable retina (high resolution) plots
# https://gist.github.com/minrk/3301035
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# change default style figure and font size
plt.rcParams['figure.figsize'] = 8, 6
plt.rcParams['font.size'] = 12

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,sklearn,matplotlib

Ethen 2018-02-17 11:45:58 

CPython 3.6.3
IPython 6.1.0

numpy 1.14.0
pandas 0.22.0
sklearn 0.19.1
matplotlib 2.1.0


- **Entropy** measures the average amount of information needed to communicate something. If we knew for sure what was going to happen, we wouldn't have to send any messages at all. On the other hand, if there are two things that could happen with a 50/50 probability, we would need to send 1 bit of message to communicate the outcome. Hence, the more concentrated the probability, the more we can craft a clever code with a shorter average message length. Another way of viewing entropy is that it measures the variation in the data: the larger the entropy, the greater the variation we have in our data.
- **Cross Entropy** measures the average message length needed to communicate an event from one distribution using the optimal code for another distribution.
- **Kullback–Leibler divergence**, commonly just referred to as KL divergence, measures the difference between cross entropy and entropy. This difference measures how much longer our messages are going to be because we used a code that's optimized for a different distribution. If the distributions are the same, the difference will be zero. On the other hand, it will become bigger as the difference between the distributions grows.

Cross entropy and KL divergence are incredibly useful in machine learning. Often, we want one distribution to be close to another. For example, we might want a predicted distribution to be close to the ground truth. Cross entropy and KL divergence give us a natural way to do this, which is why we see them showing up everywhere.

## Mutual Information

In [2]:
from sklearn.datasets import fetch_20newsgroups

# fetch the 'train' and 'test' subsets of the 20 newsgroups dataset
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

# `.data` is used as the model input and `.target` as the label
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, mutual_info_classif

# baseline: tf-idf features fed directly into a class-balanced logistic regression
tfidf = TfidfVectorizer()
logistic = LogisticRegression(class_weight='balanced')
pipeline1 = Pipeline(steps=[('tfidf', tfidf), ('logistic', logistic)])
pipeline1.fit(X_train, y_train)

pipeline_pred_train = pipeline1.predict(X_train)
pipeline_pred_test = pipeline1.predict(X_test)

# accuracy on the training documents
accuracy_score(y_true=y_train, y_pred=pipeline_pred_train)

0.9728654764009192

In [4]:
# accuracy on the held-out test documents
accuracy_score(y_true=y_test, y_pred=pipeline_pred_test)

0.8271375464684015

In [7]:
from kaggler.online_model import FTRL

# tf-idf features followed by the FTRL online model from kaggler.
# NOTE(review): when this cell was run, the final accuracy_score call raised
# "ValueError: Classification metrics can't handle a mix of multiclass and
# continuous targets" -- presumably this estimator's predict() returns
# continuous scores rather than class labels; confirm against the kaggler docs.
# NOTE(review): the estimator is stored under the name `logistic` and the
# pipeline under `pipeline3`; both names are reassigned by other cells.
tfidf = TfidfVectorizer()
logistic = FTRL(a=.1,                # alpha in the per-coordinate rate
           b=1,                 # beta in the per-coordinate rate
           l1=1.,               # L1 regularization parameter
           l2=1.,               # L2 regularization parameter
           n=2**20,             # number of hashed features
           epoch=1,             # number of epochs
           interaction=True)
pipeline3 = Pipeline([
    ('tfidf', tfidf),
    ('logistic', logistic)
]).fit(X_train, y_train)

pipeline_pred_train = pipeline3.predict(X_train)
pipeline_pred_test = pipeline3.predict(X_test)
# accuracy on the training documents (this is the call that failed, see note above)
accuracy_score(y_train, pipeline_pred_train)

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

In [None]:
# accuracy on the held-out test documents
accuracy_score(y_true=y_test, y_pred=pipeline_pred_test)

In [5]:
# tf-idf -> keep the 5000 features with the highest chi-squared score
# -> class-balanced logistic regression
tfidf = TfidfVectorizer()
feature_selection = SelectKBest(score_func=chi2, k=5000)
logistic = LogisticRegression(class_weight='balanced')

pipeline2 = Pipeline(steps=[
    ('tfidf', tfidf),
    ('feature_selection', feature_selection),
    ('logistic', logistic),
])
pipeline2.fit(X_train, y_train)

pipeline_pred_train = pipeline2.predict(X_train)
pipeline_pred_test = pipeline2.predict(X_test)

# accuracy on the training documents
accuracy_score(y_true=y_train, y_pred=pipeline_pred_train)

0.9052501325791056

In [6]:
# accuracy on the held-out test documents
accuracy_score(y_true=y_test, y_pred=pipeline_pred_test)

0.7952734997344663

In [8]:
from sklearn.linear_model import ElasticNet
from sklearn.feature_selection import SelectFromModel

# tf-idf -> model-based feature selection driven by an elastic net
# -> class-balanced logistic regression.
# NOTE(review): the accuracies recorded for this cell are identical to the ones
# recorded for the plain tf-idf + logistic regression pipeline, which suggests
# the selector did not remove any feature -- verify, e.g. by inspecting the
# fitted selector's support mask.
# NOTE(review): ElasticNet is a regressor, so it is fit on the class ids as if
# they were numeric targets -- confirm this is intended.
# NOTE(review): `pipeline3` is also assigned by other cells in this notebook.
elastic_net = ElasticNet(l1_ratio = 0.8)
tfidf = TfidfVectorizer()
# prefit = False: the elastic net is fit as part of the pipeline's fit call
feature_selection = SelectFromModel(elastic_net, threshold = 'mean', prefit = False)
logistic = LogisticRegression(class_weight = 'balanced')
pipeline3 = Pipeline([
    ('tfidf', tfidf),
    ('feature_selection', feature_selection),
    ('logistic', logistic)
]).fit(X_train, y_train)

pipeline_pred_train = pipeline3.predict(X_train)
pipeline_pred_test = pipeline3.predict(X_test)
# accuracy on the training documents
accuracy_score(y_train, pipeline_pred_train)

0.9728654764009192

In [9]:
# accuracy on the held-out test documents
accuracy_score(y_true=y_test, y_pred=pipeline_pred_test)

0.8271375464684015

In [7]:
# tfidf = TfidfVectorizer()
# feature_selection = SelectKBest(mutual_info_classif, k=5000)
# logistic = LogisticRegression(class_weight = 'balanced')
# pipeline2 = Pipeline([
#     ('tfidf', tfidf),
#     ('feature_selection', feature_selection),
#     ('logistic', logistic)
# ]).fit(X_train, y_train)

# pipeline_pred_train = pipeline2.predict(X_train)
# pipeline_pred_test = pipeline2.predict(X_test)
# accuracy_score(y_train, pipeline_pred_train)

In [8]:
# accuracy_score(y_test, pipeline_pred_test)

In [10]:
import mifs

# tf-idf -> mifs mutual information feature selector -> logistic regression.
# NOTE(review): running this cell raised "AttributeError: 'list' object has no
# attribute 'A'"; which call raised it is not visible here -- TODO investigate.
# NOTE(review): `logistic` is not defined in this cell; it is whatever estimator
# an earlier cell left in the namespace.
# NOTE(review): `pipeline3` is also assigned by other cells in this notebook.
tfidf = TfidfVectorizer()
feat_selector = mifs.MutualInformationFeatureSelector()
pipeline3 = Pipeline([
    ('tfidf', tfidf),
    ('feature_selection', feat_selector),
    ('logistic', logistic)
]).fit(X_train, y_train)

pipeline_pred_train = pipeline3.predict(X_train)
pipeline_pred_test = pipeline3.predict(X_test)
# accuracy on the training documents
accuracy_score(y_train, pipeline_pred_train)

AttributeError: 'list' object has no attribute 'A'

In [None]:
# accuracy on the held-out test documents
accuracy_score(y_true=y_test, y_pred=pipeline_pred_test)

In [None]:
# (scratch cell: it only held the bare, undefined name `hi`, which raised
# NameError and stopped Restart & Run All; nothing is executed here any more)

In [None]:
import nltk
from itertools import chain
from nltk.corpus import brown
from sklearn.feature_extraction.text import CountVectorizer

# sentences of the 'news' category of the Brown corpus
sentences = brown.sents(categories='news')

# NOTE(review): the sentences are flattened before vectorizing, so presumably
# every token is handed to the vectorizer as its own document -- confirm that
# this is intended rather than one document per sentence.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(chain.from_iterable(sentences))
X

In [None]:
# https://stackoverflow.com/questions/20491028/optimal-way-to-compute-pairwise-mutual-information-using-numpy
import numpy as np


def calc_MI(X, Y, bins):
    """Histogram based estimate of the mutual information between X and Y.

    Both inputs are binned with ``bins`` (forwarded unchanged to
    ``np.histogram`` / ``np.histogram2d``) and the estimate is assembled
    from the entropies of the resulting count tables, each computed with
    ``shan_entropy``:

        I(X; Y) = H(X) + H(Y) - H(X, Y)

    Parameters
    ----------
    X, Y : 1d array-like
        The two samples to compare.
    bins : int or sequence
        Bin specification shared by the marginal and the joint histograms.

    Returns
    -------
    The mutual information estimate, in the unit used by ``shan_entropy``.
    """
    # only the counts are needed, the bin edges are discarded
    joint_counts = np.histogram2d(X, Y, bins)[0]
    x_counts = np.histogram(X, bins)[0]
    y_counts = np.histogram(Y, bins)[0]

    entropy_x = shan_entropy(x_counts)
    entropy_y = shan_entropy(y_counts)
    entropy_joint = shan_entropy(joint_counts)
    return entropy_x + entropy_y - entropy_joint


def shan_entropy(c):
    """Shannon entropy, in bits, of the distribution given by a table of counts.

    Parameters
    ----------
    c : array-like
        Counts (e.g. the output of ``np.histogram`` / ``np.histogram2d``),
        of any shape. The table is normalized by its total to obtain
        probabilities; empty cells contribute nothing to the entropy.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the non-empty cells. A table whose total
        is zero carries no observations and yields ``0.0`` (the previous
        implementation divided by zero and returned NaN).
    """
    # accept plain lists as well as arrays
    counts = np.asarray(c, dtype=float)
    total = counts.sum()
    if total == 0:
        # nothing was observed: avoid the 0 / 0 division below
        return 0.0

    probs = counts / total
    # drop empty cells, log2(0) is undefined; this also flattens 2d tables
    probs = probs[np.nonzero(probs)]
    # vectorized numpy reduction instead of the python builtin `sum`
    return -np.sum(probs * np.log2(probs))

# toy data whose columns are compared pairwise below
A = np.array([
    [2.0, 140.0, 128.23, -150.5, -5.4],
    [2.4, 153.11, 130.34, -130.1, -9.5],
    [1.2, 156.9, 120.11, -110.45, -1.12],
])

bins = 5 # ?
n = A.shape[-1]
matMI = np.zeros((n, n))

# fill the strict upper triangle with the mutual information of every column pair
for ix in range(n):
    for jx in range(ix + 1, n):
        matMI[ix, jx] = calc_MI(A[:, ix], A[:, jx], bins)

matMI

In [None]:
# `mutual_info_score` is otherwise only imported by a later cell; importing it
# here lets this cell run on a fresh kernel with Restart & Run All
from sklearn.metrics import mutual_info_score

mutual_info_score(A[:, 0], A[:, 1])

In [None]:
from sklearn.metrics import mutual_info_score

def calc_MI(x, y, bins):
    """Mutual information of x and y computed from their joint histogram.

    The 2d histogram counts of ``x`` and ``y`` (binned with ``bins``) are
    passed to ``mutual_info_score`` as a precomputed contingency table.

    Note: this definition replaces the ``calc_MI`` defined in an earlier cell.
    """
    # keep the counts only, the bin edges are not needed
    joint_counts = np.histogram2d(x, y, bins)[0]
    return mutual_info_score(None, None, contingency=joint_counts)


bins = 5 # ?
n = A.shape[-1]
matMI = np.zeros((n, n))

# fill the strict upper triangle with the mutual information of every column pair
for ix in range(n):
    for jx in range(ix + 1, n):
        matMI[ix, jx] = calc_MI(A[:, ix], A[:, jx], bins)

matMI

In [1]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, mutual_info_classif
iris = load_iris()
X = iris.data
y = iris.target
X.shape

# keep the 2 features with the highest chi-squared score
feature_selection = SelectKBest(score_func=chi2, k=2)
X_new = feature_selection.fit_transform(X, y)
X_new.shape

(150, 2)

In [2]:
%%timeit
feature_selection = SelectKBest(chi2, k=2)
X_new = feature_selection.fit_transform(X, y)
X_new.shape

457 µs ± 38.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [14]:
# per-feature scores of the fitted selector, one entry per input feature
feature_selection.scores_

array([ 10.81782088,   3.59449902, 116.16984746,  67.24482759])

In [16]:
%%timeit
feature_selection = SelectKBest(mutual_info_classif, k=2)
X_new = feature_selection.fit_transform(X, y)
X_new.shape

9.48 ms ± 848 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [17]:
# names assigned inside a %%timeit cell are not kept in the notebook namespace,
# so `feature_selection` would still be an earlier selector here; fit the
# mutual information selector explicitly before inspecting its scores
feature_selection = SelectKBest(mutual_info_classif, k=2).fit(X, y)
feature_selection.scores_

array([0.49593281, 0.25590719, 0.98758062, 0.97811196])

In [3]:
import mifs
# select 2 features with the mifs mutual information feature selector
feat_selector = mifs.MutualInformationFeatureSelector(n_features=2)
X_new = feat_selector.fit_transform(X, y)
X_new.shape

(150, 2)

In [4]:
%%timeit
feat_selector = mifs.MutualInformationFeatureSelector(n_features = 2)
X_new = feat_selector.fit_transform(X, y)
X_new.shape

83.2 ms ± 4.84 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# NOTE(review): both arguments are raw feature columns; presumably each distinct
# value is treated as its own label -- confirm this is the intended comparison
mutual_info_score(labels_true=X[:, 0], labels_pred=X[:, 1])

## Reference

- [Blog: Visual Information Theory](http://colah.github.io/posts/2015-09-Visual-Information/)
- http://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection

In [None]:
filepath = 'https://raw.githubusercontent.com/h2oai/app-consumer-loan/master/data/loan.csv'

# read the csv and drop every row that has at least one missing value
data = pd.read_csv(filepath).dropna(how='any')
print('dimension:', data.shape)
data.head(n=6)

In [None]:
# histograms of the loan data, 50 bins each, on one large figure
data.hist(figsize=(20, 15), bins=50)
plt.show()

In [None]:
# name of the target column
label_col = 'bad_loan'

# columns handled as numeric features
num_cols = [
    'loan_amnt',
    'int_rate',
    'emp_length',
    'annual_inc',
    'dti',
    'delinq_2yrs',
    'revol_util',
    'total_acc',
    'longest_credit_length',
]

# columns handled as categorical features
cat_cols = [
    'term',
    'home_ownership',
    'purpose',
    'addr_state',
    'verification_status',
]

In [None]:
# extract target variable, perform
# a quick check of the target variable's skewness
label = data[label_col].values
data = data.drop(label_col, axis = 1)
print('labels distribution:', np.bincount(label) / label.size)

In [None]:
# split off the test set first, then split the remainder into train and
# validation; both splits are stratified on the label and share one seed
val_size = 0.1
test_size = 0.1
split_random_state = 1234

df_train, df_test, y_train, y_test = train_test_split(
    data,
    label,
    test_size=test_size,
    stratify=label,
    random_state=split_random_state,
)
df_train, df_val, y_train, y_val = train_test_split(
    df_train,
    y_train,
    test_size=val_size,
    stratify=y_train,
    random_state=split_random_state,
)

In [None]:
from sklearn.pipeline import Pipeline
from mlutils.transformers import Preprocessor
from sklearn.ensemble import RandomForestClassifier

# preprocessing -> 6 best features by chi-squared score -> class-balanced random forest
# NOTE(review): chi2 expects non-negative feature values -- confirm that the
# Preprocessor output satisfies this.
preprocessor = Preprocessor(num_cols=num_cols, cat_cols=cat_cols)
feature_selection = SelectKBest(score_func=chi2, k=6)
rf = RandomForestClassifier(class_weight='balanced')

steps = [
    ('preprocess', preprocessor),
    ('feature_selection', feature_selection),
    ('rf', rf),
]
pipeline1 = Pipeline(steps=steps).fit(df_train, y_train)

In [None]:
from sklearn.pipeline import make_pipeline
from skrebate import ReliefF
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# gzip-compressed, tab separated example dataset from the scikit-rebate repository
genetic_data = pd.read_csv(
    'https://github.com/EpistasisLab/scikit-rebate/raw/master/data/'
    'GAMETES_Epistasis_2-Way_20atts_0.4H_EDM-1_1.tsv.gz',
    sep='\t',
    compression='gzip',
)

# the 'class' column is the target, every other column is a feature
labels = genetic_data['class']
features = genetic_data.drop('class', axis=1)

In [None]:
# ReliefF keeps 2 features, then a 100-tree random forest is fit on them
clf = make_pipeline(
    ReliefF(n_features_to_select=2, n_neighbors=100),
    RandomForestClassifier(n_estimators=100),
)

# mean cross validation score of the whole pipeline
print(cross_val_score(estimator=clf, X=features, y=labels).mean())

In [None]:
# same random forest without any feature selection, for comparison
clf = RandomForestClassifier(n_estimators=100)
print(cross_val_score(estimator=clf, X=features, y=labels).mean())

In [None]:
# BoostARoota is otherwise only imported by a later cell; importing it here
# lets this cell run on a fresh kernel with Restart & Run All
from boostaroota import BoostARoota

# BoostARoota (driven by a random forest) selects the features, then a
# 100-tree random forest is fit on them
rf = RandomForestClassifier()
clf = make_pipeline(BoostARoota(clf = rf, metric='logloss'),
                    RandomForestClassifier(n_estimators=100))

# mean cross validation score of the whole pipeline
print(np.mean(cross_val_score(clf, features, labels)))

In [None]:
# `import urllib` alone does not guarantee that the `urllib.request` submodule
# is loaded, so import the submodule explicitly
import urllib.request

import pandas as pd
from boostaroota import BoostARoota
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()


#################
# Madelon Dataset
train_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.data'
# download the file; the `with` block closes the HTTP response once it is parsed.
# sep=r'\s+' is the equivalent of the deprecated delim_whitespace=True
with urllib.request.urlopen(train_url) as raw_data:
    train = pd.read_csv(raw_data, sep=r'\s+', header=None)
train.columns = ["Var"+str(x) for x in range(len(train.columns))]

labels_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.labels'
with urllib.request.urlopen(labels_url) as raw_data:
    labels = pd.read_csv(raw_data, delimiter=",", header=None)
labels.columns = ["Y"]


########################################################################################################################
#
#  Test that BoostARoota is working
#
########################################################################################################################
br = BoostARoota(clf = clf, metric='logloss')

br.fit(train,labels)
# these two values are evaluated but not displayed, since only the last
# expression of a cell is echoed
len(train.columns)
len(br.keep_vars_)
new_train = br.transform(train)
# refits the selector from scratch and transforms in one call
new_train2 = br.fit_transform(train,labels)


#Dimension Reduction
print("Original training set has " + str(train.shape) + " dimensions. \n" +\
"BoostARoota with .fit() and .transform() reduces to " + str(new_train.shape) + " dimensions. \n" +\
"BoostARoota with .fit_transform() reduces to " + str(new_train2.shape) + " dimensions.\n" +\
"The two methods may give a slightly different dimensions because of random variation as it is being refit")
