In [1]:
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 20 00:28:16 2018

@author: dhritiman
"""


import os
import functools
import operator
import gzip
import struct
import array
import tempfile
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve  # py2
try:
    from urllib.parse import urljoin
except ImportError:
    from urlparse import urljoin
import numpy as np


# the url can be changed by the users of the library (not a constant)
datasets_url = 'http://yann.lecun.com/exdb/mnist/'


class IdxDecodeError(ValueError):
    """Signals that the bytes being parsed do not form a valid IDX file.

    It derives from ValueError, so handlers written for ValueError catch
    it as well.
    """


def download_file(fname, target_dir=None, force=False):
    """Download fname from the datasets_url, and save it to target_dir,
    unless the file already exists, and force is False.

    The download is written to a temporary ``<name>.part`` file next to the
    final location and only moved into place once it has completed. A failed
    or interrupted transfer therefore never leaves a truncated file that a
    later call would mistake for a valid cached copy.

    Parameters
    ----------
    fname : str
        Name of the file to download

    target_dir : str
        Directory where to store the file. Defaults to the system temporary
        directory when empty or None

    force : bool
        Force downloading the file, if it already exists

    Returns
    -------
    fname : str
        Full path of the downloaded file
    """
    if not target_dir:
        target_dir = tempfile.gettempdir()
    target_fname = os.path.join(target_dir, fname)

    if force or not os.path.isfile(target_fname):
        url = urljoin(datasets_url, fname)
        partial_fname = target_fname + '.part'
        try:
            urlretrieve(url, partial_fname)
            # os.rename cannot overwrite an existing file on every platform,
            # so drop the stale copy (force=True case) before moving.
            if os.path.isfile(target_fname):
                os.remove(target_fname)
            os.rename(partial_fname, target_fname)
        finally:
            # Only present if the download or the move did not complete.
            if os.path.isfile(partial_fname):
                os.remove(partial_fname)

    return target_fname


def parse_idx(fd):
    """Parse an IDX file, and return it as a numpy array.

    The file is read as: two zero bytes, one data type byte, one byte holding
    the number of dimensions, one big-endian unsigned 4-byte size per
    dimension, and finally the items themselves in big-endian byte order.

    Parameters
    ----------
    fd : file
        Binary file object positioned at the start of the IDX data

    Returns
    -------
    data : numpy.ndarray
        Numpy array with the dimensions and the data in the IDX file

    Raises
    ------
    IdxDecodeError
        If the header is incomplete, does not start with two zero bytes,
        names an unknown data type, or if the amount of data does not match
        the dimensions announced in the header.
    """
    import sys  # local import: only needed for the host byte order check

    DATA_TYPES = {0x08: 'B',  # unsigned byte
                  0x09: 'b',  # signed byte
                  0x0b: 'h',  # short (2 bytes)
                  0x0c: 'i',  # int (4 bytes)
                  0x0d: 'f',  # float (4 bytes)
                  0x0e: 'd'}  # double (8 bytes)

    header = fd.read(4)
    if len(header) != 4:
        raise IdxDecodeError('Invalid IDX file, file empty or does not contain a full header.')

    zeros, data_type, num_dimensions = struct.unpack('>HBB', header)

    if zeros != 0:
        raise IdxDecodeError('Invalid IDX file, file must start with two zero bytes. '
                             'Found 0x%02x' % zeros)

    try:
        data_type = DATA_TYPES[data_type]
    except KeyError:
        raise IdxDecodeError('Unknown data type 0x%02x in IDX file' % data_type)

    # Check the length first so that a truncated file is reported as an
    # IdxDecodeError instead of leaking a struct.error.
    raw_sizes = fd.read(4 * num_dimensions)
    if len(raw_sizes) != 4 * num_dimensions:
        raise IdxDecodeError('Invalid IDX file, dimension sizes are truncated. '
                             'Expected: %d bytes. Found: %d'
                             % (4 * num_dimensions, len(raw_sizes)))
    dimension_sizes = struct.unpack('>' + 'I' * num_dimensions, raw_sizes)

    try:
        data = array.array(data_type, fd.read())
    except ValueError:
        # array.array rejects a byte string whose length is not a multiple
        # of the item size.
        raise IdxDecodeError('IDX file has wrong number of items. '
                             'Data length is not a multiple of the item size.')

    # array.array interprets the bytes in the machine's native order, while
    # the file stores them big-endian: swap only on little-endian hosts.
    if sys.byteorder == 'little':
        data.byteswap()

    # The initial value 1 makes the product well defined when the header
    # announces zero dimensions (a single scalar item).
    expected_items = functools.reduce(operator.mul, dimension_sizes, 1)
    if len(data) != expected_items:
        raise IdxDecodeError('IDX file has wrong number of items. '
                             'Expected: %d. Found: %d' % (expected_items, len(data)))

    return np.array(data).reshape(dimension_sizes)


def download_and_parse_mnist_file(fname, target_dir=None, force=False):
    """Fetch the IDX file called fname (relative to datasets_url) through
    download_file and decode it into a numpy array.

    Files whose extension is '.gz' are read through gzip; any other file is
    opened as plain binary.

    Parameters
    ----------
    fname : str
        File name to download and parse

    target_dir : str
        Directory where to store the file

    force : bool
        Force downloading the file, if it already exists

    Returns
    -------
    data : numpy.ndarray
        Numpy array with the dimensions and the data in the IDX file
    """
    local_path = download_file(fname, target_dir=target_dir, force=force)

    _, extension = os.path.splitext(local_path)
    if extension == '.gz':
        opener = gzip.open
    else:
        opener = open

    with opener(local_path, 'rb') as stream:
        return parse_idx(stream)


def train_images():
    """Load the MNIST training images as a numpy array.

    The file 'train-images-idx3-ubyte.gz' is downloaded first unless it is
    already present in the system temporary directory.

    Returns
    -------
    train_images : numpy.ndarray
        Numpy array with the images in the train MNIST database. The first
        dimension indexes each sample, while the other two index rows and
        columns of the image
    """
    images = download_and_parse_mnist_file('train-images-idx3-ubyte.gz')
    return images


def test_images():
    """Load the MNIST test images as a numpy array.

    The file 't10k-images-idx3-ubyte.gz' is downloaded first unless it is
    already present in the system temporary directory.

    Returns
    -------
    test_images : numpy.ndarray
        Numpy array with the images in the test MNIST database. The first
        dimension indexes each sample, while the other two index rows and
        columns of the image
    """
    images = download_and_parse_mnist_file('t10k-images-idx3-ubyte.gz')
    return images


def train_labels():
    """Load the MNIST training labels as a numpy array.

    The file 'train-labels-idx1-ubyte.gz' is downloaded first unless it is
    already present in the system temporary directory.

    Returns
    -------
    train_labels : numpy.ndarray
        Numpy array with the labels 0 to 9 in the train MNIST database.
    """
    labels = download_and_parse_mnist_file('train-labels-idx1-ubyte.gz')
    return labels


def test_labels():
    """Load the MNIST test labels as a numpy array.

    The file 't10k-labels-idx1-ubyte.gz' is downloaded first unless it is
    already present in the system temporary directory.

    Returns
    -------
    test_labels : numpy.ndarray
        Numpy array with the labels 0 to 9 in the test MNIST database.
    """
    labels = download_and_parse_mnist_file('t10k-labels-idx1-ubyte.gz')
    return labels


from __future__ import division

# Load the MNIST images (downloaded on first use by the loader functions).
x_train = train_images()
x_test = test_images()

# Flatten every image into one feature vector: (samples, rows * columns).
x_train = x_train.reshape((x_train.shape[0], x_train.shape[1] *
                           x_train.shape[2]))

x_test = x_test.reshape((x_test.shape[0], x_test.shape[1] *
                         x_test.shape[2]))

# Normalize the data. Dividing by the float 255.0 always performs true
# division, so the result no longer depends on the
# `from __future__ import division` statement taking effect.
x_train = x_train / 255.0
x_test = x_test / 255.0

# NOTE: from here on the names train_labels / test_labels refer to the label
# arrays and no longer to the loader functions defined above.
train_labels = train_labels()
test_labels = test_labels()

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# MNIST

## L2 Logistic Regression

In [3]:
model = LogisticRegression()

### Test set accuracy

In [4]:
# Fit on the MNIST training images, then score the predictions made for the
# test images.
predictions = model.fit(x_train, train_labels).predict(x_test)
accuracy_score(test_labels, predictions)

0.92010000000000003

### Train set accuracy

In [5]:
# Accuracy of the model on the same images it was fitted on.
predictions = model.predict(x_train)
accuracy_score(train_labels, predictions)

0.92788333333333328

### Top 30 coefficients

In [6]:
# Collapse the coefficient matrix along its first axis, leaving one summed
# value per feature. NOTE(review): the sum is signed, so weights of opposite
# sign cancel each other - confirm this is the intended ranking criterion.
coefficients = model.coef_.sum(axis=0)

In [7]:
# Indices of the 30 largest summed coefficients. argpartition only guarantees
# that the last 30 positions hold the 30 largest values; they are not sorted.
ind_lr = np.argpartition(coefficients, -30)[-30:]

In [8]:
f_coeffs = coefficients[ind_lr]

In [9]:
f_coeffs.shape

(30,)

In [10]:
f_coeffs

array([ 0.77997588,  0.78092778,  0.78402942,  0.80210089,  0.80449267,
        0.81816997,  0.89438744,  0.87742605,  0.90145823,  0.92353411,
        0.92661082,  1.56131923,  1.70069128,  1.44695792,  1.04207704,
        1.49046195,  1.1380128 ,  1.63448667,  1.6267958 ,  1.12630273,
        2.56200649,  1.85109873,  0.95514845,  1.13365936,  1.45140487,
        1.31979968,  2.29837151,  1.32322886,  0.94883686,  1.33403185])

## Decision Tree

In [11]:
model_dec = DecisionTreeClassifier()

### Test set accuracy

In [12]:
# MNIST decision tree: fit on the training images, then score the predictions
# made for the test images.
predictions = model_dec.fit(x_train, train_labels).predict(x_test)
accuracy_score(test_labels, predictions)

0.87870000000000004

### Train set accuracy

In [13]:
# Accuracy of the tree on the same images it was fitted on.
predictions = model_dec.predict(x_train)
accuracy_score(train_labels, predictions)

1.0

### Top 30 F-splits

In [14]:
model_dec.feature_importances_.shape

(784,)

In [15]:
# Indices of the 30 features with the highest importance. argpartition does
# not sort them, so the order within these 30 entries is arbitrary.
ind = np.argpartition(model_dec.feature_importances_, -30)[-30:]

In [16]:
f_splits = model_dec.feature_importances_[ind]

In [17]:
f_splits.shape

(30,)

In [18]:
f_splits

array([ 0.00670604,  0.00670949,  0.00927382,  0.00860532,  0.00950561,
        0.01295473,  0.00780107,  0.00740432,  0.00708782,  0.0103048 ,
        0.01256605,  0.00817836,  0.01125268,  0.00910799,  0.01301591,
        0.01955284,  0.01658926,  0.02334537,  0.02014354,  0.04853667,
        0.01831606,  0.01346989,  0.0150033 ,  0.04443238,  0.04421645,
        0.02307007,  0.05409823,  0.02688075,  0.03858051,  0.03182818])

# 20NG

In [19]:
# 20 NG: load the predefined train / test subsets and turn the raw documents
# into TF-IDF feature matrices.

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

# The vectorizer is fitted on the training documents only; the test documents
# are merely transformed with it.
vectorizer = TfidfVectorizer()
vectors_train = vectorizer.fit_transform(newsgroups_train.data)
vectors_test = vectorizer.transform(newsgroups_test.data)

# NG LABELS (these reuse, and therefore replace, the MNIST label variables)
train_labels = newsgroups_train.target
test_labels = newsgroups_test.target

## L2 Logistic Regression

In [20]:
model = LogisticRegression()

### Test Set accuracy

In [21]:
# Fit on the TF-IDF vectors of the training documents, then score the
# predictions made for the test documents.
predictions = model.fit(vectors_train, train_labels).predict(vectors_test)
accuracy_score(test_labels, predictions)

0.82793414763674988

### Train Set accuracy

In [22]:
# Accuracy of the model on the same documents it was fitted on.
predictions = model.predict(vectors_train)
accuracy_score(train_labels, predictions)

0.96986035000883863

### Top 30 coefficients

In [23]:
# Collapse the coefficient matrix along its first axis, leaving one summed
# value per feature. NOTE(review): the sum is signed, so weights of opposite
# sign cancel each other - confirm this is the intended ranking criterion.
coefficients = model.coef_.sum(axis=0)

In [24]:
# Indices of the 30 largest summed coefficients. argpartition only guarantees
# that the last 30 positions hold the 30 largest values; they are not sorted.
ind_lr = np.argpartition(coefficients, -30)[-30:]

In [25]:
f_coeffs = coefficients[ind_lr]

In [26]:
f_coeffs.shape

(30,)

In [27]:
f_coeffs

array([ 0.70318708,  0.70561878,  0.70606958,  0.71529472,  0.74268857,
        0.71634711,  0.70767552,  0.71112062,  0.74520355,  0.87297082,
        0.74795028,  1.03302393,  0.84017614,  1.23718566,  0.81295133,
        0.91476254,  1.17681965,  1.02881162,  0.94128184,  0.82367258,
        0.98146202,  0.91139696,  0.98558677,  0.86566467,  0.95044618,
        1.09672473,  0.92239842,  0.88301474,  0.77757021,  0.77505414])

## Decision Tree

In [28]:
model_dec = DecisionTreeClassifier()

### Test Set accuracy

In [29]:
# Fit the tree on the TF-IDF vectors of the training documents, then score
# the predictions made for the test documents.
predictions = model_dec.fit(vectors_train, train_labels).predict(vectors_test)
accuracy_score(test_labels, predictions)

0.55509824747742964

### Train Set accuracy

In [30]:
# Score the tree fitted in the test-accuracy cell on its own training data.
# The tree is deliberately NOT refitted here: model_dec was created without a
# random_state, so a second fit may grow a different tree, and the train
# accuracy (and the feature importances below) would then describe another
# model than the one whose test accuracy is reported above.
predictions = model_dec.predict(vectors_train)
accuracy_score(train_labels, predictions)

0.99991161392964467

### Top 30 F-splits

In [31]:
model_dec.feature_importances_.shape

(130107,)

In [32]:
# Indices of the 30 features with the highest importance. argpartition does
# not sort them, so the order within these 30 entries is arbitrary.
ind = np.argpartition(model_dec.feature_importances_, -30)[-30:]

In [33]:
f_splits = model_dec.feature_importances_[ind]

In [34]:
f_splits.shape

(30,)

In [35]:
f_splits

array([ 0.00543265,  0.00547868,  0.00565649,  0.00575063,  0.00615971,
        0.00608869,  0.00583506,  0.00695991,  0.0069749 ,  0.00852913,
        0.00844939,  0.00839107,  0.00792949,  0.00702591,  0.00808599,
        0.00946531,  0.01507008,  0.02658513,  0.01080522,  0.02319633,
        0.01073602,  0.01158107,  0.01871538,  0.01781211,  0.02389467,
        0.0180159 ,  0.01846134,  0.02042067,  0.02441666,  0.01583442])

# Spambase

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import pandas as pd

In [37]:
# Location of the Spambase data file. Set the SPAMBASE_PATH environment
# variable to point at a local copy; the fallback is the path originally
# hard-coded in this notebook.
SPAMBASE_PATH = os.environ.get(
    'SPAMBASE_PATH', '/Users/dhritiman/Downloads/spambase/spambase.data')

# The file is read without a header row, so columns are numbered; column 57
# is renamed to 'is_spam' and used as the label further down. The rename is
# chained (no inplace=True) so the cell can be re-run safely.
data = pd.read_csv(SPAMBASE_PATH, header=None).rename(columns={57: 'is_spam'})

In [38]:
# Separate the rows by label so that each class can be split into train and
# test portions on its own.
spam = data.loc[data['is_spam'] == 1]
ham = data.loc[data['is_spam'] == 0]

In [40]:
# 60 % of each class goes to the training portion. A fixed random_state
# makes the split, and therefore every accuracy reported below, reproducible
# across runs.
spam_train, spam_test = train_test_split(spam, train_size=0.6, random_state=0)
ham_train, ham_test = train_test_split(ham, train_size=0.6, random_state=0)

In [41]:
# Stack the ham and spam training rows. pd.concat replaces DataFrame.append,
# which was deprecated and then removed in pandas 2.0.
X_train = pd.concat([ham_train, spam_train])
# pop() removes the label column from the features and returns it.
y_train = X_train.pop('is_spam')

In [42]:
# Stack the ham and spam test rows. pd.concat replaces DataFrame.append,
# which was deprecated and then removed in pandas 2.0.
X_test = pd.concat([ham_test, spam_test])
# pop() removes the label column from the features and returns it.
y_test = X_test.pop('is_spam')

## L2 Logistic Regression

In [43]:
model = LogisticRegression()

### Test set accuracy

In [44]:
# Fit on the Spambase training rows, then score the predictions made for the
# test rows.
predictions = model.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, predictions)

0.92779587404994568

### Train set accuracy

In [45]:
# Accuracy of the model on the same rows it was fitted on.
predictions = model.predict(X_train)
accuracy_score(y_train, predictions)

0.93258426966292129

### Top 30 coefficients

In [46]:
# Collapse the coefficient matrix along its first axis, leaving one value per
# feature. NOTE(review): presumably coef_ has a single row for this two-class
# problem, in which case the sum only flattens it - confirm.
coefficients = model.coef_.sum(axis=0)

In [47]:
# Indices of the 30 largest coefficients. argpartition only guarantees that
# the last 30 positions hold the 30 largest values; they are not sorted.
ind_lr = np.argpartition(coefficients, -30)[-30:]

In [49]:
f_coeffs = coefficients[ind_lr]

In [50]:
f_coeffs.shape

(30,)

In [51]:
f_coeffs

array([  6.10015842e-04,   7.64033753e-03,   4.78501266e-02,
         4.91755989e-02,   1.25024821e-01,   1.41810376e-01,
         2.52784022e-01,   2.45924123e-01,   2.08716401e-01,
         2.67375737e-01,   1.04832848e+00,   1.17343269e+00,
         6.96163462e-01,   2.22948891e+00,   1.12743448e+00,
         3.24934154e-01,   1.02273595e+00,   5.17089450e-01,
         2.71743688e+00,   8.75819936e-01,   4.48013485e-01,
         6.63966196e-01,   3.51080454e-01,   6.83660750e-01,
         4.58605272e-01,   3.83681382e+00,   1.08413247e+00,
         5.07790306e-01,   2.69082568e-01,   5.53922344e-01])

## Decision Tree

In [52]:
model = DecisionTreeClassifier()

### Test set accuracy

In [53]:
# Fit the decision tree on the Spambase training rows. Note that `model` now
# refers to the tree created in the previous cell, not the logistic regression.
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [54]:
# Accuracy of the fitted tree on the held-out Spambase rows.
predictions = model.predict(X_test)
accuracy_score(y_test, predictions)

0.90228013029315957

### Train set accuracy

In [56]:
# Accuracy of the tree on the same rows it was fitted on.
predictions = model.predict(X_train)
accuracy_score(y_train, predictions)

0.99963754983689745

### Top 30 F-splits

In [57]:
model.feature_importances_.shape

(57,)

In [58]:
# Indices of the 30 features with the highest importance. argpartition does
# not sort them, so the order within these 30 entries is arbitrary.
ind = np.argpartition(model.feature_importances_, -30)[-30:]

In [59]:
f_splits = model.feature_importances_[ind]

In [60]:
f_splits.shape

(30,)

In [61]:
f_splits

array([ 0.00286872,  0.00302832,  0.00303041,  0.0040096 ,  0.00422451,
        0.00861683,  0.02021456,  0.03272419,  0.00579659,  0.01098553,
        0.00570411,  0.00935   ,  0.01422141,  0.01017135,  0.00896798,
        0.0583388 ,  0.00704576,  0.00757263,  0.02315709,  0.00634664,
        0.10606537,  0.00551273,  0.00833663,  0.01955408,  0.35507874,
        0.06738687,  0.00436786,  0.02714645,  0.11884535,  0.02092832])