In [None]:
from __future__ import division

In [None]:
import sys
sys.path.append('..')

In [None]:
import math
import random
import numpy as np
import pandas as pd

In [None]:
from sklearn import datasets

In [5]:
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
%load_ext autoreload
%autoreload 1

%aimport gtree
%aimport tools

In [7]:
import logging
# Set the log level of the IPython application object to INFO.
%config Application.log_level="INFO"
# Configure the root logger: DEBUG and above, formatted as
# "time logger-name LEVEL:message". %I is the 12-hour clock, so the timestamps
# carry no AM/PM marker.
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')

## Goals:

### Separate the structure of a tree from the data of a tree.  In other words,
  fitting a tree does two things: It creates the structure of a tree and it
  creates a mapping of each leaf to a value.  Lookup therefore requires both
  finding the leaf node AND using the map to lookup the value.
  
### The loss function optimized by the tree is configurable, as is the leaf prediction function.


## Terms:

### Tree
A Tree is an object that takes input data and determines what leaf it ends up in.  Unlike many tree implementations, the Tree itself doesn't store data about the value of a leaf.  That is stored externally.


### loss_fn
A loss_fn is a function that takes the predicted targets for a set of rows and the actual targets for those same rows, and returns a single value that measures the "LOSS" or "COST" of that prediction (lower cost/loss is better).

```
def loss_fn(predicted_targets, actual_targets) -> float
```

A loss function must be additive (so, one should not apply a mean as a part of it)

### leaf_prediction_fn
A leaf_prediction_fn is a function which takes the features of the rows that end up in a leaf and returns a Series of the predictions for each of those rows.  It is typically a constant function whose value is either the mean good rate in that leaf (among the training targets) or the median target, but it can be anything else.

```
def leaf_prediction_fn(features) -> pd.Series
```


### leaf_prediction_builder
A leaf_prediction_builder is a function which takes the features and actual targets that end up in a TRAINING leaf and returns a leaf_prediction_fn.  This leaf_prediction_fn is used to predict the value of testing rows that end up in the same leaf.

```
def leaf_prediction_builder(features, actual_targets) -> leaf_prediction_fn
```


### leaf_prediction_map
A leaf_prediction_map is a map of leaf ids (e.g. their hash) to the leaf_prediction_fn for that leaf.  One can only use a tree to score data if one has a leaf_prediction_map.  This design allows one to use the same tree as a subset of another tree without having their leaf values become entangled.

--------------

## Test Tree Manipulation Functions

In [None]:
%pdb off

In [None]:
# Hand-build two small trees to exercise the manipulation helpers below.
# NOTE(review): BranchNode's positional args are presumably
# (feature, threshold, left, right) — confirm against gtree.
# `t`: branch on 'A' with a leaf on the left and a nested branch on 'B' on the right.
t = gtree.BranchNode('A', 0.5, None, None)
t.left = gtree.LeafNode()
t.right = gtree.BranchNode('B', 0.9, None, None)
t.right.left = gtree.LeafNode()
t.right.right = gtree.LeafNode()

# `o`: a single branch on 'C' with two leaf children.
o = gtree.BranchNode('C', 0.1, None, None)
o.left = gtree.LeafNode()
o.right = gtree.LeafNode()

# Print both trees, separated by blank lines.
t.prn()
print '\n\n'
o.prn()

In [None]:
# Call replace_branch_split on `t`, targeting t.right and supplying `o`, then
# print the result `u` followed by `t` itself — presumably to check that the
# original tree is left unmodified (confirm against gtree).
u = gtree.replace_branch_split(t, t.right, o)
u.prn()
print '\n\n'
t.prn()

In [None]:
# Call replace_node on `t`, targeting the leaf t.left and supplying `o`, then
# print the result `v` followed by `t` — presumably to check that `t` is not
# mutated (confirm against gtree).
v = gtree.replace_node(t, t.left, o)
v.prn()
print '\n\n'
t.prn()

In [None]:
# Rebuild `t` as a chain of three nested branches (A -> B -> C), each with a
# leaf on its left and the deepest branch ending in two leaves.
t = gtree.BranchNode('A', 0.5, None, None)
t.left = gtree.LeafNode()
t.right = gtree.BranchNode('B', 0.9, None, None)
t.right.left = gtree.LeafNode()
t.right.right = gtree.BranchNode('C', 0.9, None, None)
t.right.right.right = gtree.LeafNode()
t.right.right.left = gtree.LeafNode()

# Print the full tree, then the output of prune(t, 2), then `t` again.
# NOTE(review): the second argument (2) is presumably a maximum depth, and the
# final print presumably checks that prune does not mutate `t` — confirm.
t.prn()
print '\n\n'
gtree.prune(t, 2).prn()
print '\n\n'
t.prn()

In [None]:
# Three labelled rows used to exercise tree prediction: column 'A' holds floats,
# column 'B' holds integers.
data = pd.DataFrame(
    dict(A=[0.1, 10.0, 0.02], B=[10, 20, 30]),
    index='foo bar baz'.split(),
)

In [None]:
class StaticLeaf(object):
    """Leaf predictor stub that emits one fixed value for every input row."""

    def __init__(self, val):
        # The constant returned for each row passed to predict().
        self.val = val

    def predict(self, df):
        """Return a numpy array containing `val` once per row of `df`."""
        n_rows = len(df)
        return np.array([self.val] * n_rows)

In [None]:
# Single-branch tree with two leaves; the leaf values live in the external
# leaf_map rather than on the nodes themselves.
t = gtree.BranchNode('A', 0.5, None, None)
t.left = gtree.LeafNode()
t.right = gtree.LeafNode()

# The map is keyed by hash(leaf). StaticLeaf exposes only predict(df), which is
# presumably all the tree calls on a map value — confirm against gtree.
leaf_map = {hash(t.left): StaticLeaf(10),
            hash(t.right): StaticLeaf(20)}

# Score the three-row `data` frame through the tree and the leaf map.
t.predict(data, leaf_map)

In [None]:
t

In [None]:
# Create a split on a DataFrame
# Runs the private single-variable split search on column 'foo' (values 1..10)
# against a 0/1 target, with loss='error_rate' and leaf_prediction='mean'.

df = pd.DataFrame({'foo': pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])})

gtree._single_variable_best_split(
    df,
    'foo',
    pd.Series([0, 0, 1, 0, 0, 1, 1, 0, 1, 1]),
    loss='error_rate',
    leaf_prediction='mean')

In [None]:
# Compare gtree.loss(type='error_rate') with a hand-computed error rate on three rows.
threshold = 0.5
truth     = pd.Series([1, 0, 1], dtype=np.float32)
predicted = pd.Series([0, 1, 1], dtype=np.float32)

# NOTE(review): the arguments here are (truth, predicted), whereas the Terms
# section documents loss_fn(predicted_targets, actual_targets) — confirm the
# order gtree.loss expects.
print gtree.loss(truth, predicted, type='error_rate')

# By hand: one minus the fraction of rows whose thresholded prediction equals the truth.
print 1.0 - ((predicted >= threshold) == truth).mean() #+ (predicted < threshold) * (1 - truth)

# Test Split Finding

In [None]:
# 12-row, two-feature float32 toy set with a 0/1 target.
df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                   'B': [10, 20, 50, 30, 40, 50, 60, 50, 70, 90, 100, 110 ]}, dtype=np.float32)
target = pd.Series([0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0], dtype=np.float32)

# Train greedily with the error-rate loss; the call returns both the tree
# structure and the separate leaf map.
tree, leaf_map = gtree.train_greedy_tree(df, target, loss='error_rate')

print '\nTree:\n'
tree.prn()

print leaf_map

In [None]:
# Compute a leaf map for the trained tree from the same toy data. No builder
# argument is passed here, unlike the later calls that pass
# gtree.leaf_good_rate_split_builder.
gtree.calculate_leaf_map(tree, df, target)

In [None]:
# Display the node returned by gtree.random_node for the toy tree
# (presumably chosen at random — confirm against gtree).
gtree.random_node(tree)

In [None]:
# Print whatever gtree.get_all_nodes returns for the toy tree.
print gtree.get_all_nodes(tree)

In [None]:
# Rebuild the 12-row float32 toy frame and its 0/1 target for the cells below.
df = pd.DataFrame(
    {
        'A': list(range(1, 13)),
        'B': [10, 20, 50, 30, 40, 50, 60, 50, 70, 90, 100, 110],
    },
    dtype=np.float32,
)
target = pd.Series(
    [0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0],
    dtype=np.float32,
)

In [None]:
# Run the private single-variable split search on column 'B' of the toy frame,
# with loss='error_rate' and leaf_prediction='mean'.
gtree._single_variable_best_split(df,'B', target,
                                  loss='error_rate',
                                  leaf_prediction='mean')

In [None]:
# Same greedy training as before, but with feature_sample_rate and
# row_sample_rate both set to 0.5.
# NOTE(review): no seed is passed, so if the sampling is random the printed
# tree will presumably differ between runs — confirm.
tree, leaf_map = gtree.train_greedy_tree(df, target,
                                         loss='error_rate',
                                         feature_sample_rate=.5,
                                         row_sample_rate=.5)

print '\nTree:\n'
tree.prn()

print leaf_map

In [None]:
# Mate the trained tree with itself and print the tree that gtree.mate returns.
gtree.mate(tree, tree).prn()

In [None]:
#def make_hastie_sample(n_samples):
#
#    features, targets = datasets.make_hastie_10_2(n_samples=n_samples)
#
#    features = pd.DataFrame(features, columns=['feature_{}'.format(i) for i in range(features.shape[1])])
#    targets = pd.Series(targets, name='target')
#    targets = targets.map(lambda x: 1.0 if x > 0 else 0.0)
#    return features, targets



In [None]:
#def make_kddcup(n_samples):
#    
#    features, targets = datasets.fetch_kddcup99(subset='smtp')
#
#    features = pd.DataFrame(features, columns=['feature_{}'.format(i) for i in range(features.shape[1])])
#    targets = pd.Series(targets, name='target')
#    targets = targets.map(lambda x: 1.0 if x > 0 else 0.0)
#    
#    features = features.sample(n=n_samples)
#    
#    return features, targets.loc[features.index]
    

#def make_random_classification(n_samples, n_features=100):
       
#    features, targets = datasets.make_classification(n_samples=n_samples,
#                                                     n_features=n_features,
#                                                     n_informative=8,
#                                                     n_classes=2,
#                                                     n_clusters_per_class=4)
#                                                     
#    features = pd.DataFrame(features, columns=['feature_{}'.format(i) for i in range(features.shape[1])])
#    targets = pd.Series(targets, name='target')
#    targets = targets.map(lambda x: 1.0 if x > 0 else 0.0)
#        
#    return features, targets.loc[features.index]

# Start the Test Analysis Here

In [8]:
#features, targets = make_hastie_sample(10000)
# Generate a synthetic classification set through the project's `tools` helper
# (called with 10000), then cast features and targets to float32 pandas objects.
features, targets = tools.make_random_classification(10000)
features = pd.DataFrame(features, dtype=np.float32)
targets = pd.Series(targets, dtype=np.float32)

In [9]:
features.shape

(10000, 100)

In [10]:
targets.value_counts()

1.0    5004
0.0    4996
Name: target, dtype: int64

In [12]:
# Let the tree logger emit INFO records so each split decision is logged during training.
gtree.tree_logger.setLevel(logging.INFO)
# Greedy training on the synthetic set with cross-entropy loss, mean leaf
# predictions and max_depth=7.
tree, leaf_map = gtree.train_greedy_tree(features, targets,
                                         loss='cross_entropy',
                                         leaf_prediction='mean',
                                         max_depth=7)

02:30:01 tree INFO:Training.  Depth 0 Current Loss: 5.7517 Best Split: feature_76 -3.8880 0.6675
02:30:02 tree INFO:Training.  Depth 1 Current Loss: 1.8419 Best Split: feature_24 -0.5131 0.3762
02:30:02 tree INFO:Training.  Depth 2 Current Loss: 0.6211 Best Split: feature_24 -2.3053 0.1860
02:30:03 tree INFO:Training.  Depth 3 Current Loss: 0.0000 Best Split: feature_0 -0.7205 0.0000
02:30:03 tree INFO:Training.  Depth 4 Current Loss: 0.0000 Best Split: feature_0 -3.7507 0.0000
02:30:03 tree INFO:No split improves loss.  Returning
02:30:03 tree INFO:Training.  Depth 4 Current Loss: 0.0000 Best Split: feature_0 -0.0234 0.0000
02:30:03 tree INFO:Training.  Depth 5 Current Loss: 0.0000 Best Split: feature_0 -0.7205 0.0000
02:30:03 tree INFO:No split improves loss.  Returning
02:30:03 tree INFO:Training.  Depth 5 Current Loss: 0.0000 Best Split: feature_0 0.5312 0.0000
02:30:03 tree INFO:Training.  Depth 6 Current Loss: 0.0000 Best Split: feature_0 -0.0234 0.0000
02:30:03 tree INFO:No spli

02:30:14 tree INFO:Training.  Depth 6 Current Loss: 5.7565 Best Split: feature_0 2.4503 0.0000
02:30:14 tree INFO:Reached leaf node, or constraints force termination.  Returning
02:30:14 tree INFO:Reached leaf node, or constraints force termination.  Returning
02:30:14 tree INFO:Training.  Depth 5 Current Loss: 1.1550 Best Split: feature_60 4.5801 0.2948
02:30:14 tree INFO:Training.  Depth 6 Current Loss: 0.9822 Best Split: feature_60 -3.0928 0.2569
02:30:14 tree INFO:Reached leaf node, or constraints force termination.  Returning
02:30:14 tree INFO:Reached leaf node, or constraints force termination.  Returning
02:30:15 tree INFO:Training.  Depth 6 Current Loss: 1.9188 Best Split: feature_2 -1.5974 0.0000
02:30:15 tree INFO:Reached leaf node, or constraints force termination.  Returning
02:30:15 tree INFO:Reached leaf node, or constraints force termination.  Returning
02:30:15 tree INFO:Training.  Depth 4 Current Loss: 4.9341 Best Split: feature_91 -0.5167 0.2681
02:30:15 tree INFO:Tr

02:30:32 tree INFO:Training.  Depth 5 Current Loss: 2.0145 Best Split: feature_43 -2.2428 0.4291
02:30:33 tree INFO:Training.  Depth 6 Current Loss: 5.5820 Best Split: feature_34 -0.2040 0.5618
02:30:33 tree INFO:Reached leaf node, or constraints force termination.  Returning
02:30:33 tree INFO:Reached leaf node, or constraints force termination.  Returning
02:30:33 tree INFO:Training.  Depth 6 Current Loss: 1.5795 Best Split: feature_60 1.5664 0.3643
02:30:33 tree INFO:Reached leaf node, or constraints force termination.  Returning
02:30:33 tree INFO:Reached leaf node, or constraints force termination.  Returning


In [None]:
#gtree.cross_entropy_loss(targets[features.feature_45 < .2326])

In [None]:
#gtree._single_variable_best_split(features, 'feature_45', targets, None, None, None)

In [None]:
tree.prn()

In [None]:
set(pd.Series([1, 2, 3]))

In [None]:
tree.predict(features, leaf_map)

In [None]:
# Pair each actual target with the tree's prediction for the same row. These
# are the rows the tree was trained on, so the comparison is in-sample.
results = pd.DataFrame({'truth': targets, 'prediction': tree.predict(features, leaf_map)})

In [None]:
# One minus (error_rate_loss / number of rows).
# NOTE(review): this is an accuracy only if error_rate_loss returns a summed
# (additive) per-row error, as the Terms section requires of loss functions — confirm.
1.0 - gtree.error_rate_loss(results.prediction, results.truth) / len(targets)

In [None]:
# Scatter plot of predicted value (x) against actual target (y).
results.plot(kind='scatter', x='prediction', y='truth')

In [None]:
fig = plt.figure(figsize=(12,8))

# Overlay one normalised histogram of predicted values per actual target value.
for label, grp in tree.predict(features, leaf_map).groupby(targets):
    # NOTE(review): newer matplotlib releases replaced `normed` with `density` —
    # confirm against the matplotlib version this notebook runs on.
    grp.hist(normed=True, alpha=0.5, label=str(label))
plt.legend(loc='best')
# A bare None as the last expression keeps the cell from echoing a return value.
None

# Compare Methods

In [None]:
# Fresh synthetic set (called with 5000), cast to float32 pandas objects.
features, targets = tools.make_random_classification(5000)
features = pd.DataFrame(features, dtype=np.float32)
targets = pd.Series(targets, dtype=np.float32)

# Hold out a random 30% of rows for validation. No random_state is given, so
# the split differs from run to run.
features_validation = features.sample(frac=.3)
targets_validation = targets.loc[features_validation.index]

# Keep the remaining rows for training; targets are re-aligned by index label.
features = features[~features.index.isin(features_validation.index)]
targets = targets.loc[features.index]

In [None]:
%pdb off

In [None]:
# Raise the tree logger threshold to WARNING so per-split INFO records are suppressed.
gtree.tree_logger.setLevel(logging.WARNING)

# Run the evolutionary search on the training split. The call returns a
# `result` dict (later indexed with ['tree']) and a list of per-generation records.
result, generations = gtree.evolve(features, targets,
                                   loss='cross_entropy',
                                   max_depth=3, min_to_split=10,
                                   num_generations=15, num_survivors=10,
                                   num_children=200, num_seed_trees=5)


In [None]:
# Build a leaf map for the tree in `result` from the training split, using the
# good-rate split builder.
leaf_map = gtree.calculate_leaf_map(result['tree'], features, targets, gtree.leaf_good_rate_split_builder)

# Error-rate loss of that tree on the held-out validation split.
print gtree.error_rate_loss(result['tree'].predict(features_validation, leaf_map), targets_validation)

In [None]:
# Tally the values find_leaves returns for the training rows under the last
# generation's best tree (presumably one leaf id per row — confirm).
generations[-1]['best_of_generation']['tree'].find_leaves(features).value_counts()

In [None]:
# Print every member of the final generation: a banner carrying its
# 'loss_testing' value, followed by its tree.
for gen in generations[-1]['generation']:
    print '--------------------{:.4f}----------------------------'.format(gen['loss_testing'])
    gen['tree'].prn()

In [None]:
for result in generations[-1]['generation']:
    print '---------------------------------------------'
    result['tree'].prn()

In [None]:
# Random-tree baseline on the same training split. Unlike the other trainers in
# this notebook, the loss is passed as a function object (loss_fn=...) rather
# than as a loss='...' name.
result = gtree.train_random_trees(features, targets, loss_fn=gtree.error_rate_loss,
                                  max_depth=2,
                                  min_to_split=10,
                                  num_trees=10)

In [None]:
# Build a leaf map for the random-tree result from the training split, using
# the good-rate split builder.
leaf_map = gtree.calculate_leaf_map(result['tree'], features, targets, gtree.leaf_good_rate_split_builder)

# Error-rate loss of that tree on the held-out validation split.
print gtree.error_rate_loss(result['tree'].predict(features_validation, leaf_map), targets_validation)

In [None]:
# Import the sklearn module under an alias. A bare `from sklearn import tree`
# would rebind the notebook variable `tree` (the gtree tree trained earlier) to
# the sklearn module and break any earlier cell that is re-run afterwards.
from sklearn import tree as sklearn_tree
from sklearn.model_selection import train_test_split

# Baseline: a depth-2 sklearn decision tree fit on the same training split.
clf = sklearn_tree.DecisionTreeClassifier(max_depth=2)
clf = clf.fit(features, targets)

In [None]:
# Take column 1 of predict_proba (the probability of the classifier's second
# class) as the prediction, indexed like the validation rows so it aligns with
# targets_validation.
predictions = pd.Series(clf.predict_proba(features_validation)[:, 1], index=features_validation.index)
# Error-rate loss of the sklearn baseline on the validation split.
gtree.error_rate_loss(predictions, targets_validation)

In [None]:
from sklearn.externals.six import StringIO
from sklearn import tree as sklearn_tree
import pydot

# Render the fitted sklearn tree `clf` to Graphviz dot text held in memory.
dot_data = StringIO()
sklearn_tree.export_graphviz(clf, out_file=dot_data)

# Newer pydot releases return a list of graphs from graph_from_dot_data, while
# older ones returned a single graph. Accept both so write_pdf is always called
# on a graph object.
graph = pydot.graph_from_dot_data(dot_data.getvalue())
if isinstance(graph, (list, tuple)):
    graph = graph[0]
graph.write_pdf("iris.pdf")

In [None]:
%alias_magic t timeit

In [None]:
# Index labels of the training rows whose 'feature_3' value is negative.
sel = features[features['feature_3'] < 0].index

In [None]:
sel

In [None]:
%t features.loc[sel]

In [None]:
%t df.reindex_axis(sel, copy=False)

# Evolve

In [None]:
# BC Dataset

In [59]:
#bc_info = datasets.load_breast_cancer()

features, target = datasets.make_hastie_10_2(n_samples=5000)

features = pd.DataFrame(features, dtype=np.float32)
# Recode the labels to {0.0, 1.0}: exactly 1.0 stays 1.0, anything else becomes 0.0.
# This is vectorised with np.where rather than a list comprehension, because
# under Python 2 a comprehension's loop variable leaks into the notebook
# namespace and would overwrite the tree stored in `t`.
# The recoding can never yield NaN, so no dropna() or re-alignment of
# `features` is needed; both objects keep their default positional index.
target = pd.Series(np.where(target == 1.0, 1.0, 0.0), dtype=np.float32)




In [60]:
target.value_counts(dropna=False)

0.0    2514
1.0    2486
dtype: int64

In [61]:
gtree.tree_logger.setLevel(logging.WARNING)

In [62]:
# Raise the tree logger threshold to WARNING so per-split INFO records are suppressed.
gtree.tree_logger.setLevel(logging.WARNING)

# Evolutionary search on the Hastie data with cross-entropy loss and 'logit'
# leaf predictions. The call returns a `result` and a list of per-generation records.
result, generations = gtree.evolve(features, target,
                                   loss='cross_entropy',
                                   leaf_prediction='logit',
                                   max_depth=3,
                                   min_to_split=10,
                                   num_generations=5,
                                   num_survivors=20,
                                   num_children=50,
                                   num_seed_trees=10)

02:58:10 evolution DEBUG:Growing Seed: 1 of 10
02:58:10 evolution DEBUG:Growing Seed: 2 of 10
02:58:10 evolution DEBUG:Growing Seed: 3 of 10
02:58:11 evolution DEBUG:Growing Seed: 4 of 10
02:58:11 evolution DEBUG:Growing Seed: 5 of 10
02:58:11 evolution DEBUG:Growing Seed: 6 of 10
02:58:11 evolution DEBUG:Growing Seed: 7 of 10
02:58:12 evolution DEBUG:Growing Seed: 8 of 10
02:58:12 evolution DEBUG:Growing Seed: 9 of 10
02:58:12 evolution DEBUG:Growing Seed: 10 of 10
02:58:13 evolution DEBUG:Resplitting the data
02:58:13 evolution DEBUG:Mating to create 50 children
02:58:22 evolution DEBUG:Calculating loss functions for generation of size: 36
02:58:24 evolution DEBUG:Surviving Generation: 0:4.0982, 1:4.1596, 0:4.2670, 1:4.3285, 0:4.3361, 1:4.3515, 1:4.4282, 0:4.4435, 0:4.4513, 0:4.4589, 1:4.4589, 0:4.4666, 1:4.4666, 1:4.4896, 1:4.4896, 1:4.5127, 0:4.5203, 1:4.5280, 0:4.5587, 1:4.6047
02:58:24 evolution INFO:Generation 0 Training Loss: 4.2054 Hold Out Loss 4.0982

02:58:24 evolution DEBU

In [63]:
generations

[{'best_of_generation': {'gen': 0,
   'leaf_map': {1738089081: <tree._my_tree.LogitMapper at 0x10e2775d0>,
    2287517046: <tree._my_tree.LogitMapper at 0x10e277120>,
    2773039620: <tree._my_tree.LogitMapper at 0x10e277990>,
    2821744457: <tree._my_tree.LogitMapper at 0x10e2776c0>,
    3430426092: <tree._my_tree.LogitMapper at 0x10e2777b0>,
    3439248159: <tree._my_tree.LogitMapper at 0x10e277210>,
    3459063113: <tree._my_tree.LogitMapper at 0x10e277b70>},
   'loss_testing': 4.57402229309082,
   'loss_training': 4.69870138168335,
   'tree': <gtree.BranchNode at 0x10e27e810>},
  'generation': [{'gen': 0,
    'leaf_map': {1738089081: <tree._my_tree.LogitMapper at 0x10e2775d0>,
     2287517046: <tree._my_tree.LogitMapper at 0x10e277120>,
     2773039620: <tree._my_tree.LogitMapper at 0x10e277990>,
     2821744457: <tree._my_tree.LogitMapper at 0x10e2776c0>,
     3430426092: <tree._my_tree.LogitMapper at 0x10e2777b0>,
     3439248159: <tree._my_tree.LogitMapper at 0x10e277210>,
    

In [64]:
# For every generation, print a banner with the best member's training and
# testing loss, followed by that member's tree.
for gen in generations:
    best = gen['best_of_generation']
    print '=========================={:.4f} {:.4f}==============================\n'.format(
        best['loss_training'],
        best['loss_testing'])
    best['tree'].prn()


		<Leaf 3430426092>
	4 -0.90825
			<Leaf 1738089081>
		4 0.38764
			<Leaf 2773039620>
3 -1.46905
			<Leaf 3439248159>
		2 1.15880
			<Leaf 2821744457>
	3 1.30867
			<Leaf 3459063113>
		0 0.96905
			<Leaf 2287517046>

			<Leaf 1739404036>
		7 -0.24046
			<Leaf 3742812772>
	4 -1.77875
			<Leaf 1921311805>
		3 -1.68084
			<Leaf 2433203905>
4 0.97654
			<Leaf 2365720942>
		4 1.74027
			<Leaf 2939507302>
	4 -0.04171
			<Leaf 2820950154>
		0 -0.77602
			<Leaf 2811073649>

			<Leaf 2990669967>
		7 -0.17674
			<Leaf 2908757922>
	4 -1.61420
			<Leaf 2561010635>
		4 1.39760
			<Leaf 2542490382>
1 1.34348
			<Leaf 2909369628>
		3 0.33564
			<Leaf 1965902654>
	1 2.02730
			<Leaf 3401701804>
		9 0.29779
			<Leaf 2294396454>

			<Leaf 1232120670>
		5 0.11775
			<Leaf 2317052497>
	4 -1.61420
			<Leaf 3459679990>
		4 1.39760
			<Leaf 1467840732>
1 1.34348
			<Leaf 3084038657>
		2 -1.13800
			<Leaf 3001725372>
	1 2.02730
			<Leaf 2618126670>
		9 0.29779
			<Leaf 2588573605>

			<Leaf 1794428896>
		7 -

In [65]:
generations[-1]['best_of_generation']

{'gen': 3,
 'leaf_map': {1794428896: <tree._my_tree.LogitMapper at 0x10e6e87b0>,
  2446483188: <tree._my_tree.LogitMapper at 0x10e6e8b70>,
  2774717364: <tree._my_tree.LogitMapper at 0x10e6e86c0>,
  3266520419: <tree._my_tree.LogitMapper at 0x10e6e8e40>,
  3268543359: <tree._my_tree.LogitMapper at 0x10e6a93f0>,
  3350802060: <tree._my_tree.LogitMapper at 0x10e6a9b70>,
  3917495607: <tree._my_tree.LogitMapper at 0x10e6a94e0>,
  3920658799: <tree._my_tree.LogitMapper at 0x10e538300>},
 'loss_testing': 4.02145528793335,
 'loss_training': 4.153654098510742,
 'tree': <gtree.BranchNode at 0x10e69a6d0>}

In [66]:
# Print each leaf id in the final best tree's leaf map together with what its
# mapper's get_coeficients() returns (the spelling is the project's own).
# NOTE: dict.iteritems() exists only on Python 2.
for k, v in generations[-1]['best_of_generation']['leaf_map'].iteritems():
    print k, v.get_coeficients(), '\n'

1794428896 [-0.11867277  0.          0.02908762  0.          0.         -0.34675154
 -0.45530525  0.          0.         -0.74472415] 

3266520419 [-0.05395517  0.07222711  0.1059664  -0.04423313  0.12772825 -0.07168749
  0.27522105 -0.318299   -0.15059599  0.        ] 

3350802060 [ 0.          0.11188052  0.03955221  0.          0.09886718  0.21424007
 -0.04052415  0.1149186  -0.00326681  0.        ] 

3920658799 [-0.14436561  0.52755606 -0.02544092 -0.04711518 -0.17030811  0.          0.
  0.24464481  0.10601188  0.        ] 

2774717364 [ 0.0612771  -0.0879372   0.01279095  0.09197631 -0.04828111 -0.01100349
 -0.01746801 -0.06290676 -0.0111341  -0.07132311] 

3917495607 [ 0.46795651 -0.17166984 -0.10907427 -0.71777743 -0.43023306  0.
  0.01775977  0.33616525  0.26464593  0.        ] 

2446483188 [ 0.0188942   0.14658841  0.         -0.05905033  0.         -0.33989283
 -0.00114687  0.20198871  0.45611215  0.        ] 

3268543359 [ 0.         -0.47460949  0.          0.          0. 

In [67]:
generations[-1]['best_of_generation']['leaf_map']

{1794428896: <tree._my_tree.LogitMapper at 0x10e6e87b0>,
 2446483188: <tree._my_tree.LogitMapper at 0x10e6e8b70>,
 2774717364: <tree._my_tree.LogitMapper at 0x10e6e86c0>,
 3266520419: <tree._my_tree.LogitMapper at 0x10e6e8e40>,
 3268543359: <tree._my_tree.LogitMapper at 0x10e6a93f0>,
 3350802060: <tree._my_tree.LogitMapper at 0x10e6a9b70>,
 3917495607: <tree._my_tree.LogitMapper at 0x10e6a94e0>,
 3920658799: <tree._my_tree.LogitMapper at 0x10e538300>}

In [None]:
# 3x3 integer scratch matrix holding 1..9 in row-major order.
X = np.arange(1, 10).reshape(3, 3)
X

In [None]:
# Per-leaf prediction functions keyed by leaf id. Each takes a 2-D array with at
# least three columns and returns one column of shape (n_rows, 1).
fns = {
    0: lambda x: (x[:, 0] + x[:, 1] + x[:, 2]).reshape(len(x), 1),
    1: lambda x: (-1 * x[:, 0]).reshape(len(x), 1),
}

# Leaf id assigned to each of the three rows, stored as a column vector.
hashes = np.array([0, 1, 0]).reshape(3, 1)
hashes

In [None]:
# Output column, one float slot per row of X, filled in by the loop that follows.
predictions = np.zeros(shape=(len(X), 1))
predictions

In [None]:
zero = np.zeros((len(X), 1))

# Fill `predictions` one leaf id at a time: flatten `hashes` into a boolean row
# mask for the id, apply that id's function to the matching rows of X, and write
# the resulting column back into the same rows.
for i in (0, 1):
    row_mask = hashes.reshape(len(X)) == i
    predictions[row_mask] = fns[i](X[row_mask, :])

predictions

In [None]:
# NOTE(review): `comparison` is not assigned by any executable code visible in
# this notebook, so this scratch cell raises NameError on a fresh kernel.
X[hashes==comparison, :]

In [None]:
X[:,0].reshape(3, 1)

In [None]:
# NOTE(review): np.array([1]) has a single element, so reshaping it to
# (len(X), 1) can only succeed when len(X) == 1; with a 3-row X this raises
# ValueError before any indexing happens.
X[hashes==np.array([1]).reshape((len(X), 1))]

In [None]:
hashes==1

In [None]:
X[np.array([True, False, True]), :]

In [None]:
import numpy as np
import statsmodels.discrete.discrete_model as sm
import statsmodels.tools.tools as sm_tools

# Tiny 5x3 float design matrix and 0/1 response for trying L1-regularised
# logistic regression in statsmodels. This rebinds `X`, replacing the 3x3
# integer matrix used by the earlier scratch cells.
X = np.array([[1, 2,  3],
              [2, 7,  5],
              [3, 10, 7],
              [5, 18, 10],
              [-10, 70, 3]             
             ], dtype=np.float64)

y = np.array([[1], [0], [1], [0], [1]], dtype=np.float64)

# add_constant adds an intercept column to X before the model is built.
logit = sm.Logit(y, sm_tools.add_constant(X))
# Fit with an L1 penalty of weight alpha=1.0, then display the fitted parameters.
fit = logit.fit_regularized(method='l1', alpha=1.0)
fit.params

In [None]:
# Model predictions on the training rows, from the fitted parameters and the
# same constant-augmented design matrix.
logit.predict(fit.params, sm_tools.add_constant(X))

In [None]:
from sklearn.svm.base import _fit_liblinear

# Fit the same X / y through sklearn's private liblinear helper with an L1
# penalty (C=1.0, primal form, intercept fitted).
# NOTE(review): `_fit_liblinear` is a private sklearn function and no `loss` is
# passed, so this presumably relies on its default loss — confirm it is the
# logistic loss for the installed sklearn version.
coef_, intercept_, n_iter = _fit_liblinear(
                X, np.ravel(y), C=1.0, fit_intercept=True, intercept_scaling=1.0,
                class_weight=None, penalty='l1', dual=False, verbose=True,
                max_iter=5000, tol=1e-4, random_state=None,
                sample_weight=None)
#n_iter = np.array([n_iter])

# Show the fitted coefficients, intercept and iteration count together.
(coef_, intercept_, n_iter)

In [None]:
from scipy.special import expit
# Logistic sigmoid of the linear score coef_ . X^T + intercept_ for every row of X.
expit(coef_.dot(X.T) + intercept_)