# Logistic Regression
(C) 2018 Dariusz Kajtoch

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
data = pd.read_csv('./predict_blood_donations.csv')
print(data.columns)
data.columns = ['Id', 'MonthLast', 'Num', 'Vol', 'MonthFirst', 'Predict']
data.head()

Index(['Unnamed: 0', 'Months since Last Donation', 'Number of Donations',
       'Total Volume Donated (c.c.)', 'Months since First Donation',
       'Made Donation in March 2007'],
      dtype='object')


Unnamed: 0,Id,MonthLast,Num,Vol,MonthFirst,Predict
0,619,2,50,12500,98,1
1,664,0,13,3250,28,1
2,441,1,16,4000,35,1
3,160,2,20,5000,45,1
4,358,1,24,6000,77,0


# Features

In [207]:
X = data[[
    'MonthLast',
    'Num',
    'MonthFirst'
]]
#X['Ratio'] = X['MonthLast']/X['MonthFirst']
#X['Ratio'] = np.log(X['MonthLast']/X['MonthFirst']+1.)
X['log'] = np.log(X['MonthFirst']-X['MonthLast']+1)

y = data['Predict']

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class Features(TransformerMixin):
    
    def __init__(self):
        self.names = ['MonthLast', 'Num', 'MonthFirst']
        self.sc = StandardScaler()
    
    def transform(self, X, **kwargs):
        ret = self.sc.transform(X[self.names])
        ret = np.c_[ret, X['Ratio'].tolist()]
        return ret
    
    def fit(self, X, y=None, **kwargs):
        self.sc.fit(X[self.names],y)
        return self

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [219]:
from sklearn.model_selection import StratifiedKFold, LeaveOneOut
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate

import sys

lr = LogisticRegression(penalty='l2', C=1.2, random_state=56)

clf = lr
#clf = Pipeline([('scaler', StandardScaler()), ('estimator', lr)])

cv_results = cross_validate(clf, X, y,
    scoring = {
        'Loss': 'neg_log_loss',
        'Accuracy': 'accuracy'     
    },
    cv=StratifiedKFold(n_splits=50, random_state=1234),
    return_train_score=True
)

print(cv_results.keys())
    
print(np.mean(cv_results['train_Loss']))
print(np.mean(cv_results['test_Loss']))

dict_keys(['fit_time', 'score_time', 'test_Loss', 'train_Loss', 'test_Accuracy', 'train_Accuracy'])
-0.47640207493153314
-0.4888495200895626


## Bagging Logistic Regression

In [220]:
from sklearn.ensemble import BaggingClassifier

bag = BaggingClassifier(base_estimator=clf,
    n_estimators=30,
    max_samples=1.,
    bootstrap=True,
    bootstrap_features=True,
    random_state=0         
)

cv_results = cross_validate(bag, X, y,
    scoring = {
        'Loss': 'neg_log_loss',
        'Accuracy': 'accuracy'     
    },
    cv=StratifiedKFold(n_splits=50, random_state=1234),
    return_train_score=True
)

print(cv_results.keys())
    
print(np.mean(cv_results['train_Loss']))
print(np.mean(cv_results['test_Loss']))

dict_keys(['fit_time', 'score_time', 'test_Loss', 'train_Loss', 'test_Accuracy', 'train_Accuracy'])
-0.48230365498367184
-0.4876255533195104


In [225]:
bag.fit(X,y)

BaggingClassifier(base_estimator=LogisticRegression(C=1.2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=56, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
         bootstrap=True, bootstrap_features=True, max_features=1.0,
         max_samples=1.0, n_estimators=30, n_jobs=1, oob_score=False,
         random_state=0, verbose=0, warm_start=False)

In [75]:
print('AUC: %.8f +/- %.8f, min: %.8f' % (np.mean(roc_tab), np.std(roc_tab), np.min(roc_tab)))
print('LogLoss: %.8f +/- %.8f, max: %.8f' % (np.mean(loss_tab), np.std(loss_tab), np.max(loss_tab)))
print('Accuracy: %.8f +/- %.8f, min: %.8f' % (np.mean(acc_tab), np.std(acc_tab), np.min(acc_tab)))

AUC: 0.85925630 +/- 0.12342818, min: 0.63149351
LogLoss: 0.50950758 +/- 0.08219407, max: 0.66872279
Accuracy: 0.75874384 +/- 0.01287167, min: 0.72413793


In [222]:
print(loss_tab)

[0.6216937831432269, 0.5234546505046235, 0.45513301924698313, 0.43391712269238075, 0.42941885194888385, 0.4988078412942319, 0.9042610764010716, 0.47436105265399414, 0.3912385601401405, 0.5116973617798835]


In [226]:
from collections import OrderedDict

test = pd.read_csv('./test.csv')
print(test.columns.tolist())
X_test = test[[
    'Months since Last Donation',
    'Number of Donations',
    'Months since First Donation'
]]
X_test.columns = ['MonthLast', 'Num', 'MonthFirst']
X_test['log'] = np.log(X_test['MonthFirst']-X_test['MonthLast']+1.)
proba = bag.predict_proba(X_test)[:,1]*1.

submission = pd.DataFrame(
    OrderedDict([('',test['Unnamed: 0']), ('Made Donation in March 2007',proba)])
)

submission.to_csv('./submission.csv', index=False)

['Unnamed: 0', 'Months since Last Donation', 'Number of Donations', 'Total Volume Donated (c.c.)', 'Months since First Donation']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
