# Logistic Regression
(C) 2018 Dariusz Kajtoch

In [80]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [81]:
data = pd.read_csv('./predict_blood_donations.csv')
print(data.columns)
data.columns = ['Id', 'MonthLast', 'Num', 'Vol', 'MonthFirst', 'Predict']
data.head()

Index(['Unnamed: 0', 'Months since Last Donation', 'Number of Donations',
       'Total Volume Donated (c.c.)', 'Months since First Donation',
       'Made Donation in March 2007'],
      dtype='object')


Unnamed: 0,Id,MonthLast,Num,Vol,MonthFirst,Predict
0,619,2,50,12500,98,1
1,664,0,13,3250,28,1
2,441,1,16,4000,35,1
3,160,2,20,5000,45,1
4,358,1,24,6000,77,0


# Features

In [153]:
X = data[[
    'MonthLast',
    'Num',
    'MonthFirst'
]]
X['Ratio'] = X['MonthLast']/X['MonthFirst']
#X['log'] = np.log(X['MonthFirst']-X['MonthLast']+1)

y = data['Predict']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [154]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(penalty='l2', C=0.0006, random_state=56)                  

skf = StratifiedKFold(n_splits=10, random_state=1234)


roc_tab = []; loss_tab = []; acc_tab = []
for train_index, test_index in skf.split(X,y):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test   = X.iloc[test_index], y.iloc[test_index]
    
    clf.fit(X_train, y_train)
    
    proba = clf.predict_proba(X_test)*1.
    y_pred = clf.predict(X_test)
    
    roc_tab.append(roc_auc_score(y_test, proba[:,1]))
    loss_tab.append(log_loss(y_test, proba[:,1]))
    acc_tab.append(accuracy_score(y_test, y_pred))

In [152]:
print('AUC: %.8f +/- %.8f, min: %.8f' % (np.mean(roc_tab), np.std(roc_tab), np.min(roc_tab)))
print('LogLoss: %.8f +/- %.8f, max: %.8f' % (np.mean(loss_tab), np.std(loss_tab), np.max(loss_tab)))
print('Accuracy: %.8f +/- %.8f, min: %.8f' % (np.mean(acc_tab), np.std(acc_tab), np.min(acc_tab)))

AUC: 0.85940208 +/- 0.12347590, min: 0.63149351
LogLoss: 0.50998236 +/- 0.08287920, max: 0.67011319
Accuracy: 0.76046798 +/- 0.01585756, min: 0.72413793


In [139]:
from collections import OrderedDict

test = pd.read_csv('./test.csv')
print(test.columns.tolist())
X_test = test[[
    'Months since Last Donation',
    'Number of Donations',
    'Months since First Donation'
]]
X_test.columns = ['MonthLast', 'Num', 'MonthFirst']
X_test['Ratio'] = X_test['MonthLast']/X_test['MonthFirst']
proba = clf.predict_proba(X_test)[:,1]

submission = pd.DataFrame(
    OrderedDict([('',test['Unnamed: 0']), ('Made Donation in March 2007',proba)])
)

submission.to_csv('./submission.csv', index=False)

['Unnamed: 0', 'Months since Last Donation', 'Number of Donations', 'Total Volume Donated (c.c.)', 'Months since First Donation']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
